From a1b72325216e4c17de35c23612106236192bd2d5 Mon Sep 17 00:00:00 2001 From: Laurent Bachelier Date: Mon, 26 Mar 2012 05:50:15 +0200 Subject: [PATCH] browser2: Browser load urls, post, track state With working tests! --- weboob/tools/browser2/browser.py | 247 +++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 weboob/tools/browser2/browser.py diff --git a/weboob/tools/browser2/browser.py b/weboob/tools/browser2/browser.py new file mode 100644 index 00000000..0436bbd6 --- /dev/null +++ b/weboob/tools/browser2/browser.py @@ -0,0 +1,247 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2012 Laurent Bachelier +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . + +import requests +from requests.status_codes import codes + + +class Profile(object): + """ + A profile represents the way Browser should act. + Usually it is to mimic a real browser. + """ + + def setup_session(self, session): + """ + Change default headers, set up hooks, etc. + + Warning: Do not enable lzma, bzip or bzip2, sdch encodings + as python-requests does not support it yet. + In doubt, do not change the default Accept-Encoding header + of python-requests. + """ + raise NotImplementedError() + + +class Weboob(Profile): + """ + It's us! + Recommended for Weboob-friendly websites only. + """ + + def __init__(self, version): + self.version = version + + def setup_session(self, session): + session.config['base_headers']['User-Agent'] = 'weboob/%s' % self.version + + +class Firefox(Profile): + """ + Try to mimic a specific version of Firefox. + Ideally, it should follow the current ESR Firefox: + https://www.mozilla.org/en-US/firefox/organizations/all.html + Do not change the Firefox version without changing the Gecko one! + """ + + def setup_session(self, session): + """ + Set up headers for a standard Firefox request + (except for DNT which isn't on by default but is a good idea). + """ + # Replace all base requests headers + # https://developer.mozilla.org/en/Gecko_user_agent_string_reference + # https://bugzilla.mozilla.org/show_bug.cgi?id=572650 + session.config['base_headers'] = {'Accept-Language': 'en-us,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0.3) Gecko/20100101 Firefox/10.0.3', + 'DNT': '1'} + # It also has "Connection: Keep-Alive", that should only be added this way: + session.config['keep_alive'] = True + + +class Wget(Profile): + """ + Common alternative user agent. + Some websites will give you a version with less JavaScript. + Some others could ban you (after all, wget is not a real browser). + """ + def __init__(self, version='1.11.4'): + self.version = version + + def setup_session(self, session): + # Don't remove base headers, if websites want to block fake browsers, + # they will probably block any wget user agent anyway. + session.config['base_headers'].update({'Accept': '*/*', + 'User-Agent': 'Wget/%s' % self.version}) + session.config['keep_alive'] = True + + +class Browser(object): + PROFILE = Firefox() + + def __init__(self): + profile = self.PROFILE + self._setup_session(profile) + self.url = None + self.response = None + + def _setup_session(self, profile): + session = requests.Session() + + # Raise exceptions on HTTP errors + session.config['safe_mode'] = False + session.config['danger_mode'] = True + # TODO max_retries? + # TODO connect config['verbose'] to our logger + + # TODO find a way to have multiple session hooks + # lists don't work in this context + session.hooks['response'] = self._fix_redirect + + profile.setup_session(session) + + self.session = session + + def _fix_redirect(self, response): + """ + TL;DR: Web browsers and web developers suck. + + Most browsers do not follow the RFC for HTTP 301 and 302 + but python-requests does. + And web developers assume we don't follow it either. + https://en.wikipedia.org/wiki/Post/Redirect/Get + + Gets a Response, and returns a new Response. + Used as a 'response' hook for python-requests. + + This is a hack, it would be better as an option in python-requests. + """ + request = response.request + # If the request wasn't redirected, and is a redirection, + # and we allowed it to be fixed, + # restart the request building, but with a changed action. + if request.allow_redirects is False \ + and request.response.status_code in (codes.moved, codes.found) \ + and request.config.get('fix-redirect'): + # force the next request to be GET + real_method = request.method + request.method = 'GET' + real_data = request.data + request.data = None + + # build the response again + request.allow_redirects = True + request._build_response(response.raw) + + # restore info + request.method = real_method + request.data = real_data + + return request.response + + def location(self, url, data=None, fix_redirect=True, **kwargs): + """ + Like open() but also changes the current URL and response. + This is the most common method to request web pages. + """ + response = self.open(url, data, fix_redirect, **kwargs) + self.response = response + self.url = self.response.url + return response + + def open(self, url, data=None, fix_redirect=True, **kwargs): + """ + Wrapper around request(). + Makes a GET request, or a POST if data is provided. + + Call this if you do not want to "visit" the URL (for instance, you + are downloading a file). + """ + method = kwargs.pop('method', None) + if method is None: + if data is None: + method = 'GET' + else: + method = 'POST' + kwargs['data'] = data + if fix_redirect: + kwargs.setdefault('config', {}).setdefault('fix-redirect', True) + response = self.request(method, url, **kwargs) + return response + + def request(self, *args, **kwargs): + """ + Creates a Request object and calls it. + Takes the sames arguments as request.request() + Returns a Response object. + + Most of the time, you should use location() or open(). + """ + # python-requests or urllib3 does not handle + # empty POST requests properly, so some websites refuse it. + data = kwargs.get('data') + if data is not None and len(data) == 0: + kwargs.setdefault('headers', {}).setdefault('Content-Length', '0') + return self.session.request(*args, **kwargs) + + +def test_base(): + b = Browser() + r = b.location('http://httpbin.org/headers') + assert isinstance(r.text, unicode) + assert 'Firefox' in r.text + assert 'python' not in r.text + assert 'identity' not in r.text + assert b.url == 'http://httpbin.org/headers' + + +def test_redirects(): + b = Browser() + b.location('http://httpbin.org/redirect/1') + assert b.url == 'http://httpbin.org/get' + + +def test_brokenpost(): + """ + Tests _fix_redirect() + """ + b = Browser() + # postbin is picky with empty posts. that's good! + r = b.location('http://www.postbin.org/', {}) + # ensures empty data (but not None) does a POST + assert r.request.method == 'POST' + # ensure we were redirected after submitting a post + assert len(r.url) >= len('http://www.postbin.org/') + # send a POST with data + b.location(r.url, {'hello': 'world'}) + r = b.location(r.url + '/feed') + assert 'hello' in r.text + assert 'world' in r.text + + +def test_weboob(): + class BooBrowser(Browser): + PROFILE = Weboob('0.0') + + b = BooBrowser() + r = b.location('http://httpbin.org/headers') + assert 'weboob/0.0' in r.text + assert 'identity' in r.text