browser2: Handle redirects internally

python-requests isn't secure enough, and some of its behavior depends on the latest version (tested on 0.10.6). So instead of the previous hack, we have some copy-paste, but we gain secure cookie handling (not there yet), referrer handling, and "proper" redirect-on-POST behavior.
2012-04-13 16:21:56 +02:00 · 2012-04-13 16:21:56 +02:00 · 57e16e9fe4
commit 57e16e9fe4
parent 4b802f32dd
2 changed files with 185 additions and 84 deletions
--- a/weboob/tools/browser2/browser.py
+++ b/weboob/tools/browser2/browser.py
@ -19,10 +19,11 @@
 from __future__ import absolute_import
-import urlparse
+from urlparse import urlparse, urljoin
 import requests
 from requests.status_codes import codes
 from copy import deepcopy
 # TODO define __all__
@ -130,67 +131,111 @@ class BaseBrowser(object):
        # TODO max_retries?
        # TODO connect config['verbose'] to our logger
        # TODO find a way to have multiple session hooks
        # lists don't work in this context
        session.hooks['response'] = self._fix_redirect
        profile.setup_session(session)
        self.session = session
-    def _fix_redirect(self, response):
+    def follow_redirects(self, response, orig_args=None):
        """
-        TL;DR: Web browsers and web developers suck.
+        Follow redirects *properly*.
        * Mimic what browsers do on 302
        * TODO Handle cookies securely
-        Most browsers do not follow the RFC for HTTP 302
+        :type response: :class:`requests.Response`
-        but python-requests does.
+        :type orig_args: dict
-        And web developers assume we don't follow it either:
+        :rtype: :class:`requests.Response`
        https://en.wikipedia.org/wiki/Post/Redirect/Get
        Gets a Response, and returns a new Response.
        Used as a 'response' hook for python-requests.
        This is a hack, it would be better as an option in python-requests.
        What we do is run again the response building,
        but this time with allow_redirects=True, and if we have a HTTP 302,
        we set a temporary fake method='GET' and empty data.
        So in order to have proper allow_redirects=True handling of POSTs
        you have to create a request with allow_redirects=False,
        and fix-redirect=True in config (which is for the first one the
        python-requests default for POSTs, and for the second one the
        BaseBrowser default).
        """
        # The response chain. We start with the one we got.
        responses = [response]
        request = response.request
        # If the request wasn't redirected, and is a redirection,
        # and we allowed it to be fixed,
        # restart the request building, but with a changed action.
        if request.allow_redirects is False \
        and request.response.status_code in requests.models.REDIRECT_STATI \
        and request.config.get('fix-redirect'):
            if (request.response.status_code in (codes.moved, codes.found) \
                and request.method == 'POST') \
            or (request.response.status_code == 303 and request.method != 'HEAD'):
                # force the next request to be GET
                real_method = request.method
                request.method = 'GET'
                real_data = request.data
                request.data = None
-            # build the response again
+        # Default method for redirects
-            request.allow_redirects = True
+        orig_args = orig_args or {}
-            request._build_response(response.raw)
+        orig_args.setdefault('method', request.method)
        orig_args.setdefault('data', request.data)
        # If we have the original arguments, take them, and fix them
        orig_args.pop('url', None)
        orig_referrer = orig_args.pop('referrer', None)
        # Avoid infinite loops
        orig_args['allow_redirects'] = False
-            if request.response.status_code is codes.found:
+        # TL;DR: Web browsers and web developers suck.
-                # restore info
+        #
-                request.method = real_method
+        # Most browsers do not follow the RFC for HTTP 302
-                request.data = real_data
+        # but python-requests does.
        # And web developers assume we don't follow it either:
        # https://en.wikipedia.org/wiki/Post/Redirect/Get
        #
        # Later python-requests versions do it that way, but to stay
        # compatible with older versions, we use this.
        while request.allow_redirects is False \
        and response.status_code in requests.models.REDIRECT_STATI \
        and 'location' in response.headers:
            ## This is from requests.models._build_response
            response.content  # Consume socket so it can be released
-            return request.response
+            if len(responses) > response.config.get('max_redirects'):
                raise requests.exceptions.TooManyRedirects()
            # Release the connection back into the pool.
            response.raw.release_conn()
            ## End of code from requests.models._build_response
            # http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3.4
            if response.status_code == codes.see_other:
                orig_args['method'] = 'GET'
                orig_args['data'] = None
                orig_args['files'] = None
            if not request.config.get('strict_mode'):
                # Do the same as Google Chrome.
                # http://git.chromium.org/gitweb/?p=chromium/src/net.git;a=blob;f=url_request/url_request.cc;h=8597917f0cbf49c84b3bdae3a7bebacbc264f1e0;hb=HEAD#l673
                if (response.status_code == 303 and request.method != 'HEAD') \
                or (response.status_code in (codes.moved, codes.found) and request.method == 'POST'):
                    # Once we use GET, all next requests will use GET.
                    orig_args['method'] = 'GET'
                    orig_args['data'] = None
                    orig_args['files'] = None
            ## This is from requests.models._build_response
            url = response.headers['location']
            # Handle redirection without scheme (see: RFC 1808 Section 4)
            if url.startswith('//'):
                parsed_rurl = urlparse(response.url)
                url = '%s:%s' % (parsed_rurl.scheme, url)
            # Facilitate non-RFC2616-compliant 'location' headers
            # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
            if not urlparse(url).netloc:
                url = urljoin(response.url,
                                # Compliant with RFC3986, we percent
                                # encode the url.
                                requests.utils.requote_uri(url))
            ## End of code from requests.models._build_response
            if orig_referrer is False:
                # Referer disabled in original request, disable in next
                referrer = orig_referrer
            else:
                # Guess from last response
                referrer = self._get_referrer(response.url, url)
            call_args = deepcopy(orig_args)
            response = self.open(url, referrer=referrer, **call_args)
            responses.append(response)
        # get the final response
        response = responses.pop()
        # _build_response does this
        response.history = responses
        request.response = response
        return response
    def location(self, url, data=None,
-            fix_redirect=True, referrer=None,
+            allow_redirects=True, referrer=None,
            **kwargs):
        """
        Like open() but also changes the current URL and response.
@ -198,25 +243,27 @@ class BaseBrowser(object):
        Other than that, has the exact same behavior of open().
        """
-        response = self.open(url, data, fix_redirect, **kwargs)
+        response = self.open(url, data, allow_redirects, referrer, **kwargs)
        self.response = response
        self.url = self.response.url
        return response
    def open(self, url, data=None,
-            fix_redirect=True, referrer=None,
+            allow_redirects=True, referrer=None,
            **kwargs):
        """
        Wrapper around request().
        Makes a GET request, or a POST if data is not None, unless a `method`
        is explicitly provided.
        An empty `data` (not None) *will* make a POST request.
-        All request() options are available, and it is possible to disable the
+        It is a wrapper around session.request().
-        automatic method, referrer, and redirection fixes.
+        All session.request() options are available.
        You should use location() or open() and not session.request(),
        since it has some interesting additions, which are easily
        individually disabled through the arguments.
-        Call this if you do not want to "visit" the URL (for instance, you
+        Call this instead of location() if you do not want to "visit" the URL
-        are downloading a file).
+        (for instance, you are downloading a file).
        :param url: URL
        :type url: str
@ -224,14 +271,16 @@ class BaseBrowser(object):
        :param data: POST data
        :type data: str or dict or None
        :param allow_redirects: Follow redirects using follow_redirects()
        :type allow_redirects: True or False
        :param referrer: Force referrer. False to disable sending it, None for guessing
        :type referrer: str or False or None
        :rtype: :class:`requests.Response`
        """
        kwargs = deepcopy(kwargs)
        orig_args = deepcopy(kwargs)
        orig_args['referrer'] = referrer
        # guess method
        method = kwargs.pop('method', None)
        if method is None:
            if data is None:
@ -239,35 +288,36 @@ class BaseBrowser(object):
            else:
                method = 'POST'
        kwargs['data'] = data
-        if fix_redirect:
+
-            kwargs.setdefault('config', {}).setdefault('fix-redirect', True)
+        # python-requests or urllib3 does not handle
-            kwargs.setdefault('allow_redirects', False)
+        # empty POST requests properly, so some websites refuse it.
        if data is not None and len(data) == 0:
            kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
        # Use our own redirection handling
        # python-requests's own handling sucks too much to be allowed.
        kwargs.setdefault('config', {}).setdefault('strict_mode', False)
        kwargs['allow_redirects'] = False
        if referrer is None:
            referrer = self._get_referrer(self.url, url)
        if referrer:
            # Yes, it is a misspelling.
            kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
-        response = self.request(method, url, **kwargs)
+
        if self.TIMEOUT:
            kwargs.setdefault('timeout', self.TIMEOUT)
        # call python-requests
        response = self.session.request(method, url, **kwargs)
        if allow_redirects:
            response = self.follow_redirects(response, orig_args)
        # erase all cookies, python-requests does not handle them securely
        self.session.cookies = {}
        return response
    def request(self, *args, **kwargs):
        """
        Creates a Request object and calls it.
        Takes the same arguments as session.request()
        Returns a Response object.
        Most of the time, you should use location() or open() instead,
        since request() skips some of their interesting additions, which are
        easily disabled individually through their arguments.
        """
        # python-requests or urllib3 does not handle
        # empty POST requests properly, so some websites refuse it.
        data = kwargs.get('data')
        if data is not None and len(data) == 0:
            kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
        kwargs.setdefault('timeout', self.TIMEOUT)
        return self.session.request(*args, **kwargs)
    def _get_referrer(self, oldurl, newurl):
        """
        Get the referrer to send when doing a request.
@ -285,8 +335,8 @@ class BaseBrowser(object):
        """
        if oldurl is None:
            return None
-        old = urlparse.urlparse(oldurl)
+        old = urlparse(oldurl)
-        new = urlparse.urlparse(newurl)
+        new = urlparse(newurl)
        # Do not leak secure URLs to insecure URLs
        if old.scheme == 'https' and new.scheme != 'https':
            return None
@ -333,7 +383,7 @@ class DomainBrowser(BaseBrowser):
            base = self.BASEURL
        if base is None or base is False:
            base = self.url
-        return urlparse.urljoin(base, uri)
+        return urljoin(base, uri)
    def open(self, uri, *args, **kwargs):
        return BaseBrowser.open(self, self.absurl(uri), *args, **kwargs)
--- a/weboob/tools/browser2/test.py
+++ b/weboob/tools/browser2/test.py
@ -57,6 +57,47 @@ def test_redirects():
    b.location(HTTPBIN + 'redirect/1')
    assert b.url == HTTPBIN + 'get'
    r = b.location(HTTPBIN + 'redirect/1')
    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
    assert r.url == HTTPBIN + 'get'
    # Normal redirect chain
    b.url = None
    r = b.location(HTTPBIN + 'redirect/4')
    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
    assert len(r.history) == 4
    assert r.history[3].request.url == HTTPBIN + 'redirect/1'
    assert r.history[3].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
    assert r.history[2].request.url == HTTPBIN + 'redirect/2'
    assert r.history[2].request.headers.get('Referer') == HTTPBIN + 'redirect/3'
    assert r.history[1].request.url == HTTPBIN + 'redirect/3'
    assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/4'
    assert r.history[0].request.url == HTTPBIN + 'redirect/4'
    assert r.history[0].request.headers.get('Referer') == None
    assert r.url == HTTPBIN + 'get'
    # Disable all referers
    r = b.location(HTTPBIN + 'redirect/2', referrer=False)
    assert json.loads(r.text)['headers'].get('Referer') == None
    assert len(r.history) == 2
    assert r.history[1].request.headers.get('Referer') == None
    assert r.history[0].request.headers.get('Referer') == None
    assert r.url == HTTPBIN + 'get'
    # Only overrides first referer
    r = b.location(HTTPBIN + 'redirect/2', referrer='http://example.com/')
    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
    assert len(r.history) == 2
    assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
    assert r.history[0].request.headers.get('Referer') == 'http://example.com/'
    assert r.url == HTTPBIN + 'get'
    # Don't follow
    r = b.location(HTTPBIN + 'redirect/2', allow_redirects=False)
    assert len(r.history) == 0
    assert r.url == HTTPBIN + 'redirect/2'
    assert r.status_code == 302
 def test_brokenpost():
    """
@ -179,6 +220,16 @@ def test_referrer():
    r = b.location(HTTPBIN + 'headers')
    assert 'Referer' not in json.loads(r.text)['headers']
    # Force another referrer
    r = b.location(HTTPBIN + 'get')
    r = b.location(HTTPBIN + 'headers', referrer='http://example.com/')
    assert json.loads(r.text)['headers'].get('Referer') == 'http://example.com/'
    # Force no referrer
    r = b.location(HTTPBIN + 'get')
    r = b.location(HTTPBIN + 'headers', referrer=False)
    assert 'Referer' not in json.loads(r.text)['headers']
    assert b._get_referrer('https://example.com/', 'http://example.com/') is None