browser2: Handle redirects internally

python-requests' own redirect handling isn't secure enough, and some of
the behavior we need is only present in its latest versions.
Tested on 0.10.6.

So instead of the previous response-hook hack, we now carry some code
copied from python-requests. In exchange we gain secure cookie handling
(not there yet), referrer handling, and "proper" redirect-on-POST behavior.
This commit is contained in:
Laurent Bachelier 2012-04-13 16:21:56 +02:00 committed by Romain Bignon
commit 57e16e9fe4
2 changed files with 185 additions and 84 deletions

View file

@ -19,10 +19,11 @@
from __future__ import absolute_import from __future__ import absolute_import
import urlparse from urlparse import urlparse, urljoin
import requests import requests
from requests.status_codes import codes from requests.status_codes import codes
from copy import deepcopy
# TODO define __all__ # TODO define __all__
@ -130,67 +131,111 @@ class BaseBrowser(object):
# TODO max_retries? # TODO max_retries?
# TODO connect config['verbose'] to our logger # TODO connect config['verbose'] to our logger
# TODO find a way to have multiple session hooks
# lists don't work in this context
session.hooks['response'] = self._fix_redirect
profile.setup_session(session) profile.setup_session(session)
self.session = session self.session = session
def _fix_redirect(self, response): def follow_redirects(self, response, orig_args=None):
""" """
TL;DR: Web browsers and web developers suck. Follow redirects *properly*.
* Mimic what browsers do on 302
* TODO Handle cookies securely
Most browsers do not follow the RFC for HTTP 302 :type response: :class:`requests.Response`
but python-requests does. :type orig_args: dict
And web developers assume we don't follow it either: :rtype: :class:`requests.Response`
https://en.wikipedia.org/wiki/Post/Redirect/Get
Gets a Response, and returns a new Response.
Used as a 'response' hook for python-requests.
This is a hack, it would be better as an option in python-requests.
What we do is run again the response building,
but this time with allow_redirects=True, and if we have a HTTP 302,
we set a temporary fake method='GET' and empty data.
So in order to have proper allow_redirects=True handling of POSTs
you have to create a request with allow_redirects=False,
and fix-redirect=True in config (which is for the first one the
python-requests default for POSTs, and for the second one the
BaseBrowser default).
""" """
# The response chain. We start with the one we got.
responses = [response]
request = response.request request = response.request
# If the request wasn't redirected, and is a redirection,
# and we allowed it to be fixed,
# restart the request building, but with a changed action.
if request.allow_redirects is False \
and request.response.status_code in requests.models.REDIRECT_STATI \
and request.config.get('fix-redirect'):
if (request.response.status_code in (codes.moved, codes.found) \
and request.method == 'POST') \
or (request.response.status_code == 303 and request.method != 'HEAD'):
# force the next request to be GET
real_method = request.method
request.method = 'GET'
real_data = request.data
request.data = None
# build the response again # Default method for redirects
request.allow_redirects = True orig_args = orig_args or {}
request._build_response(response.raw) orig_args.setdefault('method', request.method)
orig_args.setdefault('data', request.data)
# If we have the original arguments, take them, and fix them
orig_args.pop('url', None)
orig_referrer = orig_args.pop('referrer', None)
# Avoid infinite loops
orig_args['allow_redirects'] = False
if request.response.status_code is codes.found: # TL;DR: Web browsers and web developers suck.
# restore info #
request.method = real_method # Most browsers do not follow the RFC for HTTP 302
request.data = real_data # but python-requests does.
# And web developers assume we don't follow it either:
# https://en.wikipedia.org/wiki/Post/Redirect/Get
#
# Later python-requests versions do it that way, but to stay
# compatible with older versions, we do it ourselves here.
while request.allow_redirects is False \
and response.status_code in requests.models.REDIRECT_STATI \
and 'location' in response.headers:
## This is from requests.models._build_response
response.content # Consume socket so it can be released
return request.response if len(responses) > response.config.get('max_redirects'):
raise requests.exceptions.TooManyRedirects()
# Release the connection back into the pool.
response.raw.release_conn()
## End of code from requests.models._build_response
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3.4
if response.status_code == codes.see_other:
orig_args['method'] = 'GET'
orig_args['data'] = None
orig_args['files'] = None
if not request.config.get('strict_mode'):
# Do the same as Google Chrome.
# http://git.chromium.org/gitweb/?p=chromium/src/net.git;a=blob;f=url_request/url_request.cc;h=8597917f0cbf49c84b3bdae3a7bebacbc264f1e0;hb=HEAD#l673
if (response.status_code == 303 and request.method != 'HEAD') \
or (response.status_code in (codes.moved, codes.found) and request.method == 'POST'):
# Once we use GET, all next requests will use GET.
orig_args['method'] = 'GET'
orig_args['data'] = None
orig_args['files'] = None
## This is from requests.models._build_response
url = response.headers['location']
# Handle redirection without scheme (see: RFC 1808 Section 4)
if url.startswith('//'):
parsed_rurl = urlparse(response.url)
url = '%s:%s' % (parsed_rurl.scheme, url)
# Facilitate non-RFC2616-compliant 'location' headers
# (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
if not urlparse(url).netloc:
url = urljoin(response.url,
# Compliant with RFC3986, we percent
# encode the url.
requests.utils.requote_uri(url))
## End of code from requests.models._build_response
if orig_referrer is False:
# Referer disabled in original request, disable in next
referrer = orig_referrer
else:
# Guess from last response
referrer = self._get_referrer(response.url, url)
call_args = deepcopy(orig_args)
response = self.open(url, referrer=referrer, **call_args)
responses.append(response)
# get the final response
response = responses.pop()
# _build_response does this
response.history = responses
request.response = response
return response
def location(self, url, data=None, def location(self, url, data=None,
fix_redirect=True, referrer=None, allow_redirects=True, referrer=None,
**kwargs): **kwargs):
""" """
Like open() but also changes the current URL and response. Like open() but also changes the current URL and response.
@ -198,25 +243,27 @@ class BaseBrowser(object):
Other than that, has the exact same behavior of open(). Other than that, has the exact same behavior of open().
""" """
response = self.open(url, data, fix_redirect, **kwargs) response = self.open(url, data, allow_redirects, referrer, **kwargs)
self.response = response self.response = response
self.url = self.response.url self.url = self.response.url
return response return response
def open(self, url, data=None, def open(self, url, data=None,
fix_redirect=True, referrer=None, allow_redirects=True, referrer=None,
**kwargs): **kwargs):
""" """
Wrapper around request().
Makes a GET request, or a POST if data is not None, unless a `method` Makes a GET request, or a POST if data is not None, unless a `method`
is explicitly provided. is explicitly provided.
An empty `data` (not None) *will* make a post. An empty `data` (not None) *will* make a post.
All request() options are available, and it is possible to disable the It is a wrapper around session.request().
automatic method, referrer, and redirection fixes. All session.request() options are available.
You should use location() or open() and not session.request(),
since it has some interesting additions, which are easily
individually disabled through the arguments.
Call this if you do not want to "visit" the URL (for instance, you Call this instead of location() if you do not want to "visit" the URL
are downloading a file). (for instance, you are downloading a file).
:param url: URL :param url: URL
:type url: str :type url: str
@ -224,14 +271,16 @@ class BaseBrowser(object):
:param data: POST data :param data: POST data
:type url: str or dict or None :type url: str or dict or None
:param fix_redirect: Fix POST 302 redirects
:type fix_redirect: True or False
:param referrer: Force referrer. False to disable sending it, None for guessing :param referrer: Force referrer. False to disable sending it, None for guessing
:type referrer: str or False or None :type referrer: str or False or None
:rtype: :class:`requests.Response` :rtype: :class:`requests.Response`
""" """
kwargs = deepcopy(kwargs)
orig_args = deepcopy(kwargs)
orig_args['referrer'] = referrer
# guess method
method = kwargs.pop('method', None) method = kwargs.pop('method', None)
if method is None: if method is None:
if data is None: if data is None:
@ -239,35 +288,36 @@ class BaseBrowser(object):
else: else:
method = 'POST' method = 'POST'
kwargs['data'] = data kwargs['data'] = data
if fix_redirect:
kwargs.setdefault('config', {}).setdefault('fix-redirect', True) # python-requests or urllib3 does not handle
kwargs.setdefault('allow_redirects', False) # empty POST requests properly, so some websites refuse it.
if data is not None and len(data) == 0:
kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
# Use our own redirection handling;
# python-requests' own handling sucks too much to be allowed.
kwargs.setdefault('config', {}).setdefault('strict_mode', False)
kwargs['allow_redirects'] = False
if referrer is None: if referrer is None:
referrer = self._get_referrer(self.url, url) referrer = self._get_referrer(self.url, url)
if referrer: if referrer:
# Yes, it is a misspelling. # Yes, it is a misspelling.
kwargs.setdefault('headers', {}).setdefault('Referer', referrer) kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
response = self.request(method, url, **kwargs)
if self.TIMEOUT:
kwargs.setdefault('timeout', self.TIMEOUT)
# call python-requests
response = self.session.request(method, url, **kwargs)
if allow_redirects:
response = self.follow_redirects(response, orig_args)
# erase all cookies, python-requests does not handle them securely
self.session.cookies = {}
return response return response
def request(self, *args, **kwargs):
"""
Creates a Request object and calls it.
Takes the sames arguments as request.request()
Returns a Response object.
Most of the time, you should use location() or open(),
since it ignores some interesting additions, which are easily
individually disabled through the arguments.
"""
# python-requests or urllib3 does not handle
# empty POST requests properly, so some websites refuse it.
data = kwargs.get('data')
if data is not None and len(data) == 0:
kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
kwargs.setdefault('timeout', self.TIMEOUT)
return self.session.request(*args, **kwargs)
def _get_referrer(self, oldurl, newurl): def _get_referrer(self, oldurl, newurl):
""" """
Get the referrer to send when doing a request. Get the referrer to send when doing a request.
@ -285,8 +335,8 @@ class BaseBrowser(object):
""" """
if oldurl is None: if oldurl is None:
return None return None
old = urlparse.urlparse(oldurl) old = urlparse(oldurl)
new = urlparse.urlparse(newurl) new = urlparse(newurl)
# Do not leak secure URLs to insecure URLs # Do not leak secure URLs to insecure URLs
if old.scheme == 'https' and new.scheme != 'https': if old.scheme == 'https' and new.scheme != 'https':
return None return None
@ -333,7 +383,7 @@ class DomainBrowser(BaseBrowser):
base = self.BASEURL base = self.BASEURL
if base is None or base is False: if base is None or base is False:
base = self.url base = self.url
return urlparse.urljoin(base, uri) return urljoin(base, uri)
def open(self, uri, *args, **kwargs): def open(self, uri, *args, **kwargs):
return BaseBrowser.open(self, self.absurl(uri), *args, **kwargs) return BaseBrowser.open(self, self.absurl(uri), *args, **kwargs)

View file

@ -57,6 +57,47 @@ def test_redirects():
b.location(HTTPBIN + 'redirect/1') b.location(HTTPBIN + 'redirect/1')
assert b.url == HTTPBIN + 'get' assert b.url == HTTPBIN + 'get'
r = b.location(HTTPBIN + 'redirect/1')
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
assert r.url == HTTPBIN + 'get'
# Normal redirect chain
b.url = None
r = b.location(HTTPBIN + 'redirect/4')
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
assert len(r.history) == 4
assert r.history[3].request.url == HTTPBIN + 'redirect/1'
assert r.history[3].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
assert r.history[2].request.url == HTTPBIN + 'redirect/2'
assert r.history[2].request.headers.get('Referer') == HTTPBIN + 'redirect/3'
assert r.history[1].request.url == HTTPBIN + 'redirect/3'
assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/4'
assert r.history[0].request.url == HTTPBIN + 'redirect/4'
assert r.history[0].request.headers.get('Referer') == None
assert r.url == HTTPBIN + 'get'
# Disable all referers
r = b.location(HTTPBIN + 'redirect/2', referrer=False)
assert json.loads(r.text)['headers'].get('Referer') == None
assert len(r.history) == 2
assert r.history[1].request.headers.get('Referer') == None
assert r.history[0].request.headers.get('Referer') == None
assert r.url == HTTPBIN + 'get'
# Only overrides first referer
r = b.location(HTTPBIN + 'redirect/2', referrer='http://example.com/')
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
assert len(r.history) == 2
assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
assert r.history[0].request.headers.get('Referer') == 'http://example.com/'
assert r.url == HTTPBIN + 'get'
# Don't follow
r = b.location(HTTPBIN + 'redirect/2', allow_redirects=False)
assert len(r.history) == 0
assert r.url == HTTPBIN + 'redirect/2'
assert r.status_code == 302
def test_brokenpost(): def test_brokenpost():
""" """
@ -179,6 +220,16 @@ def test_referrer():
r = b.location(HTTPBIN + 'headers') r = b.location(HTTPBIN + 'headers')
assert 'Referer' not in json.loads(r.text)['headers'] assert 'Referer' not in json.loads(r.text)['headers']
# Force another referrer
r = b.location(HTTPBIN + 'get')
r = b.location(HTTPBIN + 'headers', referrer='http://example.com/')
assert json.loads(r.text)['headers'].get('Referer') == 'http://example.com/'
# Force no referrer
r = b.location(HTTPBIN + 'get')
r = b.location(HTTPBIN + 'headers', referrer=False)
assert 'Referer' not in json.loads(r.text)['headers']
assert b._get_referrer('https://example.com/', 'http://example.com/') is None assert b._get_referrer('https://example.com/', 'http://example.com/') is None