browser2: Handle redirects internally

python-requests' own redirect handling isn't secure enough, and some of its
behavior depends on having the latest version.
Tested with python-requests 0.10.6.

So instead of the previous hack, we now carry some code copy-pasted from
python-requests. In exchange we gain secure cookie handling (not implemented yet),
referrer handling, and "proper" redirect-on-POST behavior.
This commit is contained in:
Laurent Bachelier 2012-04-13 16:21:56 +02:00 committed by Romain Bignon
commit 57e16e9fe4
2 changed files with 185 additions and 84 deletions

View file

@ -19,10 +19,11 @@
from __future__ import absolute_import
import urlparse
from urlparse import urlparse, urljoin
import requests
from requests.status_codes import codes
from copy import deepcopy
# TODO define __all__
@ -130,67 +131,111 @@ class BaseBrowser(object):
# TODO max_retries?
# TODO connect config['verbose'] to our logger
# TODO find a way to have multiple session hooks
# lists don't work in this context
session.hooks['response'] = self._fix_redirect
profile.setup_session(session)
self.session = session
def _fix_redirect(self, response):
def follow_redirects(self, response, orig_args=None):
"""
TL;DR: Web browsers and web developers suck.
Follow redirects *properly*.
* Mimic what browsers do on 302
* TODO Handle cookies securely
Most browsers do not follow the RFC for HTTP 302
but python-requests does.
And web developers assume we don't follow it either:
https://en.wikipedia.org/wiki/Post/Redirect/Get
Gets a Response, and returns a new Response.
Used as a 'response' hook for python-requests.
This is a hack, it would be better as an option in python-requests.
What we do is run again the response building,
but this time with allow_redirects=True, and if we have a HTTP 302,
we set a temporary fake method='GET' and empty data.
So in order to have proper allow_redirects=True handling of POSTs
you have to create a request with allow_redirects=False,
and fix-redirect=True in config (which is for the first one the
python-requests default for POSTs, and for the second one the
BaseBrowser default).
:type response: :class:`requests.Response`
:type orig_args: dict
:rtype: :class:`requests.Response`
"""
# The response chain. We start with the one we got.
responses = [response]
request = response.request
# If the request wasn't redirected, and is a redirection,
# and we allowed it to be fixed,
# restart the request building, but with a changed action.
if request.allow_redirects is False \
and request.response.status_code in requests.models.REDIRECT_STATI \
and request.config.get('fix-redirect'):
if (request.response.status_code in (codes.moved, codes.found) \
and request.method == 'POST') \
or (request.response.status_code == 303 and request.method != 'HEAD'):
# force the next request to be GET
real_method = request.method
request.method = 'GET'
real_data = request.data
request.data = None
# build the response again
request.allow_redirects = True
request._build_response(response.raw)
# Default method for redirects
orig_args = orig_args or {}
orig_args.setdefault('method', request.method)
orig_args.setdefault('data', request.data)
# If we have the original arguments, take them, and fix them
orig_args.pop('url', None)
orig_referrer = orig_args.pop('referrer', None)
# Avoid infinite loops
orig_args['allow_redirects'] = False
if request.response.status_code is codes.found:
# restore info
request.method = real_method
request.data = real_data
# TL;DR: Web browsers and web developers suck.
#
# Most browsers do not follow the RFC for HTTP 302
# but python-requests does.
# And web developers assume we don't follow it either:
# https://en.wikipedia.org/wiki/Post/Redirect/Get
#
# Later python-requests versions do it that way, but to stay
# compatible with older versions, we do it ourselves here.
while request.allow_redirects is False \
and response.status_code in requests.models.REDIRECT_STATI \
and 'location' in response.headers:
## This is from requests.models._build_response
response.content # Consume socket so it can be released
return request.response
if len(responses) > response.config.get('max_redirects'):
raise requests.exceptions.TooManyRedirects()
# Release the connection back into the pool.
response.raw.release_conn()
## End of code from requests.models._build_response
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3.4
if response.status_code == codes.see_other:
orig_args['method'] = 'GET'
orig_args['data'] = None
orig_args['files'] = None
if not request.config.get('strict_mode'):
# Do the same as Google Chrome.
# http://git.chromium.org/gitweb/?p=chromium/src/net.git;a=blob;f=url_request/url_request.cc;h=8597917f0cbf49c84b3bdae3a7bebacbc264f1e0;hb=HEAD#l673
if (response.status_code == 303 and request.method != 'HEAD') \
or (response.status_code in (codes.moved, codes.found) and request.method == 'POST'):
# Once we use GET, all next requests will use GET.
orig_args['method'] = 'GET'
orig_args['data'] = None
orig_args['files'] = None
## This is from requests.models._build_response
url = response.headers['location']
# Handle redirection without scheme (see: RFC 1808 Section 4)
if url.startswith('//'):
parsed_rurl = urlparse(response.url)
url = '%s:%s' % (parsed_rurl.scheme, url)
# Facilitate non-RFC2616-compliant 'location' headers
# (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
if not urlparse(url).netloc:
url = urljoin(response.url,
# Compliant with RFC3986, we percent
# encode the url.
requests.utils.requote_uri(url))
## End of code from requests.models._build_response
if orig_referrer is False:
# Referer disabled in original request, disable in next
referrer = orig_referrer
else:
# Guess from last response
referrer = self._get_referrer(response.url, url)
call_args = deepcopy(orig_args)
response = self.open(url, referrer=referrer, **call_args)
responses.append(response)
# get the final response
response = responses.pop()
# _build_response does this
response.history = responses
request.response = response
return response
def location(self, url, data=None,
fix_redirect=True, referrer=None,
allow_redirects=True, referrer=None,
**kwargs):
"""
Like open() but also changes the current URL and response.
@ -198,25 +243,27 @@ class BaseBrowser(object):
Other than that, has the exact same behavior of open().
"""
response = self.open(url, data, fix_redirect, **kwargs)
response = self.open(url, data, allow_redirects, referrer, **kwargs)
self.response = response
self.url = self.response.url
return response
def open(self, url, data=None,
fix_redirect=True, referrer=None,
allow_redirects=True, referrer=None,
**kwargs):
"""
Wrapper around request().
Makes a GET request, or a POST if data is not None, unless a `method`
is explicitly provided.
An empty `data` (not None) *will* make a post.
All request() options are available, and it is possible to disable the
automatic method, referrer, and redirection fixes.
It is a wrapper around session.request().
All session.request() options are available.
You should use location() or open() rather than session.request(),
since they provide some interesting additions, each of which is easily
disabled individually through the arguments.
Call this if you do not want to "visit" the URL (for instance, you
are downloading a file).
Call this instead of location() if you do not want to "visit" the URL
(for instance, you are downloading a file).
:param url: URL
:type url: str
@ -224,14 +271,16 @@ class BaseBrowser(object):
:param data: POST data
:type data: str or dict or None
:param fix_redirect: Fix POST 302 redirects
:type fix_redirect: True or False
:param referrer: Force referrer. False to disable sending it, None for guessing
:type referrer: str or False or None
:rtype: :class:`requests.Response`
"""
kwargs = deepcopy(kwargs)
orig_args = deepcopy(kwargs)
orig_args['referrer'] = referrer
# guess method
method = kwargs.pop('method', None)
if method is None:
if data is None:
@ -239,35 +288,36 @@ class BaseBrowser(object):
else:
method = 'POST'
kwargs['data'] = data
if fix_redirect:
kwargs.setdefault('config', {}).setdefault('fix-redirect', True)
kwargs.setdefault('allow_redirects', False)
# python-requests or urllib3 does not handle
# empty POST requests properly, so some websites refuse it.
if data is not None and len(data) == 0:
kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
# Use our own redirection handling
# python-requests' own handling sucks too much to be allowed.
kwargs.setdefault('config', {}).setdefault('strict_mode', False)
kwargs['allow_redirects'] = False
if referrer is None:
referrer = self._get_referrer(self.url, url)
if referrer:
# Yes, the HTTP header name "Referer" is itself a misspelling.
kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
response = self.request(method, url, **kwargs)
if self.TIMEOUT:
kwargs.setdefault('timeout', self.TIMEOUT)
# call python-requests
response = self.session.request(method, url, **kwargs)
if allow_redirects:
response = self.follow_redirects(response, orig_args)
# erase all cookies, python-requests does not handle them securely
self.session.cookies = {}
return response
def request(self, *args, **kwargs):
"""
Creates a Request object and calls it.
Takes the same arguments as requests.request()
Returns a Response object.
Most of the time, you should use location() or open() instead,
since this method skips the interesting additions they provide, which are
easily individually disabled through their arguments.
"""
# python-requests or urllib3 does not handle
# empty POST requests properly, so some websites refuse it.
data = kwargs.get('data')
if data is not None and len(data) == 0:
kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
kwargs.setdefault('timeout', self.TIMEOUT)
return self.session.request(*args, **kwargs)
def _get_referrer(self, oldurl, newurl):
"""
Get the referrer to send when doing a request.
@ -285,8 +335,8 @@ class BaseBrowser(object):
"""
if oldurl is None:
return None
old = urlparse.urlparse(oldurl)
new = urlparse.urlparse(newurl)
old = urlparse(oldurl)
new = urlparse(newurl)
# Do not leak secure URLs to insecure URLs
if old.scheme == 'https' and new.scheme != 'https':
return None
@ -333,7 +383,7 @@ class DomainBrowser(BaseBrowser):
base = self.BASEURL
if base is None or base is False:
base = self.url
return urlparse.urljoin(base, uri)
return urljoin(base, uri)
def open(self, uri, *args, **kwargs):
return BaseBrowser.open(self, self.absurl(uri), *args, **kwargs)

View file

@ -57,6 +57,47 @@ def test_redirects():
b.location(HTTPBIN + 'redirect/1')
assert b.url == HTTPBIN + 'get'
r = b.location(HTTPBIN + 'redirect/1')
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
assert r.url == HTTPBIN + 'get'
# Normal redirect chain
b.url = None
r = b.location(HTTPBIN + 'redirect/4')
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
assert len(r.history) == 4
assert r.history[3].request.url == HTTPBIN + 'redirect/1'
assert r.history[3].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
assert r.history[2].request.url == HTTPBIN + 'redirect/2'
assert r.history[2].request.headers.get('Referer') == HTTPBIN + 'redirect/3'
assert r.history[1].request.url == HTTPBIN + 'redirect/3'
assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/4'
assert r.history[0].request.url == HTTPBIN + 'redirect/4'
assert r.history[0].request.headers.get('Referer') == None
assert r.url == HTTPBIN + 'get'
# Disable all referers
r = b.location(HTTPBIN + 'redirect/2', referrer=False)
assert json.loads(r.text)['headers'].get('Referer') == None
assert len(r.history) == 2
assert r.history[1].request.headers.get('Referer') == None
assert r.history[0].request.headers.get('Referer') == None
assert r.url == HTTPBIN + 'get'
# Only overrides first referer
r = b.location(HTTPBIN + 'redirect/2', referrer='http://example.com/')
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
assert len(r.history) == 2
assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
assert r.history[0].request.headers.get('Referer') == 'http://example.com/'
assert r.url == HTTPBIN + 'get'
# Don't follow
r = b.location(HTTPBIN + 'redirect/2', allow_redirects=False)
assert len(r.history) == 0
assert r.url == HTTPBIN + 'redirect/2'
assert r.status_code == 302
def test_brokenpost():
"""
@ -179,6 +220,16 @@ def test_referrer():
r = b.location(HTTPBIN + 'headers')
assert 'Referer' not in json.loads(r.text)['headers']
# Force another referrer
r = b.location(HTTPBIN + 'get')
r = b.location(HTTPBIN + 'headers', referrer='http://example.com/')
assert json.loads(r.text)['headers'].get('Referer') == 'http://example.com/'
# Force no referrer
r = b.location(HTTPBIN + 'get')
r = b.location(HTTPBIN + 'headers', referrer=False)
assert 'Referer' not in json.loads(r.text)['headers']
assert b._get_referrer('https://example.com/', 'http://example.com/') is None