browser2: Handle redirects internally
python-requests isn't secure enough, and some of its behavior depends on the latest version (this was tested on 0.10.6). So instead of the previous hack, we now carry some copy-pasted code. In exchange, we gain secure cookie handling (not there yet), referrer handling, and "proper" redirect-on-POST behavior.
This commit is contained in:
parent
4b802f32dd
commit
57e16e9fe4
2 changed files with 185 additions and 84 deletions
|
|
@ -19,10 +19,11 @@
|
||||||
|
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
import urlparse
|
from urlparse import urlparse, urljoin
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from requests.status_codes import codes
|
from requests.status_codes import codes
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
|
||||||
# TODO define __all__
|
# TODO define __all__
|
||||||
|
|
@ -130,67 +131,111 @@ class BaseBrowser(object):
|
||||||
# TODO max_retries?
|
# TODO max_retries?
|
||||||
# TODO connect config['verbose'] to our logger
|
# TODO connect config['verbose'] to our logger
|
||||||
|
|
||||||
# TODO find a way to have multiple session hooks
|
|
||||||
# lists don't work in this context
|
|
||||||
session.hooks['response'] = self._fix_redirect
|
|
||||||
|
|
||||||
profile.setup_session(session)
|
profile.setup_session(session)
|
||||||
|
|
||||||
self.session = session
|
self.session = session
|
||||||
|
|
||||||
def _fix_redirect(self, response):
|
def follow_redirects(self, response, orig_args=None):
|
||||||
"""
|
"""
|
||||||
TL;DR: Web browsers and web developers suck.
|
Follow redirects *properly*.
|
||||||
|
* Mimic what browsers do on 302
|
||||||
|
* TODO Handle cookies securely
|
||||||
|
|
||||||
Most browsers do not follow the RFC for HTTP 302
|
:type response: :class:`requests.Response`
|
||||||
but python-requests does.
|
:type orig_args: dict
|
||||||
And web developers assume we don't follow it either:
|
:rtype: :class:`requests.Response`
|
||||||
https://en.wikipedia.org/wiki/Post/Redirect/Get
|
|
||||||
|
|
||||||
Gets a Response, and returns a new Response.
|
|
||||||
Used as a 'response' hook for python-requests.
|
|
||||||
|
|
||||||
This is a hack, it would be better as an option in python-requests.
|
|
||||||
|
|
||||||
What we do is run again the response building,
|
|
||||||
but this time with allow_redirects=True, and if we have a HTTP 302,
|
|
||||||
we set a temporary fake method='GET' and empty data.
|
|
||||||
|
|
||||||
So in order to have proper allow_redirects=True handling of POSTs
|
|
||||||
you have to create a request with allow_redirects=False,
|
|
||||||
and fix-redirect=True in config (which is for the first one the
|
|
||||||
python-requests default for POSTs, and for the second one the
|
|
||||||
BaseBrowser default).
|
|
||||||
"""
|
"""
|
||||||
|
# The response chain. We start with the one we got.
|
||||||
|
responses = [response]
|
||||||
request = response.request
|
request = response.request
|
||||||
# If the request wasn't redirected, and is a redirection,
|
|
||||||
# and we allowed it to be fixed,
|
|
||||||
# restart the request building, but with a changed action.
|
|
||||||
if request.allow_redirects is False \
|
|
||||||
and request.response.status_code in requests.models.REDIRECT_STATI \
|
|
||||||
and request.config.get('fix-redirect'):
|
|
||||||
if (request.response.status_code in (codes.moved, codes.found) \
|
|
||||||
and request.method == 'POST') \
|
|
||||||
or (request.response.status_code == 303 and request.method != 'HEAD'):
|
|
||||||
# force the next request to be GET
|
|
||||||
real_method = request.method
|
|
||||||
request.method = 'GET'
|
|
||||||
real_data = request.data
|
|
||||||
request.data = None
|
|
||||||
|
|
||||||
# build the response again
|
# Default method for redirects
|
||||||
request.allow_redirects = True
|
orig_args = orig_args or {}
|
||||||
request._build_response(response.raw)
|
orig_args.setdefault('method', request.method)
|
||||||
|
orig_args.setdefault('data', request.data)
|
||||||
|
# If we have the original arguments, take them, and fix them
|
||||||
|
orig_args.pop('url', None)
|
||||||
|
orig_referrer = orig_args.pop('referrer', None)
|
||||||
|
# Avoid infinite loops
|
||||||
|
orig_args['allow_redirects'] = False
|
||||||
|
|
||||||
if request.response.status_code is codes.found:
|
# TL;DR: Web browsers and web developers suck.
|
||||||
# restore info
|
#
|
||||||
request.method = real_method
|
# Most browsers do not follow the RFC for HTTP 302
|
||||||
request.data = real_data
|
# but python-requests does.
|
||||||
|
# And web developers assume we don't follow it either:
|
||||||
|
# https://en.wikipedia.org/wiki/Post/Redirect/Get
|
||||||
|
#
|
||||||
|
# Later python-requests versions do it that way, but to stay
|
||||||
|
# compatible with older versions, we use this.
|
||||||
|
while request.allow_redirects is False \
|
||||||
|
and response.status_code in requests.models.REDIRECT_STATI \
|
||||||
|
and 'location' in response.headers:
|
||||||
|
## This is from requests.models._build_response
|
||||||
|
response.content # Consume socket so it can be released
|
||||||
|
|
||||||
return request.response
|
if len(responses) > response.config.get('max_redirects'):
|
||||||
|
raise requests.exceptions.TooManyRedirects()
|
||||||
|
|
||||||
|
# Release the connection back into the pool.
|
||||||
|
response.raw.release_conn()
|
||||||
|
## End of code from requests.models._build_response
|
||||||
|
|
||||||
|
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3.4
|
||||||
|
if response.status_code == codes.see_other:
|
||||||
|
orig_args['method'] = 'GET'
|
||||||
|
orig_args['data'] = None
|
||||||
|
orig_args['files'] = None
|
||||||
|
|
||||||
|
if not request.config.get('strict_mode'):
|
||||||
|
# Do the same as Google Chrome.
|
||||||
|
# http://git.chromium.org/gitweb/?p=chromium/src/net.git;a=blob;f=url_request/url_request.cc;h=8597917f0cbf49c84b3bdae3a7bebacbc264f1e0;hb=HEAD#l673
|
||||||
|
if (response.status_code == 303 and request.method != 'HEAD') \
|
||||||
|
or (response.status_code in (codes.moved, codes.found) and request.method == 'POST'):
|
||||||
|
# Once we use GET, all next requests will use GET.
|
||||||
|
orig_args['method'] = 'GET'
|
||||||
|
orig_args['data'] = None
|
||||||
|
orig_args['files'] = None
|
||||||
|
|
||||||
|
## This is from requests.models._build_response
|
||||||
|
url = response.headers['location']
|
||||||
|
|
||||||
|
# Handle redirection without scheme (see: RFC 1808 Section 4)
|
||||||
|
if url.startswith('//'):
|
||||||
|
parsed_rurl = urlparse(response.url)
|
||||||
|
url = '%s:%s' % (parsed_rurl.scheme, url)
|
||||||
|
|
||||||
|
# Facilitate non-RFC2616-compliant 'location' headers
|
||||||
|
# (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
|
||||||
|
if not urlparse(url).netloc:
|
||||||
|
url = urljoin(response.url,
|
||||||
|
# Compliant with RFC3986, we percent
|
||||||
|
# encode the url.
|
||||||
|
requests.utils.requote_uri(url))
|
||||||
|
|
||||||
|
## End of code from requests.models._build_response
|
||||||
|
|
||||||
|
if orig_referrer is False:
|
||||||
|
# Referer disabled in original request, disable in next
|
||||||
|
referrer = orig_referrer
|
||||||
|
else:
|
||||||
|
# Guess from last response
|
||||||
|
referrer = self._get_referrer(response.url, url)
|
||||||
|
|
||||||
|
call_args = deepcopy(orig_args)
|
||||||
|
response = self.open(url, referrer=referrer, **call_args)
|
||||||
|
responses.append(response)
|
||||||
|
|
||||||
|
# get the final response
|
||||||
|
response = responses.pop()
|
||||||
|
# _build_response does this
|
||||||
|
response.history = responses
|
||||||
|
request.response = response
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
def location(self, url, data=None,
|
def location(self, url, data=None,
|
||||||
fix_redirect=True, referrer=None,
|
allow_redirects=True, referrer=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""
|
"""
|
||||||
Like open() but also changes the current URL and response.
|
Like open() but also changes the current URL and response.
|
||||||
|
|
@ -198,25 +243,27 @@ class BaseBrowser(object):
|
||||||
|
|
||||||
Other than that, has the exact same behavior of open().
|
Other than that, has the exact same behavior of open().
|
||||||
"""
|
"""
|
||||||
response = self.open(url, data, fix_redirect, **kwargs)
|
response = self.open(url, data, allow_redirects, referrer, **kwargs)
|
||||||
self.response = response
|
self.response = response
|
||||||
self.url = self.response.url
|
self.url = self.response.url
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def open(self, url, data=None,
|
def open(self, url, data=None,
|
||||||
fix_redirect=True, referrer=None,
|
allow_redirects=True, referrer=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""
|
"""
|
||||||
Wrapper around request().
|
|
||||||
Makes a GET request, or a POST if data is not None, unless a `method`
|
Makes a GET request, or a POST if data is not None, unless a `method`
|
||||||
is explicitly provided.
|
is explicitly provided.
|
||||||
An empty `data` (not None) *will* make a post.
|
An empty `data` (not None) *will* make a post.
|
||||||
|
|
||||||
All request() options are available, and it is possible to disable the
|
It is a wrapper around session.request().
|
||||||
automatic method, referrer, and redirection fixes.
|
All session.request() options are available.
|
||||||
|
You should use location() or open() and not session.request(),
|
||||||
|
since it has some interesting additions, which are easily
|
||||||
|
individually disabled through the arguments.
|
||||||
|
|
||||||
Call this if you do not want to "visit" the URL (for instance, you
|
Call this instead of location() if you do not want to "visit" the URL
|
||||||
are downloading a file).
|
(for instance, you are downloading a file).
|
||||||
|
|
||||||
:param url: URL
|
:param url: URL
|
||||||
:type url: str
|
:type url: str
|
||||||
|
|
@ -224,14 +271,16 @@ class BaseBrowser(object):
|
||||||
:param data: POST data
|
:param data: POST data
|
||||||
:type data: str or dict or None
|
:type data: str or dict or None
|
||||||
|
|
||||||
:param fix_redirect: Fix POST 302 redirects
|
|
||||||
:type fix_redirect: True or False
|
|
||||||
|
|
||||||
:param referrer: Force referrer. False to disable sending it, None for guessing
|
:param referrer: Force referrer. False to disable sending it, None for guessing
|
||||||
:type referrer: str or False or None
|
:type referrer: str or False or None
|
||||||
|
|
||||||
:rtype: :class:`requests.Response`
|
:rtype: :class:`requests.Response`
|
||||||
"""
|
"""
|
||||||
|
kwargs = deepcopy(kwargs)
|
||||||
|
orig_args = deepcopy(kwargs)
|
||||||
|
orig_args['referrer'] = referrer
|
||||||
|
|
||||||
|
# guess method
|
||||||
method = kwargs.pop('method', None)
|
method = kwargs.pop('method', None)
|
||||||
if method is None:
|
if method is None:
|
||||||
if data is None:
|
if data is None:
|
||||||
|
|
@ -239,35 +288,36 @@ class BaseBrowser(object):
|
||||||
else:
|
else:
|
||||||
method = 'POST'
|
method = 'POST'
|
||||||
kwargs['data'] = data
|
kwargs['data'] = data
|
||||||
if fix_redirect:
|
|
||||||
kwargs.setdefault('config', {}).setdefault('fix-redirect', True)
|
# python-requests or urllib3 does not handle
|
||||||
kwargs.setdefault('allow_redirects', False)
|
# empty POST requests properly, so some websites refuse it.
|
||||||
|
if data is not None and len(data) == 0:
|
||||||
|
kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
|
||||||
|
|
||||||
|
# Use our own redirection handling
|
||||||
|
# python-requests's own handling sucks too much to be allowed.
|
||||||
|
kwargs.setdefault('config', {}).setdefault('strict_mode', False)
|
||||||
|
kwargs['allow_redirects'] = False
|
||||||
|
|
||||||
if referrer is None:
|
if referrer is None:
|
||||||
referrer = self._get_referrer(self.url, url)
|
referrer = self._get_referrer(self.url, url)
|
||||||
if referrer:
|
if referrer:
|
||||||
# Yes, it is a misspelling.
|
# Yes, it is a misspelling.
|
||||||
kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
|
kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
|
||||||
response = self.request(method, url, **kwargs)
|
|
||||||
|
if self.TIMEOUT:
|
||||||
|
kwargs.setdefault('timeout', self.TIMEOUT)
|
||||||
|
|
||||||
|
# call python-requests
|
||||||
|
response = self.session.request(method, url, **kwargs)
|
||||||
|
if allow_redirects:
|
||||||
|
response = self.follow_redirects(response, orig_args)
|
||||||
|
|
||||||
|
# erase all cookies, python-requests does not handle them securely
|
||||||
|
self.session.cookies = {}
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def request(self, *args, **kwargs):
|
|
||||||
"""
|
|
||||||
Creates a Request object and calls it.
|
|
||||||
Takes the same arguments as request.request()
|
|
||||||
Returns a Response object.
|
|
||||||
|
|
||||||
Most of the time, you should use location() or open(),
|
|
||||||
since it ignores some interesting additions, which are easily
|
|
||||||
individually disabled through the arguments.
|
|
||||||
"""
|
|
||||||
# python-requests or urllib3 does not handle
|
|
||||||
# empty POST requests properly, so some websites refuse it.
|
|
||||||
data = kwargs.get('data')
|
|
||||||
if data is not None and len(data) == 0:
|
|
||||||
kwargs.setdefault('headers', {}).setdefault('Content-Length', '0')
|
|
||||||
kwargs.setdefault('timeout', self.TIMEOUT)
|
|
||||||
return self.session.request(*args, **kwargs)
|
|
||||||
|
|
||||||
def _get_referrer(self, oldurl, newurl):
|
def _get_referrer(self, oldurl, newurl):
|
||||||
"""
|
"""
|
||||||
Get the referrer to send when doing a request.
|
Get the referrer to send when doing a request.
|
||||||
|
|
@ -285,8 +335,8 @@ class BaseBrowser(object):
|
||||||
"""
|
"""
|
||||||
if oldurl is None:
|
if oldurl is None:
|
||||||
return None
|
return None
|
||||||
old = urlparse.urlparse(oldurl)
|
old = urlparse(oldurl)
|
||||||
new = urlparse.urlparse(newurl)
|
new = urlparse(newurl)
|
||||||
# Do not leak secure URLs to insecure URLs
|
# Do not leak secure URLs to insecure URLs
|
||||||
if old.scheme == 'https' and new.scheme != 'https':
|
if old.scheme == 'https' and new.scheme != 'https':
|
||||||
return None
|
return None
|
||||||
|
|
@ -333,7 +383,7 @@ class DomainBrowser(BaseBrowser):
|
||||||
base = self.BASEURL
|
base = self.BASEURL
|
||||||
if base is None or base is False:
|
if base is None or base is False:
|
||||||
base = self.url
|
base = self.url
|
||||||
return urlparse.urljoin(base, uri)
|
return urljoin(base, uri)
|
||||||
|
|
||||||
def open(self, uri, *args, **kwargs):
|
def open(self, uri, *args, **kwargs):
|
||||||
return BaseBrowser.open(self, self.absurl(uri), *args, **kwargs)
|
return BaseBrowser.open(self, self.absurl(uri), *args, **kwargs)
|
||||||
|
|
|
||||||
|
|
@ -57,6 +57,47 @@ def test_redirects():
|
||||||
b.location(HTTPBIN + 'redirect/1')
|
b.location(HTTPBIN + 'redirect/1')
|
||||||
assert b.url == HTTPBIN + 'get'
|
assert b.url == HTTPBIN + 'get'
|
||||||
|
|
||||||
|
r = b.location(HTTPBIN + 'redirect/1')
|
||||||
|
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
|
||||||
|
assert r.url == HTTPBIN + 'get'
|
||||||
|
|
||||||
|
# Normal redirect chain
|
||||||
|
b.url = None
|
||||||
|
r = b.location(HTTPBIN + 'redirect/4')
|
||||||
|
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
|
||||||
|
assert len(r.history) == 4
|
||||||
|
assert r.history[3].request.url == HTTPBIN + 'redirect/1'
|
||||||
|
assert r.history[3].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
|
||||||
|
assert r.history[2].request.url == HTTPBIN + 'redirect/2'
|
||||||
|
assert r.history[2].request.headers.get('Referer') == HTTPBIN + 'redirect/3'
|
||||||
|
assert r.history[1].request.url == HTTPBIN + 'redirect/3'
|
||||||
|
assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/4'
|
||||||
|
assert r.history[0].request.url == HTTPBIN + 'redirect/4'
|
||||||
|
assert r.history[0].request.headers.get('Referer') == None
|
||||||
|
assert r.url == HTTPBIN + 'get'
|
||||||
|
|
||||||
|
# Disable all referers
|
||||||
|
r = b.location(HTTPBIN + 'redirect/2', referrer=False)
|
||||||
|
assert json.loads(r.text)['headers'].get('Referer') == None
|
||||||
|
assert len(r.history) == 2
|
||||||
|
assert r.history[1].request.headers.get('Referer') == None
|
||||||
|
assert r.history[0].request.headers.get('Referer') == None
|
||||||
|
assert r.url == HTTPBIN + 'get'
|
||||||
|
|
||||||
|
# Only overrides first referer
|
||||||
|
r = b.location(HTTPBIN + 'redirect/2', referrer='http://example.com/')
|
||||||
|
assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
|
||||||
|
assert len(r.history) == 2
|
||||||
|
assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
|
||||||
|
assert r.history[0].request.headers.get('Referer') == 'http://example.com/'
|
||||||
|
assert r.url == HTTPBIN + 'get'
|
||||||
|
|
||||||
|
# Don't follow
|
||||||
|
r = b.location(HTTPBIN + 'redirect/2', allow_redirects=False)
|
||||||
|
assert len(r.history) == 0
|
||||||
|
assert r.url == HTTPBIN + 'redirect/2'
|
||||||
|
assert r.status_code == 302
|
||||||
|
|
||||||
|
|
||||||
def test_brokenpost():
|
def test_brokenpost():
|
||||||
"""
|
"""
|
||||||
|
|
@ -179,6 +220,16 @@ def test_referrer():
|
||||||
r = b.location(HTTPBIN + 'headers')
|
r = b.location(HTTPBIN + 'headers')
|
||||||
assert 'Referer' not in json.loads(r.text)['headers']
|
assert 'Referer' not in json.loads(r.text)['headers']
|
||||||
|
|
||||||
|
# Force another referrer
|
||||||
|
r = b.location(HTTPBIN + 'get')
|
||||||
|
r = b.location(HTTPBIN + 'headers', referrer='http://example.com/')
|
||||||
|
assert json.loads(r.text)['headers'].get('Referer') == 'http://example.com/'
|
||||||
|
|
||||||
|
# Force no referrer
|
||||||
|
r = b.location(HTTPBIN + 'get')
|
||||||
|
r = b.location(HTTPBIN + 'headers', referrer=False)
|
||||||
|
assert 'Referer' not in json.loads(r.text)['headers']
|
||||||
|
|
||||||
assert b._get_referrer('https://example.com/', 'http://example.com/') is None
|
assert b._get_referrer('https://example.com/', 'http://example.com/') is None
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue