browser2: Handle Referrer, more documentation
This commit is contained in:
parent
d2c4085508
commit
538c0ee92b
2 changed files with 78 additions and 5 deletions
|
|
@ -187,21 +187,31 @@ class BaseBrowser(object):
|
||||||
|
|
||||||
return request.response
|
return request.response
|
||||||
|
|
||||||
def location(self, url, data=None,
             fix_redirect=True, referrer=None,
             **kwargs):
    """
    Like open() but also changes the current URL and response.
    This is the most common method to request web pages.

    Other than that, it has the exact same behavior as open(): every
    argument, including `referrer`, is forwarded to it.

    :param url: URL
    :type url: str

    :param data: POST data
    :type data: str or dict or None

    :param fix_redirect: Fix POST 302 redirects
    :type fix_redirect: True or False

    :param referrer: Force referrer. False to disable sending it, None for guessing
    :type referrer: str or False or None

    :rtype: :class:`requests.Response`
    """
    # `referrer` is a named parameter here, so it is not part of **kwargs;
    # it has to be forwarded explicitly or open() would always guess it.
    response = self.open(url, data, fix_redirect,
                         referrer=referrer, **kwargs)
    # Remember what was just visited: the response, and the URL it reports.
    self.response = response
    self.url = self.response.url
    return response
|
||||||
|
|
||||||
def open(self, url, data=None, fix_redirect=True, **kwargs):
|
def open(self, url, data=None,
|
||||||
|
fix_redirect=True, referrer=None,
|
||||||
|
**kwargs):
|
||||||
"""
|
"""
|
||||||
Wrapper around request().
|
Wrapper around request().
|
||||||
Makes a GET request, or a POST if data is not None.
|
Makes a GET request, or a POST if data is not None, unless a `method`
|
||||||
An empty data *will* make a post.
|
is explicitly provided.
|
||||||
|
An empty `data` (not None) *will* make a post.
|
||||||
|
|
||||||
|
All request() options are available, and it is possible to disable the
|
||||||
|
automatic method, referrer, and redirection fixes.
|
||||||
|
|
||||||
Call this if you do not want to "visit" the URL (for instance, you
|
Call this if you do not want to "visit" the URL (for instance, you
|
||||||
are downloading a file).
|
are downloading a file).
|
||||||
|
|
@ -209,6 +219,15 @@ class BaseBrowser(object):
|
||||||
:param url: URL
|
:param url: URL
|
||||||
:type url: str
|
:type url: str
|
||||||
|
|
||||||
|
:param data: POST data
|
||||||
|
:type data: str or dict or None
|
||||||
|
|
||||||
|
:param fix_redirect: Fix POST 302 redirects
|
||||||
|
:type fix_redirect: True or False
|
||||||
|
|
||||||
|
:param referrer: Force referrer. False to disable sending it, None for guessing
|
||||||
|
:type referrer: str or False or None
|
||||||
|
|
||||||
:rtype: :class:`requests.Response`
|
:rtype: :class:`requests.Response`
|
||||||
"""
|
"""
|
||||||
method = kwargs.pop('method', None)
|
method = kwargs.pop('method', None)
|
||||||
|
|
@ -220,6 +239,11 @@ class BaseBrowser(object):
|
||||||
kwargs['data'] = data
|
kwargs['data'] = data
|
||||||
if fix_redirect:
|
if fix_redirect:
|
||||||
kwargs.setdefault('config', {}).setdefault('fix-redirect', True)
|
kwargs.setdefault('config', {}).setdefault('fix-redirect', True)
|
||||||
|
if referrer is None:
|
||||||
|
referrer = self._get_referrer(self.url, url)
|
||||||
|
if referrer:
|
||||||
|
# Yes, it is a misspelling.
|
||||||
|
kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
|
||||||
response = self.request(method, url, **kwargs)
|
response = self.request(method, url, **kwargs)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
@ -229,7 +253,9 @@ class BaseBrowser(object):
|
||||||
Takes the same arguments as request.request()
|
Takes the same arguments as request.request()
|
||||||
Returns a Response object.
|
Returns a Response object.
|
||||||
|
|
||||||
Most of the time, you should use location() or open().
|
Most of the time, you should use location() or open() instead,
|
||||||
|
since request() skips some useful additions (automatic method, referrer
|
||||||
|
and redirection fixes), each of which can be disabled through their arguments.
|
||||||
"""
|
"""
|
||||||
# python-requests or urllib3 does not handle
|
# python-requests or urllib3 does not handle
|
||||||
# empty POST requests properly, so some websites refuse it.
|
# empty POST requests properly, so some websites refuse it.
|
||||||
|
|
@ -239,6 +265,38 @@ class BaseBrowser(object):
|
||||||
kwargs.setdefault('timeout', self.TIMEOUT)
|
kwargs.setdefault('timeout', self.TIMEOUT)
|
||||||
return self.session.request(*args, **kwargs)
|
return self.session.request(*args, **kwargs)
|
||||||
|
|
||||||
|
def _get_referrer(self, oldurl, newurl):
    """
    Get the referrer to send when doing a request.
    If we should not send a referrer, it will return None.

    The fragment and any ``user:password@`` part of `oldurl` are never
    sent, since RFC 7231 (section 5.5.2) forbids them in a Referer header.

    Reference: https://en.wikipedia.org/wiki/HTTP_referer

    :param oldurl: Current absolute URL
    :type oldurl: str or None

    :param newurl: Target absolute URL
    :type newurl: str

    :rtype: str or None
    """
    # No current page: nothing to refer from.
    if oldurl is None:
        return None

    old = urlparse.urlparse(oldurl)
    new = urlparse.urlparse(newurl)

    # Do not leak secure URLs to insecure URLs
    if old.scheme == 'https' and new.scheme != 'https':
        return None

    # Reloading the page. Usually no referrer.
    if oldurl == newurl:
        return None

    # TODO maybe implement some *optional* privacy features:
    # * do not leak referrer to other domains (often breaks websites)
    # * send a fake referrer (root of the current domain)
    # * never send the referrer
    # Inspired by the RefControl Firefox addon.

    # Only rebuild the URL when something has to be removed, so that
    # ordinary URLs are still returned exactly as they were given.
    if '#' in oldurl or '@' in old.netloc:
        # Keep only what follows the last '@' (the host and optional port).
        netloc = old.netloc.rpartition('@')[2]
        return urlparse.urlunparse(old._replace(netloc=netloc, fragment=''))
    return oldurl
|
||||||
|
|
||||||
|
|
||||||
class DomainBrowser(BaseBrowser):
|
class DomainBrowser(BaseBrowser):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -160,3 +160,18 @@ def test_changereq():
|
||||||
r = b.location(HTTPBIN + 'headers', headers={'User-Agent': 'Web Out of Browsers'})
|
r = b.location(HTTPBIN + 'headers', headers={'User-Agent': 'Web Out of Browsers'})
|
||||||
assert 'Web Out of Browsers' in r.text
|
assert 'Web Out of Browsers' in r.text
|
||||||
assert 'Firefox' not in r.text
|
assert 'Firefox' not in r.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_referrer():
    """
    Check that the browser sets the Referer header automatically.
    """
    browser = BaseBrowser()

    # First visit: there is no previous page, so no referrer is expected.
    first = browser.location(HTTPBIN + 'get')
    assert 'Referer' not in json.loads(first.text)['headers']

    # Second visit, to a different URL: the previous page is the referrer.
    second = browser.location(HTTPBIN + 'headers')
    assert json.loads(second.text)['headers'].get('Referer') == HTTPBIN + 'get'

    # Visiting the same URL again counts as a reload: no referrer.
    reloaded = browser.location(HTTPBIN + 'headers')
    assert 'Referer' not in json.loads(reloaded.text)['headers']

    # A secure page must not be given as referrer to an insecure target.
    assert browser._get_referrer('https://example.com/', 'http://example.com/') is None
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue