diff --git a/weboob/tools/browser2/browser.py b/weboob/tools/browser2/browser.py
index a56b4397..7626c966 100644
--- a/weboob/tools/browser2/browser.py
+++ b/weboob/tools/browser2/browser.py
@@ -187,21 +187,31 @@ class BaseBrowser(object):
 
         return request.response
 
-    def location(self, url, data=None, fix_redirect=True, **kwargs):
+    def location(self, url, data=None,
+                 fix_redirect=True, referrer=None,
+                 **kwargs):
         """
         Like open() but also changes the current URL and response.
         This is the most common method to request web pages.
+
+        Other than that, has the exact same behavior as open().
         """
-        response = self.open(url, data, fix_redirect, **kwargs)
+        response = self.open(url, data, fix_redirect, referrer, **kwargs)
         self.response = response
         self.url = self.response.url
         return response
 
-    def open(self, url, data=None, fix_redirect=True, **kwargs):
+    def open(self, url, data=None,
+             fix_redirect=True, referrer=None,
+             **kwargs):
         """
         Wrapper around request().
-        Makes a GET request, or a POST if data is not None.
-        An empty data *will* make a post.
+        Makes a GET request, or a POST if data is not None, unless a `method`
+        is explicitly provided.
+        An empty `data` (not None) *will* make a post.
+
+        All request() options are available, and it is possible to disable the
+        automatic method, referrer, and redirection fixes.
 
         Call this if you do not want to "visit" the URL (for instance, you are
         downloading a file).
@@ -209,6 +219,15 @@ class BaseBrowser(object):
         :param url: URL
         :type url: str
 
+        :param data: POST data
+        :type data: str or dict or None
+
+        :param fix_redirect: Fix POST 302 redirects
+        :type fix_redirect: True or False
+
+        :param referrer: Force referrer. False to disable sending it, None for guessing
+        :type referrer: str or False or None
+
         :rtype: :class:`requests.Response`
         """
         method = kwargs.pop('method', None)
@@ -220,6 +239,11 @@ class BaseBrowser(object):
             kwargs['data'] = data
         if fix_redirect:
             kwargs.setdefault('config', {}).setdefault('fix-redirect', True)
+        if referrer is None:
+            referrer = self._get_referrer(self.url, url)
+        if referrer:
+            # Yes, it is a misspelling.
+            kwargs.setdefault('headers', {}).setdefault('Referer', referrer)
         response = self.request(method, url, **kwargs)
         return response
 
@@ -229,7 +253,9 @@ class BaseBrowser(object):
         Takes the sames arguments as request.request()
         Returns a Response object.
 
-        Most of the time, you should use location() or open().
+        Most of the time, you should use location() or open(),
+        since request() bypasses their convenient additions, each of which
+        can be disabled individually through their arguments.
         """
         # python-requests or urllib3 does not handle
         # empty POST requests properly, so some websites refuse it.
@@ -239,6 +265,38 @@ class BaseBrowser(object):
         kwargs.setdefault('timeout', self.TIMEOUT)
         return self.session.request(*args, **kwargs)
 
+    def _get_referrer(self, oldurl, newurl):
+        """
+        Get the referrer to send when doing a request.
+        If we should not send a referrer, it will return None.
+
+        Reference: https://en.wikipedia.org/wiki/HTTP_referer
+
+        :param oldurl: Current absolute URL
+        :type oldurl: str or None
+
+        :param newurl: Target absolute URL
+        :type newurl: str
+
+        :rtype: str or None
+        """
+        if oldurl is None:
+            return None
+        old = urlparse.urlparse(oldurl)
+        new = urlparse.urlparse(newurl)
+        # Do not leak secure URLs to insecure URLs
+        if old.scheme == 'https' and new.scheme != 'https':
+            return None
+        # Reloading the page. Usually no referrer.
+        if oldurl == newurl:
+            return None
+        # TODO maybe implement some *optional* privacy features:
+        # * do not leak referrer to other domains (often breaks websites)
+        # * send a fake referrer (root of the current domain)
+        # * never send the referrer
+        # Inspired by the RefControl Firefox addon.
+        return oldurl
+
 
 class DomainBrowser(BaseBrowser):
     """
diff --git a/weboob/tools/browser2/test.py b/weboob/tools/browser2/test.py
index 9368c02f..026f4c77 100644
--- a/weboob/tools/browser2/test.py
+++ b/weboob/tools/browser2/test.py
@@ -160,3 +160,25 @@ def test_changereq():
     r = b.location(HTTPBIN + 'headers', headers={'User-Agent': 'Web Out of Browsers'})
     assert 'Web Out of Browsers' in r.text
     assert 'Firefox' not in r.text
+
+
+def test_referrer():
+    """
+    Test automatic referrer setting
+    """
+    b = BaseBrowser()
+    r = b.location(HTTPBIN + 'get')
+    assert 'Referer' not in json.loads(r.text)['headers']
+    r = b.location(HTTPBIN + 'headers')
+    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'get'
+    r = b.location(HTTPBIN + 'headers')
+    assert 'Referer' not in json.loads(r.text)['headers']
+
+    # Explicit referrer arguments must reach open() through location()
+    r = b.location(HTTPBIN + 'get', referrer=False)
+    assert 'Referer' not in json.loads(r.text)['headers']
+    r = b.location(HTTPBIN + 'headers', referrer='http://example.com/')
+    headers = json.loads(r.text)['headers']
+    assert headers.get('Referer') == 'http://example.com/'
+
+    assert b._get_referrer('https://example.com/', 'http://example.com/') is None