# -*- coding: utf-8 -*- # Copyright(C) 2012 Laurent Bachelier # # This file is part of weboob. # # weboob is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # weboob is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . from __future__ import absolute_import from datetime import datetime from random import choice import re import string from requests import HTTPError from nose.plugins.skip import SkipTest from nose.tools import assert_raises from .browser import BaseBrowser, DomainBrowser, Weboob, UrlNotAllowed from .cookiejar import CookieJar, CookiePolicy from .cookies import Cookies from weboob.tools.json import json # Those services can be run locally. More or less. HTTPBIN = 'http://httpbin.org/' # https://github.com/kennethreitz/httpbin POSTBIN = 'http://www.postbin.org/' # https://github.com/progrium/postbin REQUESTBIN = 'http://requestb.in/' # https://github.com/progrium/requestbin # if you change HTTPBIN, you should also change these URLs for some tests: # redirect to http://httpbin.org/get REDIRECTS1 = ('http://tinyurl.com/ouiboube-b2', 'http://bit.ly/st4Hcv') # redirect to http://httpbin.org/cookies REDIRECTS2 = ('http://tinyurl.com/7zp3jnr', 'http://bit.ly/HZCCX7') def test_base(): b = BaseBrowser() r = b.location(HTTPBIN + 'headers') assert isinstance(r.text, unicode) assert 'Firefox' in r.text assert 'python' not in r.text assert 'identity' not in r.text assert b.url == HTTPBIN + 'headers' r = b.location(HTTPBIN + 'gzip') assert 'Firefox' in r.text def test_redirects(): """ Check redirects are followed """ b = BaseBrowser() b.location(HTTPBIN + 'redirect/1') assert b.url == HTTPBIN + 'get' r = b.location(HTTPBIN + 'redirect/1') assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1' assert r.url == HTTPBIN + 'get' # Normal redirect chain b.url = None r = b.location(HTTPBIN + 'redirect/4') assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1' assert len(r.history) == 4 assert r.history[3].request.url == HTTPBIN + 'redirect/1' assert r.history[3].request.headers.get('Referer') == HTTPBIN + 'redirect/2' assert r.history[2].request.url == HTTPBIN + 'redirect/2' assert r.history[2].request.headers.get('Referer') == HTTPBIN + 'redirect/3' assert r.history[1].request.url == HTTPBIN + 'redirect/3' assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/4' assert r.history[0].request.url == HTTPBIN + 'redirect/4' assert r.history[0].request.headers.get('Referer') is None assert r.url == HTTPBIN + 'get' # Disable all referers r = b.location(HTTPBIN + 'redirect/2', referrer=False) assert json.loads(r.text)['headers'].get('Referer') is None assert len(r.history) == 2 assert r.history[1].request.headers.get('Referer') is None assert r.history[0].request.headers.get('Referer') is None assert r.url == HTTPBIN + 'get' # Only overrides first referer r = b.location(HTTPBIN + 'redirect/2', referrer='http://example.com/') assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1' assert len(r.history) == 2 assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/2' assert r.history[0].request.headers.get('Referer') == 'http://example.com/' assert r.url == HTTPBIN + 'get' # Don't follow r = b.location(HTTPBIN + 'redirect/2', allow_redirects=False) assert len(r.history) == 0 assert r.url == HTTPBIN + 'redirect/2' assert r.status_code == 302 def test_redirect2(): """ More redirect tests """ rurl = choice(REDIRECTS1) b = BaseBrowser() r = b.location(rurl) assert r.url == HTTPBIN + 'get' assert json.loads(r.text)['headers'].get('Referer') == rurl # TODO referrer privacy settings def test_brokenpost(): """ Test empty POST and redirect after POST """ raise SkipTest('PostBin is disabled') try: b = BaseBrowser() # postbin is picky with empty posts. that's good! r = b.location(POSTBIN, {}) # ensures empty data (but not None) does a POST assert r.request.method == 'POST' # ensure we were redirected after submitting a post assert len(r.url) >= len(POSTBIN) # send a POST with data b.location(r.url, {'hello': 'world'}) r = b.location(r.url + '/feed') assert 'hello' in r.text assert 'world' in r.text except HTTPError as e: if str(e).startswith('503 '): raise SkipTest('Quota exceeded') else: raise def _getrqbin(b): """ Get a RequestBin """ # empty POST r = b.location(REQUESTBIN + 'api/v1/bins', '') name = json.loads(r.text)['name'] assert name return name def test_smartpost(): """ Checks we use POST or GET depending on the parameters """ b = BaseBrowser() n = _getrqbin(b) r = b.location(REQUESTBIN + n) assert 'ok' in r.text r = b.location(REQUESTBIN + n + '?inspect') assert 'GET /%s' % n in r.text r = b.location(REQUESTBIN + n, {'hello': 'world'}) assert 'ok' in r.text r = b.location(REQUESTBIN + n + '?inspect') assert 'POST /%s' % n in r.text assert 'hello' in r.text assert 'world' in r.text def test_weboob(): """ Test the Weboob Profile """ class BooBrowser(BaseBrowser): PROFILE = Weboob('0.0') b = BooBrowser() r = b.location(HTTPBIN + 'headers') assert 'weboob/0.0' in r.text assert 'identity' in r.text def test_relative(): """ Check relative URL / domain handling """ b = DomainBrowser() b.location(HTTPBIN) b.location('/ip') assert b.url == HTTPBIN + 'ip' assert b.absurl('/ip') == HTTPBIN + 'ip' b.location(REQUESTBIN) assert b.absurl('/ip') == REQUESTBIN + 'ip' b.BASEURL = HTTPBIN + 'aaaaaa' assert b.absurl('/ip') == HTTPBIN + 'ip' assert b.absurl('ip') == HTTPBIN + 'ip' assert b.absurl('/ip', False) == REQUESTBIN + 'ip' b.BASEURL = HTTPBIN + 'aaaaaa/' assert b.absurl('/') == HTTPBIN assert b.absurl('/bb') == HTTPBIN + 'bb' assert b.absurl('') == HTTPBIN + 'aaaaaa/' assert b.absurl('bb') == HTTPBIN + 'aaaaaa/bb' # Give an absolute URL, should get it unaltered b.BASEURL = 'http://example.net/' assert b.absurl('http://example.com/aaa/bbb') == 'http://example.com/aaa/bbb' assert b.absurl('https://example.com/aaa/bbb') == 'https://example.com/aaa/bbb' # Schemeless absolute URL assert b.absurl('//example.com/aaa/bbb') == 'http://example.com/aaa/bbb' b.BASEURL = 'https://example.net/' assert b.absurl('//example.com/aaa/bbb') == 'https://example.com/aaa/bbb' def test_allow_url(): b = DomainBrowser() b.RESTRICT_URL = True assert b.url_allowed('http://example.com/') assert b.url_allowed('http://example.net/') b.BASEURL = 'http://example.com/' assert b.url_allowed('http://example.com/') assert b.url_allowed('http://example.com/aaa') assert not b.url_allowed('https://example.com/') assert not b.url_allowed('http://example.net/') assert not b.url_allowed('http://') b.BASEURL = 'https://example.com/' assert not b.url_allowed('http://example.com/') assert not b.url_allowed('http://example.com/aaa') assert b.url_allowed('https://example.com/') assert b.url_allowed('https://example.com/aaa/bbb') b.RESTRICT_URL = ['https://example.com/', 'http://example.com/'] assert b.url_allowed('http://example.com/aaa/bbb') assert b.url_allowed('https://example.com/aaa/bbb') assert not b.url_allowed('http://example.net/aaa/bbb') assert not b.url_allowed('https://example.net/aaa/bbb') assert_raises(UrlNotAllowed, b.location, 'http://example.net/') assert_raises(UrlNotAllowed, b.open, 'http://example.net/') def test_changereq(): """ Test overloading request defaults """ b = BaseBrowser() r = b.location(HTTPBIN + 'headers', method='HEAD') assert r.text is None r = b.location(HTTPBIN + 'put', method='PUT', data={'hello': 'world'}) assert 'hello' in r.text assert 'world' in r.text r = b.location(HTTPBIN + 'headers', headers={'User-Agent': 'Web Out of Browsers'}) assert 'Web Out of Browsers' in r.text assert 'Firefox' not in r.text def test_referrer(): """ Test automatic referrer setting """ b = BaseBrowser() r = b.location(HTTPBIN + 'get') assert 'Referer' not in json.loads(r.text)['headers'] r = b.location(HTTPBIN + 'headers') assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'get' r = b.location(HTTPBIN + 'headers') assert 'Referer' not in json.loads(r.text)['headers'] # Force another referrer r = b.location(HTTPBIN + 'get') r = b.location(HTTPBIN + 'headers', referrer='http://example.com/') assert json.loads(r.text)['headers'].get('Referer') == 'http://example.com/' # Force no referrer r = b.location(HTTPBIN + 'get') r = b.location(HTTPBIN + 'headers', referrer=False) assert 'Referer' not in json.loads(r.text)['headers'] assert b.get_referrer('https://example.com/', 'http://example.com/') is None def test_cookiepolicy(): """ Test cookie parsing and processing """ policy = CookiePolicy() def bc(data): """ build one cookie, and normalize it """ cs = Cookies() cs.parse_response(data) for c in cs.itervalues(): policy.normalize_cookie(c, 'http://example.com/') return c # parse max-age assert bc('__bwid=58244366; max-age=42; path=/').expires # security for received cookies assert policy.can_set(bc('k=v; domain=www.example.com'), 'http://www.example.com/') assert policy.can_set(bc('k=v; domain=sub.example.com'), 'http://www.example.com/') assert policy.can_set(bc('k=v; domain=sub.example.com'), 'http://example.com/') assert policy.can_set(bc('k=v; domain=.example.com'), 'http://example.com/') assert policy.can_set(bc('k=v; domain=www.example.com'), 'http://example.com/') assert not policy.can_set(bc('k=v; domain=example.com'), 'http://example.net/') assert not policy.can_set(bc('k=v; domain=.net'), 'http://example.net/') assert not policy.can_set(bc('k=v; domain=www.example.net'), 'http://www.example.com/') assert not policy.can_set(bc('k=v; domain=wwwexample.com'), 'http://example.com/') assert not policy.can_set(bc('k=v; domain=.example.com'), 'http://wwwexample.com/') # pattern matching domains assert not policy.domain_match('example.com', 's.example.com') assert policy.domain_match('.example.com', 's.example.com') assert not policy.domain_match('.example.com', 'example.com') # yep. assert policy.domain_match('s.example.com', 's.example.com') assert not policy.domain_match('s.example.com', 's2.example.com') assert policy.domain_match_list(True, 'example.com') assert not policy.domain_match_list([], 'example.com') assert policy.domain_match_list(['example.net', 'example.com'], 'example.com') assert not policy.domain_match_list(['example.net', 'example.org'], 'example.com') def test_cookiejar(): """ Test adding, removing, finding cookies to and from the jar """ def bc(data): """ build one cookie """ cs = Cookies() cs.parse_response(data) for c in cs.itervalues(): return c # filtering cookies cookie0 = bc('j=v; domain=www.example.com; path=/') cookie1 = bc('k=v1; domain=www.example.com; path=/; secure') cookie2 = bc('k=v2; domain=.example.com; path=/') cookie3 = bc('k=v3; domain=www.example.com; path=/lol/cat/') cookie4 = bc('k=v4; domain=www.example.com; path=/lol/') cj = CookieJar(CookiePolicy()) cj.set(cookie0) cj.set(cookie1) cj.set(cookie2) cj.set(cookie3) cj.set(cookie4) assert len(cj.all()) == 5 # all cookies assert len(cj.all(path='/')) == 3 # all cookies except the ones with deep paths assert len(cj.all(name='k')) == 4 # this excludes cookie0 assert len(cj.all(domain='example.com')) == 0 # yep assert len(cj.all(domain='s.example.com')) == 1 # cookie2 assert len(cj.all(domain='.example.com')) == 1 # cookie2 (exact match) assert len(cj.all(domain='www.example.com')) == 5 # all cookies assert len(cj.all(domain='www.example.com', path="/lol/")) == 4 # all + cookie4 assert len(cj.all(domain='www.example.com', path="/lol/cat")) == 4 # all + cookie4 assert len(cj.all(domain='www.example.com', path="/lol/cat/")) == 5 # all + cookie4 + cookie3 assert len(cj.all(secure=True)) == 1 # cookie1 assert len(cj.all(secure=False)) == 4 # all except cookie1 assert cj.get(domain='www.example.com', path="/lol/") is cookie4 assert cj.get(domain='www.example.com', path="/lol/cat/") is cookie3 assert cj.get(domain='www.example.com', path="/") is cookie1 assert cj.get(name='j', domain='www.example.com', path="/") is cookie0 assert cj.get(name='k', domain='www.example.com', path="/") is cookie1 assert cj.get(name='k', domain='s.example.com', path="/") is cookie2 assert cj.get(name='k', domain='www.example.com', path="/aaa") is cookie1 assert cj.get(domain='www.example.com', path='/') is cookie1 assert cj.get(domain='www.example.com', path='/', secure=False) is cookie0 assert cj.get(domain='www.example.com', path='/', secure=True) is cookie1 # this is just not API choice, but how browsers act assert cj.for_request('http://www.example.com/') == {'k': 'v2', 'j': 'v'} assert cj.for_request('https://www.example.com/') == {'k': 'v1', 'j': 'v'} assert cj.for_request('http://www.example.com/lol/') == {'k': 'v4', 'j': 'v'} assert cj.for_request('http://s.example.com/lol/') == {'k': 'v2'} assert cj.for_request('http://example.com/lol/') == {} # remove/add/replace assert cj.remove(cookie1) is True assert cj.get(secure=True) is None cj.set(cookie1) assert cj.get(secure=True) is cookie1 cookie5 = bc('k=w; domain=www.example.com; path=/; secure') cj.set(cookie5) assert cj.get(secure=True) is cookie5 assert len(cj.all(secure=True)) == 1 # not the same cookie, but the same identifiers assert cj.remove(cookie1) is True cj.clear() cookie6 = bc('e1=1; domain=www.example.com; path=/; Expires=Thu, 01 Jan 1970 00:00:01 GMT;') cookie7 = bc('e2=1; domain=www.example.com; path=/; Expires=Thu, 01 Jan 2010 00:00:01 GMT;') now = datetime(2000, 01, 01) cj.set(cookie0) cj.set(cookie6) cj.set(cookie7) assert cj.for_request('http://www.example.com/', now) == {'e2': '1', 'j': 'v'} assert cj.for_request('http://www.example.com/', datetime(2020, 01, 01)) == {'j': 'v'} assert len(cj.all()) == 3 cj.flush(now) assert len(cj.all()) == 2 assert cj.remove(cookie6) is False # already removed cj.flush(now, session=True) assert len(cj.all()) == 1 def test_buildcookie(): """ Test easy cookie building """ cj = CookieJar(CookiePolicy()) c = cj.build('kk', 'vv', 'http://example.com/') assert c.domain == 'example.com' assert not c.secure assert c.path == '/' c = cj.build('kk', 'vv', 'http://example.com/', path='/plop/', wildcard=True) assert c.domain == '.example.com' assert c.path == '/plop/' c = cj.build('kk', 'vv', 'http://example.com/plop/') assert c.path == '/plop/' c = cj.build('kk', 'vv', 'http://example.com/plop/plap') assert c.path == '/plop/' c = cj.build('kk', 'vv', 'http://example.com/plop/?http://example.net/plip/') assert c.path == '/plop/' assert c.domain == 'example.com' c = cj.build('kk', 'vv', 'http://example.com/plop/plap', path='/') assert c.path == '/' c = cj.build('kk', 'vv', 'https://example.com/') assert c.domain == 'example.com' assert c.secure # check the cookie works c.name = 'k' c.value = 'v' cj.set(c) assert cj.for_request('https://example.com/') == {'k': 'v'} assert cj.for_request('http://example.com/') == {} def test_cookienav(): """ Test browsing while getting new cookies """ b = BaseBrowser() r = b.location(HTTPBIN + 'cookies') assert len(json.loads(r.text)['cookies']) == 0 r = b.location(HTTPBIN + 'cookies/set/hello/world') assert len(json.loads(r.text)['cookies']) == 1 assert json.loads(r.text)['cookies']['hello'] == 'world' r = b.location(HTTPBIN + 'cookies/set/hello2/world2') assert len(json.loads(r.text)['cookies']) == 2 assert json.loads(r.text)['cookies']['hello2'] == 'world2' r = b.location(REQUESTBIN) assert 'session' in r.cookies # requestbin should give this by default assert 'hello' not in r.cookies # we didn't send the wrong cookie # return to httpbin, check we didn't give the wrong cookie r = b.location(HTTPBIN + 'cookies') assert 'session' not in json.loads(r.text)['cookies'] # override cookies temporarily r = b.location(HTTPBIN + 'cookies', cookies={'bla': 'bli'}) assert len(json.loads(r.text)['cookies']) == 1 assert json.loads(r.text)['cookies']['bla'] == 'bli' # reload, the "fake" cookie should not be there r = b.location(HTTPBIN + 'cookies') assert len(json.loads(r.text)['cookies']) == 2 assert 'bla' not in json.loads(r.text)['cookies'] def test_cookieredirect(): """ Test cookie redirection security """ rurl = choice(REDIRECTS2) b = BaseBrowser() r = b.location(HTTPBIN + 'cookies') assert len(json.loads(r.text)['cookies']) == 0 # add a cookie to the redirection service domain (not the target!) cookie = b.cookies.build('k', 'v1', rurl) b.cookies.set(cookie) r = b.location(rurl) assert r.url == HTTPBIN + 'cookies' # the cookie was not forwarded; it's for another domain # this is important for security reasons, # and because python-requests tries to do it by default! assert len(json.loads(r.text)['cookies']) == 0 # add a cookie for the target cookie = b.cookies.build('k', 'v2', HTTPBIN) b.cookies.set(cookie) r = b.location(rurl) assert r.url == HTTPBIN + 'cookies' assert len(json.loads(r.text)['cookies']) == 1 assert json.loads(r.text)['cookies']['k'] == 'v2' # check all cookies sent in the request chain assert r.cookies == {'k': 'v2'} assert r.history[0].cookies['k'] == 'v1' # some services add other cookies def test_cookie_srv1(): """ Test cookie in real conditions (service 1) """ class TestBrowser(DomainBrowser): BASEURL = 'http://www.mria-arim.ca/' b = TestBrowser() b.location('testCookies.asp') # TODO this is also a good place to test form parsing/submission b.location('testCookies.asp', {'makeMe': 'Create Cookie'}) r = b.location('testCookies.asp', {'testMe': 'Test Browser'}) assert 'Your Browser accepts cookies' in r.text def test_cookie_srv2(): """ Test cookie in real conditions (service 2) """ def randtext(): return ''.join(choice(string.digits + string.letters) for _ in xrange(32)) class TestBrowser(DomainBrowser): BASEURL = 'http://www.html-kit.com/tools/cookietester/' def cookienum(self): return int(re.search('Number of cookies received: (\d+)', self.response.text).groups()[0]) def mypost(self, **data): return self.location('', data) b = TestBrowser() b.home() assert b.cookienum() == 0 r1 = randtext() r1v = randtext() # TODO this is also a good place to test form parsing/submission # get a new cookie r = b.mypost(cn=r1, cv=r1v) assert b.cookienum() == 1 assert r1 in r.text assert r1v in r.text # cookie deletion r = b.mypost(cr=r1) assert b.cookienum() == 0 assert r1 not in r.text assert r1v not in r.text # om nom nom b.mypost(cn=randtext(), cv=randtext()) b.mypost(cn=randtext(), cv=randtext()) b.mypost(cn=randtext(), cv=randtext()) b.mypost(cn=randtext(), cv=randtext()) assert b.cookienum() == 4