weboob-devel/weboob/tools/browser2/test.py

# -*- coding: utf-8 -*-

# Copyright(C) 2012 Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from __future__ import absolute_import

from datetime import datetime

from requests import HTTPError
from nose.plugins.skip import SkipTest

from .browser import BaseBrowser, DomainBrowser, Weboob
from .cookiejar import CookieJar
from .cookies import Cookies

from weboob.tools.json import json

# Base URLs of the remote services the tests below talk to. Each one ends
# with a slash because the tests append paths to it directly.
# These services can also be run locally, more or less (see the project
# repository linked next to each URL).
HTTPBIN = 'http://httpbin.org/'  # https://github.com/kennethreitz/httpbin
POSTBIN = 'http://www.postbin.org/'  # https://github.com/progrium/postbin
REQUESTBIN = 'http://requestb.in/'  # https://github.com/progrium/requestbin


def test_base():
    """
    Check a basic request made with the default browser settings
    """
    browser = BaseBrowser()
    headers_url = HTTPBIN + 'headers'

    response = browser.location(headers_url)
    body = response.text
    # the body must already be decoded text, not raw bytes
    assert isinstance(body, unicode)
    assert 'Firefox' in body
    # NOTE(review): presumably these are values the underlying HTTP library
    # would send by default (User-Agent / Accept-Encoding) -- confirm
    for unwanted in ('python', 'identity'):
        assert unwanted not in body
    # the browser remembers the page it is on
    assert browser.url == headers_url

    gzipped = browser.location(HTTPBIN + 'gzip')
    assert 'Firefox' in gzipped.text


def test_redirects():
    """
    Check redirects are followed

    Also checks which Referer header is sent on each hop of a redirect
    chain, and the effect of the ``referrer`` and ``allow_redirects``
    arguments of ``location()``.
    """
    b = BaseBrowser()
    b.location(HTTPBIN + 'redirect/1')
    assert b.url == HTTPBIN + 'get'

    # The final page sees the redirecting URL as Referer
    r = b.location(HTTPBIN + 'redirect/1')
    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
    assert r.url == HTTPBIN + 'get'

    # Normal redirect chain
    # Clear the current URL first; the first request of the chain is then
    # expected to carry no Referer (asserted on history[0] below)
    b.url = None
    r = b.location(HTTPBIN + 'redirect/4')
    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
    assert len(r.history) == 4
    # each hop must have the previous hop as Referer
    assert r.history[3].request.url == HTTPBIN + 'redirect/1'
    assert r.history[3].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
    assert r.history[2].request.url == HTTPBIN + 'redirect/2'
    assert r.history[2].request.headers.get('Referer') == HTTPBIN + 'redirect/3'
    assert r.history[1].request.url == HTTPBIN + 'redirect/3'
    assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/4'
    assert r.history[0].request.url == HTTPBIN + 'redirect/4'
    assert r.history[0].request.headers.get('Referer') is None
    assert r.url == HTTPBIN + 'get'

    # Disable all referers
    r = b.location(HTTPBIN + 'redirect/2', referrer=False)
    assert json.loads(r.text)['headers'].get('Referer') is None
    assert len(r.history) == 2
    assert r.history[1].request.headers.get('Referer') is None
    assert r.history[0].request.headers.get('Referer') is None
    assert r.url == HTTPBIN + 'get'

    # Only overrides first referer
    r = b.location(HTTPBIN + 'redirect/2', referrer='http://example.com/')
    assert json.loads(r.text)['headers'].get('Referer') == HTTPBIN + 'redirect/1'
    assert len(r.history) == 2
    assert r.history[1].request.headers.get('Referer') == HTTPBIN + 'redirect/2'
    assert r.history[0].request.headers.get('Referer') == 'http://example.com/'
    assert r.url == HTTPBIN + 'get'

    # Don't follow
    r = b.location(HTTPBIN + 'redirect/2', allow_redirects=False)
    assert len(r.history) == 0
    assert r.url == HTTPBIN + 'redirect/2'
    assert r.status_code == 302


def test_brokenpost():
    """
    Tests _fix_redirect()

    This test is currently disabled: it raises SkipTest unconditionally
    before doing anything else.
    """
    raise SkipTest('PostBin is disabled')
    # NOTE: everything below is unreachable as long as the SkipTest above is
    # raised; presumably kept so the test can be re-enabled later.
    try:
        b = BaseBrowser()
        # postbin is picky with empty posts. that's good!
        r = b.location(POSTBIN, {})
        # ensures empty data (but not None) does a POST
        assert r.request.method == 'POST'
        # ensure we were redirected after submitting a post
        assert len(r.url) >= len(POSTBIN)
        # send a POST with data
        b.location(r.url, {'hello': 'world'})
        r = b.location(r.url + '/feed')
        assert 'hello' in r.text
        assert 'world' in r.text
    except HTTPError as e:
        # a 503 is reported as a skipped test rather than a failure
        if str(e).startswith('503 '):
            raise SkipTest('Quota exceeded')
        raise


def _getrqbin(b):
    """
    Create a RequestBin and return its name

    b is the browser used to make the request.
    """
    # empty POST
    response = b.location(REQUESTBIN + 'api/v1/bins', '')
    payload = json.loads(response.text)
    bin_name = payload['name']
    # an empty or missing name means the bin cannot be used
    assert bin_name
    return bin_name


def test_smartpost():
    """
    Check that a request without data is a GET and one with data is a POST
    """
    browser = BaseBrowser()
    bin_name = _getrqbin(browser)
    bin_url = REQUESTBIN + bin_name
    inspect_url = bin_url + '?inspect'

    # no data given: the bin must have recorded a GET
    reply = browser.location(bin_url)
    assert 'ok' in reply.text
    recorded = browser.location(inspect_url)
    assert 'GET /%s' % bin_name in recorded.text

    # data given: the bin must have recorded a POST with that data
    reply = browser.location(bin_url, {'hello': 'world'})
    assert 'ok' in reply.text
    recorded = browser.location(inspect_url)
    assert 'POST /%s' % bin_name in recorded.text
    for word in ('hello', 'world'):
        assert word in recorded.text


def test_weboob():
    """
    Check what is sent when the browser uses the Weboob profile
    """
    class WeboobProfileBrowser(BaseBrowser):
        # replace the default profile with the Weboob one, version '0.0'
        PROFILE = Weboob('0.0')

    browser = WeboobProfileBrowser()
    response = browser.location(HTTPBIN + 'headers')
    body = response.text
    assert 'weboob/0.0' in body
    assert 'identity' in body


def test_relative():
    """
    Check how DomainBrowser resolves relative URLs
    """
    browser = DomainBrowser()
    httpbin_ip = HTTPBIN + 'ip'
    requestbin_ip = REQUESTBIN + 'ip'

    # a relative location is resolved against the page we are on
    browser.location(HTTPBIN)
    browser.location('/ip')
    assert browser.url == httpbin_ip

    # this test has not assigned BASEURL yet: absurl() follows the
    # current page
    assert browser.absurl('/ip') == httpbin_ip
    browser.location(REQUESTBIN)
    assert browser.absurl('/ip') == requestbin_ip

    # with BASEURL assigned, it is the base even though the current page
    # is on another domain...
    browser.BASEURL = HTTPBIN + 'aaaaaa'
    assert browser.absurl('/ip') == httpbin_ip
    assert browser.absurl('ip') == httpbin_ip
    # ...unless False is passed as second argument
    assert browser.absurl('/ip', False) == requestbin_ip

    # BASEURL with a trailing slash: (relative URL, expected absolute URL)
    browser.BASEURL = HTTPBIN + 'aaaaaa/'
    expectations = (
        ('/', HTTPBIN),
        ('/bb', HTTPBIN + 'bb'),
        ('', HTTPBIN + 'aaaaaa/'),
        ('bb', HTTPBIN + 'aaaaaa/bb'),
    )
    for relative, absolute in expectations:
        assert browser.absurl(relative) == absolute


def test_changereq():
    """
    Check that per-request arguments override the request defaults
    """
    browser = BaseBrowser()

    # overriding the HTTP method
    head_response = browser.location(HTTPBIN + 'headers', method='HEAD')
    assert head_response.text is None

    # overriding the method and sending data at the same time
    put_response = browser.location(HTTPBIN + 'put', method='PUT',
                                    data={'hello': 'world'})
    for word in ('hello', 'world'):
        assert word in put_response.text

    # overriding a header that the profile normally sets
    custom_headers = {'User-Agent': 'Web Out of Browsers'}
    ua_response = browser.location(HTTPBIN + 'headers', headers=custom_headers)
    assert 'Web Out of Browsers' in ua_response.text
    assert 'Firefox' not in ua_response.text


def test_referrer():
    """
    Check the Referer header sent by default and when overridden
    """
    browser = BaseBrowser()
    get_url = HTTPBIN + 'get'
    headers_url = HTTPBIN + 'headers'

    def sent_headers(response):
        """
        extract the 'headers' mapping from the JSON response body
        """
        return json.loads(response.text)['headers']

    # first page: nothing to refer to
    response = browser.location(get_url)
    assert 'Referer' not in sent_headers(response)
    # second page: the previous page is the referrer
    response = browser.location(headers_url)
    assert sent_headers(response).get('Referer') == get_url
    # requesting the page we are already on: no referrer
    response = browser.location(headers_url)
    assert 'Referer' not in sent_headers(response)

    # Force another referrer
    browser.location(get_url)
    response = browser.location(headers_url, referrer='http://example.com/')
    assert sent_headers(response).get('Referer') == 'http://example.com/'

    # Force no referrer
    browser.location(get_url)
    response = browser.location(headers_url, referrer=False)
    assert 'Referer' not in sent_headers(response)

    # no referrer is computed for this https/http pair of URLs
    computed = browser._get_referrer('https://example.com/', 'http://example.com/')
    assert computed is None


def test_cookieparse():
    """
    Check cookie parsing and the domain rules of CookieJar
    """
    jar = CookieJar()

    def bc(data):
        """
        build one cookie, and normalize it
        """
        parsed = Cookies()
        parsed.parse_response(data)
        # only the first parsed cookie is used; None if nothing was parsed
        for cookie in parsed.itervalues():
            jar._normalize_cookie(cookie, 'http://example.com/')
            return cookie

    # parse max-age
    max_age_cookie = bc('__bwid=58244366; max-age=42; path=/')
    assert max_age_cookie.expires

    # security for received cookies
    # each entry is (cookie data, URL passed to _can_set)
    accepted = (
        ('k=v; domain=www.example.com', 'http://www.example.com/'),
        ('k=v; domain=sub.example.com', 'http://www.example.com/'),
        ('k=v; domain=sub.example.com', 'http://example.com/'),
        ('k=v; domain=.example.com', 'http://example.com/'),
        ('k=v; domain=www.example.com', 'http://example.com/'),
    )
    for data, url in accepted:
        assert jar._can_set(bc(data), url)

    rejected = (
        ('k=v; domain=example.com', 'http://example.net/'),
        ('k=v; domain=.net', 'http://example.net/'),
        ('k=v; domain=www.example.net', 'http://www.example.com/'),
        ('k=v; domain=wwwexample.com', 'http://example.com/'),
        ('k=v; domain=.example.com', 'http://wwwexample.com/'),
    )
    for data, url in rejected:
        assert not jar._can_set(bc(data), url)

    # pattern matching domains
    # each entry is (pattern, domain, whether a match is expected)
    domain_cases = (
        ('example.com', 's.example.com', False),
        ('.example.com', 's.example.com', True),
        ('.example.com', 'example.com', False),  # yep.
        ('s.example.com', 's.example.com', True),
        ('s.example.com', 's2.example.com', False),
    )
    for pattern, domain, expected in domain_cases:
        assert bool(jar._domain_match(pattern, domain)) is expected

    # each entry is (pattern list, whether 'example.com' should match)
    list_cases = (
        (True, True),
        ([], False),
        (['example.net', 'example.com'], True),
        (['example.net', 'example.org'], False),
    )
    for patterns, expected in list_cases:
        assert bool(jar._domain_match_list(patterns, 'example.com')) is expected


def test_cookiejar():
    """
    Test storing, filtering, selecting, replacing and expiring cookies
    in a CookieJar
    """
    def bc(data):
        """
        build one cookie
        """
        cs = Cookies()
        cs.parse_response(data)
        # only the first parsed cookie is used; None if nothing was parsed
        for c in cs.itervalues():
            return c

    # filtering cookies
    cookie0 = bc('j=v; domain=www.example.com; path=/')
    cookie1 = bc('k=v1; domain=www.example.com; path=/; secure')
    cookie2 = bc('k=v2; domain=.example.com; path=/')
    cookie3 = bc('k=v3; domain=www.example.com; path=/lol/cat/')
    cookie4 = bc('k=v4; domain=www.example.com; path=/lol/')

    cj = CookieJar()
    cj.set(cookie0)
    cj.set(cookie1)
    cj.set(cookie2)
    cj.set(cookie3)
    cj.set(cookie4)

    assert len(cj.all()) == 5  # all cookies
    assert len(cj.all(path='/')) == 3  # all cookies except the ones with deep paths
    assert len(cj.all(name='k')) == 4  # this excludes cookie0
    assert len(cj.all(domain='example.com')) == 0  # yep
    assert len(cj.all(domain='s.example.com')) == 1  # cookie2
    assert len(cj.all(domain='.example.com')) == 1  # cookie2 (exact match)
    assert len(cj.all(domain='www.example.com')) == 5  # all cookies
    assert len(cj.all(domain='www.example.com', path="/lol/")) == 4  # root-path cookies + cookie4
    assert len(cj.all(domain='www.example.com', path="/lol/cat")) == 4  # root-path cookies + cookie4
    assert len(cj.all(domain='www.example.com', path="/lol/cat/")) == 5  # root-path cookies + cookie4 + cookie3
    assert len(cj.all(secure=True)) == 1  # cookie1
    assert len(cj.all(secure=False)) == 4  # all except cookie1

    # get() returns a single cookie among the matching ones
    assert cj.get(domain='www.example.com', path="/lol/") is cookie4
    assert cj.get(domain='www.example.com', path="/lol/cat/") is cookie3
    assert cj.get(domain='www.example.com', path="/") is cookie1
    assert cj.get(name='j', domain='www.example.com', path="/") is cookie0
    assert cj.get(name='k', domain='www.example.com', path="/") is cookie1
    assert cj.get(name='k', domain='s.example.com', path="/") is cookie2
    assert cj.get(name='k', domain='www.example.com', path="/aaa") is cookie1
    assert cj.get(domain='www.example.com', path='/') is cookie1
    assert cj.get(domain='www.example.com', path='/', secure=False) is cookie0
    assert cj.get(domain='www.example.com', path='/', secure=True) is cookie1

    # this is not just an API choice, but how browsers act
    assert cj.for_request('http://www.example.com/') == {'k': 'v2', 'j': 'v'}
    assert cj.for_request('https://www.example.com/') == {'k': 'v1', 'j': 'v'}
    assert cj.for_request('http://www.example.com/lol/') == {'k': 'v4', 'j': 'v'}
    assert cj.for_request('http://s.example.com/lol/') == {'k': 'v2'}
    assert cj.for_request('http://example.com/lol/') == {}

    # remove/add/replace
    assert cj.remove(cookie1) is True
    assert cj.get(secure=True) is None
    cj.set(cookie1)
    assert cj.get(secure=True) is cookie1
    cookie5 = bc('k=w; domain=www.example.com; path=/; secure')
    cj.set(cookie5)
    # cookie5 has the same name/domain/path as cookie1, so it replaces it
    assert cj.get(secure=True) is cookie5
    assert len(cj.all(secure=True)) == 1
    # not the same cookie, but the same identifiers
    assert cj.remove(cookie1) is True

    # expiration: cookie6 expires before `now`, cookie7 after it
    cj.clear()
    cookie6 = bc('e1=1; domain=www.example.com; path=/; Expires=Thu, 01 Jan 1970 00:00:01 GMT;')
    cookie7 = bc('e2=1; domain=www.example.com; path=/; Expires=Thu, 01 Jan 2010 00:00:01 GMT;')
    # plain decimal literals: zero-padded ones are octal notation in
    # Python 2 and a syntax error in Python 3
    now = datetime(2000, 1, 1)
    cj.set(cookie0)
    cj.set(cookie6)
    cj.set(cookie7)

    assert cj.for_request('http://www.example.com/', now) == {'e2': '1', 'j': 'v'}
    assert cj.for_request('http://www.example.com/', datetime(2020, 1, 1)) == {'j': 'v'}

    assert len(cj.all()) == 3
    cj.flush(now)
    assert len(cj.all()) == 2
    assert cj.remove(cookie6) is False  # already removed
    # NOTE(review): presumably this drops cookie0, the only cookie without
    # an expiration date -- confirm against CookieJar.flush()
    cj.flush(now, session=True)
    assert len(cj.all()) == 1


def test_buildcookie():
    """
    Test cookie building
    """
    cj = CookieJar()

    # defaults are taken from the URL
    c = cj.build('kk', 'vv', 'http://example.com/')
    assert c.domain == 'example.com'
    assert not c.secure
    assert c.path == '/'

    # explicit path, and wildcard domain
    c = cj.build('kk', 'vv', 'http://example.com/', path='/plop/', wildcard=True)
    assert c.domain == '.example.com'
    assert c.path == '/plop/'

    # path deduced from the URL
    c = cj.build('kk', 'vv', 'http://example.com/plop/')
    assert c.path == '/plop/'
    c = cj.build('kk', 'vv', 'http://example.com/plop/plap')
    assert c.path == '/plop/'
    # a URL inside the query string must not affect the path or the domain
    c = cj.build('kk', 'vv', 'http://example.com/plop/?http://example.net/plip/')
    assert c.path == '/plop/'
    assert c.domain == 'example.com'
    # an explicit path wins over the one in the URL
    c = cj.build('kk', 'vv', 'http://example.com/plop/plap', path='/')
    assert c.path == '/'

    # cookies built from an https URL are secure
    c = cj.build('kk', 'vv', 'https://example.com/')
    assert c.domain == 'example.com'
    assert c.secure

    # check the cookie works
    c.name = 'k'
    c.value = 'v'
    cj.set(c)
    assert cj.for_request('https://example.com/') == {'k': 'v'}
    assert cj.for_request('http://example.com/') == {}


def test_cookienav():
    """
    Check cookies are stored and sent correctly while browsing
    """
    browser = BaseBrowser()
    cookies_url = HTTPBIN + 'cookies'

    def reported(response):
        """
        extract the 'cookies' mapping from the JSON response body
        """
        return json.loads(response.text)['cookies']

    # nothing stored yet
    seen = reported(browser.location(cookies_url))
    assert len(seen) == 0

    # cookies accumulate as the server sets them
    seen = reported(browser.location(HTTPBIN + 'cookies/set/hello/world'))
    assert len(seen) == 1
    assert seen['hello'] == 'world'
    seen = reported(browser.location(HTTPBIN + 'cookies/set/hello2/world2'))
    assert len(seen) == 2
    assert seen['hello2'] == 'world2'

    other_site = browser.location(REQUESTBIN)
    assert 'session' in other_site.cookies  # requestbin should give this by default
    assert 'hello' not in other_site.cookies  # we didn't send the wrong cookie
    # return to httpbin, check we didn't give the wrong cookie
    seen = reported(browser.location(cookies_url))
    assert 'session' not in seen

    # override cookies temporarily
    seen = reported(browser.location(cookies_url, cookies={'bla': 'bli'}))
    assert len(seen) == 1
    assert seen['bla'] == 'bli'
    # reload, the "fake" cookie should not be there
    seen = reported(browser.location(cookies_url))
    assert len(seen) == 2
    assert 'bla' not in seen