browser2: Add CookieJar and related cookie handling
Not used by Browser for now, but with very detailed tests. Does not handle expirations yet.
This commit is contained in:
parent
cd9b9300bd
commit
295e07b3ed
2 changed files with 453 additions and 0 deletions
333
weboob/tools/browser2/cookiejar.py
Normal file
333
weboob/tools/browser2/cookiejar.py
Normal file
|
|
@ -0,0 +1,333 @@
|
||||||
|
# TODO declare __all__
|
||||||
|
# TODO support logging
|
||||||
|
|
||||||
|
import urlparse
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from .cookies import Cookie, Cookies, strip_spaces_and_quotes, Definitions
|
||||||
|
|
||||||
|
|
||||||
|
def valid_domain(domain):
    """
    Validate a cookie domain, tolerating one leading period.

    Like cookies.valid_domain, but allows leading periods,
    because it is *very* common and useful for us.

    :param domain: domain to check
    :rtype: bool
    :raises UnicodeError: if the domain cannot be encoded as ASCII
    """
    # Only called for its side effect: raise early on non-ASCII input.
    domain.encode('ascii')

    # Drop a single leading period before applying the library's pattern.
    bare = domain[1:] if domain.startswith('.') else domain

    # A quote right at the start of the (stripped) domain is never accepted.
    if bare.startswith('"'):
        return False

    return bool(Definitions.DOMAIN_RE.match(bare))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_domain(value):
    """
    Parse an incoming Domain attribute value.

    Like cookies.parse_domain, but allows leading periods,
    because it is *very* common and useful for us.

    The value is stripped with strip_spaces_and_quotes(); a non-empty
    result must pass valid_domain(), otherwise the assertion fails.

    :param value: raw attribute value
    :return: the stripped value
    """
    cleaned = strip_spaces_and_quotes(value)
    if not cleaned:
        # nothing to validate
        return cleaned
    assert valid_domain(cleaned)
    return cleaned
|
||||||
|
|
||||||
|
# Install our own 'domain' handling on the Cookie class, so that the
# parse_domain() and valid_domain() defined in this module (which accept
# a leading period) are the ones registered for that attribute.
# This is ok because we are using our own copy of the lib.
# TODO push a better way upstream
Cookie.attribute_parsers['domain'] = parse_domain
Cookie.attribute_validators['domain'] = valid_domain
|
||||||
|
|
||||||
|
|
||||||
|
class CookieJar(object):
    """
    Manage Cookies like a real browser, with security and privacy in mind.

    Cookies are stored in ``self.cookies`` as nested dicts:
    ``{domain: {path: {name: cookie}}}``.
    """

    ACCEPT_DOMAINS = []
    """
    Domains where to accept cookies, even when we should not.
    Add a "." before a domain to accept subdomains.
    If True, accept all cookies (a bit insecure).
    ACCEPT_DOMAINS has higher priority over REJECT_DOMAINS.

    Disabling third-party cookies on most browsers acts like [], enabling them
    acts like True. Since it is a very common browser option, we use the most
    secure and privacy-aware method by default.
    """

    REJECT_DOMAINS = []
    """
    Domains where to reject cookies, even when we should not.
    Add a "." before a domain to reject subdomains.
    If True, reject all cookies.
    REJECT_DOMAINS has lower priority over ACCEPT_DOMAINS.
    """

    SECURE_DOMAINS = True
    """
    When we get a cookie through a secure connection, mark it as secure
    (not to be sent on insecure channels) if the server did not tell us to.
    If True, do it automatically for all domains. Alternatively, you can put
    a list of domains, like ACCEPT_DOMAINS or REJECT_DOMAINS.
    If False, never do it (but still accept secure cookies as they are).

    NoScript for Firefox does this, either by automated guesses or forced from a list.
    """

    INSECURE_MATCHING = True
    """
    Do sloppy matching to mimic what browsers do.
    This is only for setting cookies; it should be relatively safe in Weboob.
    """

    def __init__(self):
        # {domain: {path: {name: cookie}}}
        self.cookies = dict()

    def _domain_match(self, pattern, domain):
        """
        Checks a domain matches a domain pattern.
        Patterns can be either the exact domain, or a wildcard (starting with a dot).

        example.com matches example.com only
        .example.com matches *.example.com (but not example.com)

        :param pattern: str
        :param domain: str
        :rtype: bool
        """
        if pattern.startswith('.'):
            return domain.endswith(pattern)
        return domain == pattern

    def _domain_match_list(self, patterns, domain):
        """
        Checks domains match, from a list of patterns.
        If the list of patterns is True, it always matches.
        If it is False or empty, it never matches.

        :param patterns: list, True or False
        :param domain: str
        :rtype: bool
        """
        if patterns is True:
            return True
        if not patterns:
            # False is a documented value for SECURE_DOMAINS; iterating over
            # it would raise TypeError.
            return False
        return any(self._domain_match(pattern, domain) for pattern in patterns)

    def _can_set(self, cookie, url):
        """
        Checks an URL can set a particular cookie.
        See ACCEPT_DOMAINS, REJECT_DOMAINS to set exceptions.

        The cookie must have a domain and a path already set, you can
        use _normalize_cookie() for that.

        :param cookie: The cookie the server set
        :type cookie: Cookie
        :param url: URL of the response
        :type url: str

        :rtype: bool
        """
        url = urlparse.urlparse(url)
        domain = url.hostname

        # Accept/reject overrides
        if self._domain_match_list(self.ACCEPT_DOMAINS, domain):
            return True
        if self._domain_match_list(self.REJECT_DOMAINS, domain):
            return False

        # check path; an URL without any path is equivalent to "/"
        if not (url.path or '/').startswith(cookie.path):
            return False

        # check domain (secure & simple)
        if cookie.domain.startswith('.'):
            # Compare on a label boundary: "example.com" may set
            # ".example.com", but must not be able to set ".evilexample.com".
            if cookie.domain.endswith('.%s' % domain):
                return True
        elif domain == cookie.domain:
            return True

        # whatever.example.com should be able to set .example.com
        # Unbelievably stupid, but widely used.
        #
        # Our method is not ideal, as it isn't very secure for some TLDs.
        # A solution could be to use tldextract.
        if self.INSECURE_MATCHING:
            if domain.split('.')[-2:] == cookie.domain.split('.')[-2:]:
                return True

        return False

    def _normalize_cookie(self, cookie, url):
        """
        Update a cookie we got from the response.
        The goal is to have data relevant for use in future requests.
        * Sets domain if there is not one.
        * Sets path if there is not one.
        * Set Expires from Max-Age. We need the expires to have an absolute expiration date.
        * Force the Secure flag if required. (see SECURE_DOMAINS)

        The cookie is modified in place; nothing is returned.

        :param cookie: The cookie the server set
        :type cookie: Cookie
        :param url: URL of the response
        :type url: str
        """
        url = urlparse.urlparse(url)
        if cookie.domain is None:
            cookie.domain = url.hostname
        if cookie.path is None:
            cookie.path = '/'
        if cookie.max_age is not None:
            # NOTE(review): this is local time; confirm whether the cookies
            # library expects "expires" to be expressed in UTC.
            cookie.expires = datetime.now() + timedelta(seconds=cookie.max_age)
        if url.scheme == 'https' \
                and self._domain_match_list(self.SECURE_DOMAINS, cookie.domain):
            cookie.secure = True

    def from_response(self, response):
        """
        Import cookies from the response.

        Each cookie is normalized, then stored only if the response URL
        is allowed to set it (see _can_set()).

        :type response: responses.Response
        """
        if 'Set-Cookie' in response.headers:
            cs = Cookies.from_response(response.headers['Set-Cookie'], True)
            for c in cs.values():
                self._normalize_cookie(c, response.url)
                if self._can_set(c, response.url):
                    self.set(c)

    def for_request(self, url):
        """
        Get a key/value dictionary of cookies for a given request URL.

        When several cookies share a name, the most precise one wins
        (see all() for the ordering).

        :type url: str
        :rtype: dict
        """
        url = urlparse.urlparse(url)
        # we want insecure cookies in https too!
        secure = None if url.scheme == 'https' else False
        cdict = dict()
        # get sorted cookies; an URL without any path is equivalent to "/"
        cookies = self.all(domain=url.hostname, path=url.path or '/', secure=secure)
        for cookie in cookies:
            # update only if not set, since first cookies are "better"
            cdict.setdefault(cookie.name, cookie.value)
        return cdict

    def set(self, cookie):
        """
        Add or replace a Cookie in the jar.
        This is for normalized and checked cookies, no validation is done.
        Use from_response() to import cookies from a python-requests response.

        :type cookie: cookies.Cookie
        """
        # cookies are unique by domain, path and of course name
        assert len(cookie.domain)
        assert len(cookie.path)
        assert len(cookie.name)
        self.cookies.setdefault(cookie.domain, {}). \
            setdefault(cookie.path, {})[cookie.name] = cookie

    def iter(self, name=None, domain=None, path=None, secure=None):
        """
        Iterate matching cookies.
        You can restrict by name, domain, path or security.
        A restriction left to None matches everything.

        :type name: str
        :type domain: str
        :type path: str
        :type secure: bool

        :rtype: iter[:class:`cookies.Cookie`]
        """
        for cdomain, cpaths in self.cookies.items():
            # domain matches (all domains if None)
            if domain is not None and not self._domain_match(cdomain, domain):
                continue
            for cpath, cnames in cpaths.items():
                # path matches (all if None)
                if path is not None and not path.startswith(cpath):
                    continue
                for cname, cookie in cnames.items():
                    # only wanted name (all if None)
                    if name is not None and name != cname:
                        continue
                    # wanted security (all if None)
                    # cookie.secure can be "None" if not secure!
                    if secure is None \
                            or (secure is False and not cookie.secure) \
                            or (secure is True and cookie.secure):
                        yield cookie

    def all(self, name=None, domain=None, path=None, secure=None):
        """
        Like iter(), but sorts the cookies, from most precise to less precise.

        Precision is decided, in order, by: exact match of the requested
        domain, longest domain, longest path, then the secure flag.
        The domain criteria are only used when a domain is requested.

        :rtype: list[:class:`cookies.Cookie`]
        """
        def precision(cookie):
            # Tuples compare element by element, which gives the same
            # ordering as a pairwise comparison of the criteria above.
            if domain:
                domain_rank = (cookie.domain == domain, len(cookie.domain))
            else:
                domain_rank = (False, 0)
            return domain_rank + (len(cookie.path), bool(cookie.secure))

        return sorted(self.iter(name, domain, path, secure),
                      key=precision, reverse=True)

    def get(self, name=None, domain=None, path=None, secure=None):
        """
        Return the best cookie from all().
        Useful for changing the value or deleting a cookie.

        name, domain, path and secure are the same as iter().

        :rtype: :class:`cookies.Cookie` or None
        """
        cookies = self.all(name, domain, path, secure)
        if cookies:
            return cookies[0]
        return None

    def remove(self, cookie):
        """
        Remove a cookie. The cookie argument must have the same domain, path and name.
        Return False if not present, True if just removed.

        :type cookie: :class:`cookies.Cookie`
        :rtype: bool
        """
        # cookies are unique by domain, path and of course name
        assert len(cookie.domain)
        assert len(cookie.path)
        assert len(cookie.name)
        # Default to an empty dict at both levels, so an unknown domain or
        # path yields False instead of failing on "name in None".
        names = self.cookies.get(cookie.domain, {}).get(cookie.path, {})
        if cookie.name in names:
            del names[cookie.name]
            return True
        return False

    def clear(self):
        """
        Remove all cookies.
        """
        self.cookies.clear()
|
||||||
|
|
@ -23,6 +23,8 @@ import requests
|
||||||
from nose.plugins.skip import SkipTest
|
from nose.plugins.skip import SkipTest
|
||||||
|
|
||||||
from .browser import BaseBrowser, DomainBrowser, Weboob
|
from .browser import BaseBrowser, DomainBrowser, Weboob
|
||||||
|
from . import cookiejar
|
||||||
|
from .cookies import Cookies
|
||||||
|
|
||||||
from weboob.tools.json import json
|
from weboob.tools.json import json
|
||||||
|
|
||||||
|
|
@ -175,3 +177,121 @@ def test_referrer():
|
||||||
assert 'Referer' not in json.loads(r.text)['headers']
|
assert 'Referer' not in json.loads(r.text)['headers']
|
||||||
|
|
||||||
assert b._get_referrer('https://example.com/', 'http://example.com/') is None
|
assert b._get_referrer('https://example.com/', 'http://example.com/') is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_cookieparse():
    jar = cookiejar.CookieJar()

    def bc(data):
        """
        Parse a Set-Cookie value and return its first cookie, normalized
        as if received from http://example.com/ (None if nothing was parsed).
        """
        parsed = Cookies()
        parsed.parse_response(data)
        for parsed_cookie in parsed.itervalues():
            jar._normalize_cookie(parsed_cookie, 'http://example.com/')
            return parsed_cookie

    # parse max-age
    assert bc('__bwid=58244366; max-age=42; path=/').expires

    # security for received cookies:
    # (Set-Cookie value, URL of the response, should it be accepted?)
    set_cases = [
        ('k=v; domain=www.example.com', 'http://www.example.com/', True),
        ('k=v; domain=sub.example.com', 'http://www.example.com/', True),
        ('k=v; domain=sub.example.com', 'http://example.com/', True),
        ('k=v; domain=.example.com', 'http://example.com/', True),
        ('k=v; domain=www.example.com', 'http://example.com/', True),
        ('k=v; domain=example.com', 'http://example.net/', False),
        ('k=v; domain=.net', 'http://example.net/', False),
        ('k=v; domain=www.example.net', 'http://www.example.com/', False),
        ('k=v; domain=wwwexample.com', 'http://example.com/', False),
        ('k=v; domain=.example.com', 'http://wwwexample.com/', False),
    ]
    for header, url, accepted in set_cases:
        assert bool(jar._can_set(bc(header), url)) is accepted

    # pattern matching domains: (pattern, domain, should it match?)
    match_cases = [
        ('example.com', 's.example.com', False),
        ('.example.com', 's.example.com', True),
        ('.example.com', 'example.com', False),  # yep.
        ('s.example.com', 's.example.com', True),
        ('s.example.com', 's2.example.com', False),
    ]
    for pattern, domain, matched in match_cases:
        assert bool(jar._domain_match(pattern, domain)) is matched

    # pattern lists: (patterns, domain, should it match?)
    list_cases = [
        (True, 'example.com', True),
        ([], 'example.com', False),
        (['example.net', 'example.com'], 'example.com', True),
        (['example.net', 'example.org'], 'example.com', False),
    ]
    for patterns, domain, matched in list_cases:
        assert bool(jar._domain_match_list(patterns, domain)) is matched
|
||||||
|
|
||||||
|
|
||||||
|
def test_cookiejar():
    def bc(data):
        """
        Parse a Set-Cookie value and return its first cookie
        (None if nothing was parsed).
        """
        parsed = Cookies()
        parsed.parse_response(data)
        for parsed_cookie in parsed.itervalues():
            return parsed_cookie

    # filtering cookies
    cookie0 = bc('j=v; domain=www.example.com; path=/')
    cookie1 = bc('k=v1; domain=www.example.com; path=/; secure')
    cookie2 = bc('k=v2; domain=.example.com; path=/')
    cookie3 = bc('k=v3; domain=www.example.com; path=/lol/cat/')
    cookie4 = bc('k=v4; domain=www.example.com; path=/lol/')

    jar = cookiejar.CookieJar()
    for stored in (cookie0, cookie1, cookie2, cookie3, cookie4):
        jar.set(stored)

    # (filters given to all(), expected number of cookies)
    count_cases = [
        ({}, 5),  # all cookies
        ({'path': '/'}, 3),  # all cookies except the ones with deep paths
        ({'name': 'k'}, 4),  # this excludes cookie0
        ({'domain': 'example.com'}, 0),  # yep
        ({'domain': 's.example.com'}, 1),  # cookie2
        ({'domain': '.example.com'}, 1),  # cookie2
        ({'domain': 'www.example.com'}, 5),  # all cookies
        ({'domain': 'www.example.com', 'path': '/lol/'}, 4),  # "/" cookies + cookie4
        ({'domain': 'www.example.com', 'path': '/lol/cat'}, 4),  # "/" cookies + cookie4
        ({'domain': 'www.example.com', 'path': '/lol/cat/'}, 5),  # "/" cookies + cookie4 + cookie3
        ({'secure': True}, 1),  # cookie1
        ({'secure': False}, 4),  # all except cookie1
    ]
    for filters, expected_count in count_cases:
        assert len(jar.all(**filters)) == expected_count

    # (filters given to get(), expected best cookie)
    best_cases = [
        ({'domain': 'www.example.com', 'path': '/lol/'}, cookie4),
        ({'domain': 'www.example.com', 'path': '/lol/cat/'}, cookie3),
        ({'domain': 'www.example.com', 'path': '/'}, cookie1),
        ({'name': 'j', 'domain': 'www.example.com', 'path': '/'}, cookie0),
        ({'name': 'k', 'domain': 'www.example.com', 'path': '/'}, cookie1),
        ({'name': 'k', 'domain': 's.example.com', 'path': '/'}, cookie2),
        ({'name': 'k', 'domain': 'www.example.com', 'path': '/aaa'}, cookie1),
        ({'domain': 'www.example.com', 'path': '/'}, cookie1),
        ({'domain': 'www.example.com', 'path': '/', 'secure': False}, cookie0),
        ({'domain': 'www.example.com', 'path': '/', 'secure': True}, cookie1),
    ]
    for filters, expected_cookie in best_cases:
        assert jar.get(**filters) is expected_cookie

    # this is not just an API choice, but how browsers act
    request_cases = [
        ('http://www.example.com/', {'k': 'v2', 'j': 'v'}),
        ('https://www.example.com/', {'k': 'v1', 'j': 'v'}),
        ('http://www.example.com/lol/', {'k': 'v4', 'j': 'v'}),
        ('http://s.example.com/lol/', {'k': 'v2'}),
        ('http://example.com/lol/', {}),
    ]
    for url, expected_values in request_cases:
        assert jar.for_request(url) == expected_values

    # remove/add/replace (each step depends on the previous one)
    assert jar.remove(cookie1) is True
    assert jar.get(secure=True) is None
    jar.set(cookie1)
    assert jar.get(secure=True) is cookie1
    cookie5 = bc('k=w; domain=www.example.com; path=/; secure')
    jar.set(cookie5)
    assert jar.get(secure=True) is cookie5
    assert len(jar.all(secure=True)) == 1
    # not the same cookie, but the same identifiers
    assert jar.remove(cookie1) is True
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue