browser2: Add CookieJar and related cookie handling

Not used by Browser for now, but with very detailed tests. Does not handle expirations yet.
2012-04-10 07:05:25 +02:00 · 2012-04-10 07:05:25 +02:00 · 295e07b3ed
commit 295e07b3ed
parent cd9b9300bd
2 changed files with 453 additions and 0 deletions
--- a/weboob/tools/browser2/cookiejar.py
+++ b/weboob/tools/browser2/cookiejar.py
@ -0,0 +1,333 @@
+# TODO declare __all__
+# TODO support logging
+
+import urlparse
+from datetime import datetime, timedelta
+
+from .cookies import Cookie, Cookies, strip_spaces_and_quotes, Definitions
+
+
+def valid_domain(domain):
+    """
+    Validate a cookie domain; same idea as cookies.valid_domain, except
+    that one leading period is tolerated since servers send it all the time.
+    """
+    # fails right away with a unicode error if the domain is not plain ASCII
+    domain.encode('ascii')
+    # drop a single leading period before validating the rest
+    bare = domain[1:] if domain.startswith('.') else domain
+    # a domain starting with a double quote is never acceptable
+    if bare.startswith('"'):
+        return False
+    return bool(Definitions.DOMAIN_RE.match(bare))
+
+
+def parse_domain(value):
+    """
+    Clean up a raw Domain attribute; like cookies.parse_domain, except that
+    leading periods are accepted (validation is done by our valid_domain).
+    """
+    domain = strip_spaces_and_quotes(value)
+    # empty values pass through unchecked; anything else must validate
+    assert not domain or valid_domain(domain)
+    return domain
+
+# Replace the library's domain parser/validator with the lenient ones above.
+# This is ok because we use our own copy of the lib. TODO push a better way upstream
+Cookie.attribute_parsers['domain'] = parse_domain
+Cookie.attribute_validators['domain'] = valid_domain
+
+
+class CookieJar(object):
+    """
+    Manage Cookies like a real browser, with security and privacy in mind.
+    """
+
+    ACCEPT_DOMAINS = []
+    """
+    Domains where to accept cookies, even when we should not.
+    Add a "." before a domain to accept subdomains.
+    If True, accept all cookies (a bit insecure).
+    ACCEPT_DOMAINS has higher priority than REJECT_DOMAINS.
+
+    Disabling third-party cookies on most browsers acts like [], enabling them
+    acts like True. Since it is a very common browser option, we use the most
+    secure and privacy-aware method by default.
+    """
+
+    REJECT_DOMAINS = []
+    """
+    Domains where to reject cookies, even when we should not.
+    Add a "." before a domain to reject subdomains.
+    If True, reject all cookies.
+    REJECT_DOMAINS has lower priority than ACCEPT_DOMAINS.
+    """
+
+    SECURE_DOMAINS = True
+    """
+    When we get a cookie through a secure connection, mark it as secure
+    (not to be sent on insecure channels) if the server did not tell us to.
+    If True, do it automatically for all domains. Alternatively, you can put
+    a list of domains, like ACCEPT_DOMAINS or REJECT_DOMAINS.
+    If False, never do it (but still accept secure cookies as they are).
+
+    NoScript for Firefox does this, either by automated guesses or forced from a list.
+    """
+
+    INSECURE_MATCHING = True
+    """
+    Do sloppy matching to mimic what browsers do.
+    This is only for setting cookies; it should be relatively safe in Weboob.
+    """
+
+    def __init__(self):
+        self.cookies = dict()  # layout: {domain: {path: {name: Cookie}}}
+
+    def _domain_match(self, pattern, domain):
+        """
+        Checks a domain matches a domain pattern.
+        Patterns can be either the exact domain, or a wildcard (starting with a dot).
+
+        example.com matches example.com only
+        .example.com matches *.example.com (but not example.com)
+
+        :param pattern: str
+        :param domain: str
+        :rtype: bool
+        """
+        if pattern.startswith('.'):
+            return domain.endswith(pattern)
+        return domain == pattern
+
+    def _domain_match_list(self, patterns, domain):
+        """
+        Checks domains match, from a list of patterns.
+        If the list of patterns is True, it always matches;
+        if it is False (or empty), it never matches.
+
+        :param patterns: list, True or False
+        :param domain: str
+        :rtype: bool
+        """
+        if patterns is True:
+            return True
+        if not patterns:
+            return False
+        return any(self._domain_match(p, domain) for p in patterns)
+
+    def _can_set(self, cookie, url):
+        """
+        Checks an URL can set a particular cookie.
+        See ACCEPT_DOMAINS, REJECT_DOMAINS to set exceptions.
+
+        The cookie must have a domain already set, you can
+        use _normalize_cookie() for that.
+
+        :param cookie: The cookie the server set
+        :type cookie: Cookie
+        :param url: URL of the response
+        :type url: str
+
+        :rtype: bool
+        """
+        url = urlparse.urlparse(url)
+        domain = url.hostname
+
+        # Accept/reject overrides
+        if self._domain_match_list(self.ACCEPT_DOMAINS, domain):
+            return True
+        if self._domain_match_list(self.REJECT_DOMAINS, domain):
+            return False
+
+        # check path (an URL without any path is the same as "/")
+        if not (url.path or '/').startswith(cookie.path):
+            return False
+
+        # check domain (secure & simple)
+        if cookie.domain.startswith('.'):
+            if cookie.domain.endswith(domain) or '.%s' % domain == cookie.domain:
+                return True
+        elif domain == cookie.domain:
+            return True
+
+        # whatever.example.com should be able to set .example.com
+        # Unbelievably stupid, but widely used.
+        #
+        # Our method is not ideal, as it isn't very secure for some TLDs.
+        # A solution could be to use tldextract.
+        if self.INSECURE_MATCHING:
+            if domain.split('.')[-2:] == cookie.domain.split('.')[-2:]:
+                return True
+
+        return False
+
+    def _normalize_cookie(self, cookie, url):
+        """
+        Update (in place) a cookie we got from the response at the given URL.
+        The goal is to have data relevant for use in future requests.
+        * Sets domain if there is not one.
+        * Sets path if there is not one.
+        * Set Expires from Max-Age. We need the expires to have an absolute expiration date.
+        * Force the Secure flag if required. (see SECURE_DOMAINS)
+        """
+        url = urlparse.urlparse(url)
+        if cookie.domain is None:
+            cookie.domain = url.hostname
+        if cookie.path is None:
+            cookie.path = '/'
+        if cookie.max_age is not None:
+            cookie.expires = datetime.now() + timedelta(seconds=cookie.max_age)
+        if url.scheme == 'https' \
+        and self._domain_match_list(self.SECURE_DOMAINS, cookie.domain):
+            cookie.secure = True
+
+    def from_response(self, response):
+        """
+        Import cookies from the response.
+
+        :type response: responses.Response
+        """
+        if 'Set-Cookie' in response.headers:
+            cs = Cookies.from_response(response.headers['Set-Cookie'], True)
+            for c in cs.itervalues():
+                self._normalize_cookie(c, response.url)
+                if self._can_set(c, response.url):
+                    self.set(c)
+
+    def for_request(self, url):
+        """
+        Get a key/value dictionary of cookies for a given request URL.
+
+        :type url: str
+        :rtype: dict
+        """
+        url = urlparse.urlparse(url)
+        # we want insecure cookies in https too!
+        secure = None if url.scheme == 'https' else False
+        cdict = dict()
+        # get sorted cookies (an URL without any path is the same as "/")
+        cookies = self.all(domain=url.hostname, path=url.path or '/', secure=secure)
+        for cookie in cookies:
+            # update only if not set, since first cookies are "better"
+            cdict.setdefault(cookie.name, cookie.value)
+        return cdict
+
+    def set(self, cookie):
+        """
+        Add or replace a Cookie in the jar.
+        This is for normalized and checked cookies, no validation is done.
+        Use from_response() to import cookies from a python-requests response.
+
+        :type cookie: cookies.Cookie
+        """
+        # cookies are unique by domain, path and of course name
+        assert len(cookie.domain)
+        assert len(cookie.path)
+        assert len(cookie.name)
+        self.cookies.setdefault(cookie.domain, {}). \
+                setdefault(cookie.path, {})[cookie.name] = cookie
+
+    def iter(self, name=None, domain=None, path=None, secure=None):
+        """
+        Iterate matching cookies.
+        You can restrict by name, domain, path or security.
+
+        :type name: str
+        :type domain: str
+        :type path: str
+        :type secure: bool
+
+        :rtype: iter[:class:`cookies.Cookie`]
+        """
+        for cdomain, cpaths in self.cookies.iteritems():
+            # domain matches (all domains if None)
+            if domain is None or self._domain_match(cdomain, domain):
+                for cpath, cnames in cpaths.iteritems():
+                    # path matches (all if None)
+                    if path is None or path.startswith(cpath):
+                        for cname, cookie in cnames.iteritems():
+                            # only wanted name (all if None)
+                            if name is None or name == cname:
+                                # wanted security (all if None)
+                                # cookie.secure can be "None" if not secure!
+                                if secure is None \
+                                or (secure is False and not cookie.secure) \
+                                or (secure is True and cookie.secure):
+                                    yield cookie
+
+    def all(self, name=None, domain=None, path=None, secure=None):
+        """
+        Like iter(), but sorts the cookies, from most precise to less precise.
+
+        :rtype: list[:class:`cookies.Cookie`]
+        """
+        cookies = list(self.iter(name, domain, path, secure))
+
+        # slowly compare all cookies
+        # XXX one of the worst things I've ever written
+        COOKIE1 = 1
+        COOKIE2 = -1
+
+        def ccmp(cookie1, cookie2):
+            # most precise matching domain
+            if domain and cookie1.domain != cookie2.domain:
+                if cookie1.domain == domain:
+                    return COOKIE1
+                if cookie2.domain == domain:
+                    return COOKIE2
+            if len(cookie1.domain) > len(cookie2.domain):
+                return COOKIE1
+            if len(cookie2.domain) > len(cookie1.domain):
+                return COOKIE2
+            # most precise matching path
+            if len(cookie1.path) > len(cookie2.path):
+                return COOKIE1
+            if len(cookie2.path) > len(cookie1.path):
+                return COOKIE2
+            # most secure
+            if cookie1.secure and not cookie2.secure:
+                return COOKIE1
+            if cookie2.secure and not cookie1.secure:
+                return COOKIE2
+            return 0
+
+        return sorted(cookies, cmp=ccmp, reverse=True)
+
+    def get(self, name=None, domain=None, path=None, secure=None):
+        """
+        Return the best cookie from all().
+        Useful for changing the value or deleting a cookie.
+
+        name, domain, path and secure are the same as iter().
+
+        :rtype: :class:`cookies.Cookie` or None
+        """
+        cookies = self.all(name, domain, path, secure)
+        # an empty list means nothing matched
+        if cookies:
+            return cookies[0]
+        return None
+
+    def remove(self, cookie):
+        """
+        Remove a cookie. The cookie argument must have the same domain, path and name.
+        Return False if not present, True if just removed.
+
+        :type cookie: :class:`cookies.Cookie`
+        :rtype: bool
+        """
+        # cookies are unique by domain, path and of course name
+        assert len(cookie.domain)
+        assert len(cookie.path)
+        assert len(cookie.name)
+        d = self.cookies.get(cookie.domain, {}).get(cookie.path, {})
+        if cookie.name in d:
+            del d[cookie.name]
+            return True
+        return False
+
+    def clear(self):
+        """
+        Remove all cookies.
+        """
+        self.cookies.clear()
--- a/weboob/tools/browser2/test.py
+++ b/weboob/tools/browser2/test.py
@ -23,6 +23,8 @@ import requests
 from nose.plugins.skip import SkipTest

 from .browser import BaseBrowser, DomainBrowser, Weboob
+from . import cookiejar
+from .cookies import Cookies

 from weboob.tools.json import json

@ -175,3 +177,121 @@ def test_referrer():
    assert 'Referer' not in json.loads(r.text)['headers']

    assert b._get_referrer('https://example.com/', 'http://example.com/') is None
+
+
+def test_cookieparse():
+    cj = cookiejar.CookieJar()
+
+    def bc(data):
+        """
+        parse a Set-Cookie value, then normalize and return its first cookie
+        """
+        cs = Cookies()
+        cs.parse_response(data)
+        for c in cs.itervalues():
+            cj._normalize_cookie(c, 'http://example.com/')
+            return c
+
+    # normalizing must turn max-age into an absolute expires date
+    assert bc('__bwid=58244366; max-age=42; path=/').expires
+
+    # which response URLs are allowed to set a cookie for which domain
+    assert cj._can_set(bc('k=v; domain=www.example.com'),
+            'http://www.example.com/')
+    assert cj._can_set(bc('k=v; domain=sub.example.com'),
+            'http://www.example.com/')
+    assert cj._can_set(bc('k=v; domain=sub.example.com'),
+            'http://example.com/')
+    assert cj._can_set(bc('k=v; domain=.example.com'),
+            'http://example.com/')
+    assert cj._can_set(bc('k=v; domain=www.example.com'),
+            'http://example.com/')
+    assert not cj._can_set(bc('k=v; domain=example.com'),
+            'http://example.net/')
+    assert not cj._can_set(bc('k=v; domain=.net'),
+            'http://example.net/')
+    assert not cj._can_set(bc('k=v; domain=www.example.net'),
+            'http://www.example.com/')
+    assert not cj._can_set(bc('k=v; domain=wwwexample.com'),
+            'http://example.com/')
+    assert not cj._can_set(bc('k=v; domain=.example.com'),
+            'http://wwwexample.com/')
+
+    # domain patterns: exact match, or leading-dot wildcard for subdomains
+    assert not cj._domain_match('example.com', 's.example.com')
+    assert cj._domain_match('.example.com', 's.example.com')
+    assert not cj._domain_match('.example.com', 'example.com')  # the wildcard excludes the bare domain
+    assert cj._domain_match('s.example.com', 's.example.com')
+    assert not cj._domain_match('s.example.com', 's2.example.com')
+    assert cj._domain_match_list(True, 'example.com')
+    assert not cj._domain_match_list([], 'example.com')
+    assert cj._domain_match_list(['example.net', 'example.com'], 'example.com')
+    assert not cj._domain_match_list(['example.net', 'example.org'], 'example.com')
+
+
+def test_cookiejar():
+    def bc(data):
+        """
+        parse a Set-Cookie value and return its first cookie (not normalized)
+        """
+        cs = Cookies()
+        cs.parse_response(data)
+        for c in cs.itervalues():
+            return c
+
+    # fixtures: same or different names, domains, paths and secure flags
+    cookie0 = bc('j=v; domain=www.example.com; path=/')
+    cookie1 = bc('k=v1; domain=www.example.com; path=/; secure')
+    cookie2 = bc('k=v2; domain=.example.com; path=/')
+    cookie3 = bc('k=v3; domain=www.example.com; path=/lol/cat/')
+    cookie4 = bc('k=v4; domain=www.example.com; path=/lol/')
+
+    cj = cookiejar.CookieJar()
+    cj.set(cookie0)
+    cj.set(cookie1)
+    cj.set(cookie2)
+    cj.set(cookie3)
+    cj.set(cookie4)
+
+    assert len(cj.all()) == 5  # no filter: all cookies
+    assert len(cj.all(path='/')) == 3  # all cookies except the ones with deep paths
+    assert len(cj.all(name='k')) == 4  # this excludes cookie0
+    assert len(cj.all(domain='example.com')) == 0  # the .example.com wildcard excludes the bare domain
+    assert len(cj.all(domain='s.example.com')) == 1  # cookie2 (wildcard match)
+    assert len(cj.all(domain='.example.com')) == 1  # cookie2 (exact match)
+    assert len(cj.all(domain='www.example.com')) == 5  # all cookies
+    assert len(cj.all(domain='www.example.com', path="/lol/")) == 4  # root-path cookies + cookie4
+    assert len(cj.all(domain='www.example.com', path="/lol/cat")) == 4  # root-path cookies + cookie4
+    assert len(cj.all(domain='www.example.com', path="/lol/cat/")) == 5  # root-path cookies + cookie4 + cookie3
+    assert len(cj.all(secure=True)) == 1  # cookie1
+    assert len(cj.all(secure=False)) == 4  # all except cookie1
+
+    assert cj.get(domain='www.example.com', path="/lol/") is cookie4
+    assert cj.get(domain='www.example.com', path="/lol/cat/") is cookie3
+    assert cj.get(domain='www.example.com', path="/") is cookie1
+    assert cj.get(name='j', domain='www.example.com', path="/") is cookie0
+    assert cj.get(name='k', domain='www.example.com', path="/") is cookie1
+    assert cj.get(name='k', domain='s.example.com', path="/") is cookie2
+    assert cj.get(name='k', domain='www.example.com', path="/aaa") is cookie1
+    assert cj.get(domain='www.example.com', path='/') is cookie1
+    assert cj.get(domain='www.example.com', path='/', secure=False) is cookie0
+    assert cj.get(domain='www.example.com', path='/', secure=True) is cookie1
+
+    # this is not just an API choice, but how browsers act
+    assert cj.for_request('http://www.example.com/') == {'k': 'v2', 'j': 'v'}
+    assert cj.for_request('https://www.example.com/') == {'k': 'v1', 'j': 'v'}
+    assert cj.for_request('http://www.example.com/lol/') == {'k': 'v4', 'j': 'v'}
+    assert cj.for_request('http://s.example.com/lol/') == {'k': 'v2'}
+    assert cj.for_request('http://example.com/lol/') == {}
+
+    # remove, re-add, then replace by a cookie with the same domain/path/name
+    assert cj.remove(cookie1) is True
+    assert cj.get(secure=True) is None
+    cj.set(cookie1)
+    assert cj.get(secure=True) is cookie1
+    cookie5 = bc('k=w; domain=www.example.com; path=/; secure')
+    cj.set(cookie5)
+    assert cj.get(secure=True) is cookie5
+    assert len(cj.all(secure=True)) == 1
+    # cookie1 is a different object than cookie5, but has the same identifiers
+    assert cj.remove(cookie1) is True