weboob-devel/weboob/tools/browser2/cookiejar.py

# TODO declare __all__
# TODO support logging

from urlparse import urlparse
from datetime import datetime, timedelta
import posixpath

from .cookies import Cookie, Cookies, strip_spaces_and_quotes, Definitions


def valid_domain(domain):
    """
    Tell whether a cookie domain is acceptable.

    Same idea as cookies.valid_domain, except that one leading period is
    tolerated, since it is *very* common and useful for us.

    A domain that cannot be encoded as ASCII is not reported as invalid:
    the encoding error is left to propagate to the caller.
    """
    # only called for its side effect: it raises on non-ASCII input
    domain.encode('ascii')
    # ignore a single leading period before doing the real checks
    bare = domain[1:] if domain.startswith('.') else domain
    # a quote right at the start is never a valid domain
    if bare.startswith('"'):
        return False
    return bool(Definitions.DOMAIN_RE.match(bare))


def parse_domain(value):
    """
    Clean up and check a Domain attribute value.

    Same idea as cookies.parse_domain, except that leading periods are
    tolerated, since it is *very* common and useful for us.

    Empty values are returned without any validation; other values must
    satisfy valid_domain() (checked through an assertion).
    """
    cleaned = strip_spaces_and_quotes(value)
    if not cleaned:
        return cleaned
    assert valid_domain(cleaned)
    return cleaned

# Replace the 'domain' entries of the Cookie class-level parser and validator
# tables with the period-tolerant versions defined in this module.
# NOTE(review): presumably Cookie consults these tables whenever it parses or
# validates a Domain attribute - confirm in the cookies module.
# Patching the class in place is ok because we are using our own copy of
# the lib.
# TODO push a better way upstream
Cookie.attribute_parsers['domain'] = parse_domain
Cookie.attribute_validators['domain'] = valid_domain


class CookieJar(object):
    """
    Manage Cookies like a real browser, with security and privacy in mind.

    python-requests accepts cookies blindly,
    Expirations are not taken into account,
    it can't handle the server asking to delete a cookie,
    and sends cookies even when changing domains!
    Of course, secure (SSL only) cookies aren't handled either.

    This class fixes all that.

    Cookies are stored in ``self.cookies``, a nested dictionary indexed by
    domain, then path, then name.
    """

    ACCEPT_DOMAINS = []
    """
    Domains where to accept cookies, even when we should not.
    Add a "." before a domain to accept subdomains.
    If True, accept all cookies (a bit insecure).
    ACCEPT_DOMAINS has higher priority over REJECT_DOMAINS.

    Disabling third-party cookies on most browsers acts like [], enabling them
    acts like True. Since it is a very common browser option, we use the most
    secure and privacy-aware method by default.
    """

    REJECT_DOMAINS = []
    """
    Domains where to reject cookies, even when we should not.
    Add a "." before a domain to reject subdomains.
    If True, reject all cookies.
    REJECT_DOMAINS has lower priority over ACCEPT_DOMAINS.
    """

    SECURE_DOMAINS = True
    """
    When we get a cookie through a secure connection, mark it as secure
    (not to be sent on insecure channels) if the server did not tell us to.
    If True, do it automatically for all domains. Alternatively, you can put
    a list of domains, like ACCEPT_DOMAINS or REJECT_DOMAINS.
    If False, never do it (but still accept secure cookies as they are).

    NoScript for Firefox does this, either by automated guesses or forced from a list.
    """

    INSECURE_MATCHING = True
    """
    Do sloppy matching to mimic what browsers do.
    This is only for setting cookies; it should be relatively safe in Weboob.
    """

    def __init__(self):
        """
        Cookies are delicious delicacies.
        """
        self.cookies = dict()

    def _domain_match(self, pattern, domain):
        """
        Checks a domain matches a domain pattern.
        Patterns can be either the exact domain, or a wildcard (starting with a dot).

        example.com matches example.com only
        .example.com matches *.example.com (but not example.com)

        :param pattern: str
        :param domain: str
        :rtype: bool
        """
        if pattern.startswith('.'):
            return domain.endswith(pattern)
        return domain == pattern

    def _domain_match_list(self, patterns, domain):
        """
        Checks domains match, from a list of patterns.
        If the list of patterns is True, it always matches.
        If it is False, None or empty, it never matches.

        :param patterns: list or bool
        :param domain: str
        :rtype: bool
        """
        if patterns is True:
            return True
        if not patterns:
            # False is a documented value of SECURE_DOMAINS; it must not be
            # iterated over.
            return False
        return any(self._domain_match(pattern, domain) for pattern in patterns)

    def _can_set(self, cookie, url):
        """
        Checks an URL can set a particular cookie.
        See ACCEPT_DOMAINS, REJECT_DOMAINS to set exceptions.

        The cookie must have a domain and a path already set, you can
        use _normalize_cookie() for that.

        :param cookie: The cookie the server set
        :type cookie: Cookie
        :param url: URL of the response
        :type url: str

        :rtype: bool
        """
        url = urlparse(url)
        domain = url.hostname

        # Accept/reject overrides
        if self._domain_match_list(self.ACCEPT_DOMAINS, domain):
            return True
        if self._domain_match_list(self.REJECT_DOMAINS, domain):
            return False

        # check path; an URL without any path is the root of the site
        if not (url.path or '/').startswith(cookie.path):
            return False

        # check domain (secure & simple)
        if cookie.domain.startswith('.'):
            # The wildcard must be the one of the response host or of one of
            # its subdomains. Matching on the leading period keeps
            # example.com from setting cookies for .badexample.com.
            if cookie.domain.endswith('.%s' % domain):
                return True
        elif domain == cookie.domain:
            return True

        # whatever.example.com should be able to set .example.com
        # Unbelievably stupid, but widely used.
        #
        # Our method is not ideal, as it isn't very secure for some TLDs.
        # A solution could be to use tldextract.
        if self.INSECURE_MATCHING:
            if domain.split('.')[-2:] == cookie.domain.split('.')[-2:]:
                return True

        return False

    def _normalize_cookie(self, cookie, url, now=None):
        """
        Update a cookie we got from the response.
        The goal is to have data relevant for use in future requests.
        * Sets domain if there is not one.
        * Sets path if there is not one.
        * Set Expires from Max-Age. We need the expires to have an absolute expiration date.
        * Force the Secure flag if required. (see SECURE_DOMAINS)

        The cookie is modified in place; nothing is returned.

        :type cookie: :class:`cookies.Cookie`
        :type url: str
        :param now: reference date for Max-Age (defaults to the current date)
        :type now: datetime
        """
        url = urlparse(url)
        if cookie.domain is None:
            cookie.domain = url.hostname
        if cookie.path is None:
            cookie.path = '/'
        if cookie.max_age is not None:
            if now is None:
                now = datetime.now()
            cookie.expires = now + timedelta(seconds=cookie.max_age)
        if url.scheme == 'https' \
        and self._domain_match_list(self.SECURE_DOMAINS, cookie.domain):
            cookie.secure = True

    def from_response(self, response):
        """
        Import cookies from the response.
        Each cookie is normalized, then stored only if the response URL
        is allowed to set it.

        :type response: responses.Response
        """
        if 'Set-Cookie' not in response.headers:
            return
        parsed = Cookies.from_response(response.headers['Set-Cookie'], True)
        for cookie in parsed.values():
            self._normalize_cookie(cookie, response.url)
            if self._can_set(cookie, response.url):
                self.set(cookie)

    def for_request(self, url, now=None):
        """
        Get a key/value dictionary of cookies for a given request URL.

        :type url: str
        :param now: reference date for expirations (defaults to the current date)
        :type now: datetime
        :rtype: dict
        """
        url = urlparse(url)
        if now is None:
            now = datetime.now()
        # we want insecure cookies in https too!
        secure = None if url.scheme == 'https' else False

        cdict = dict()
        # get sorted cookies; an URL without any path is the root of the site
        cookies = self.all(domain=url.hostname, path=url.path or '/',
                           secure=secure)
        for cookie in cookies:
            # only use session cookies and cookies with future expirations
            if cookie.expires is None or cookie.expires > now:
                # update only if not set, since first cookies are "better"
                cdict.setdefault(cookie.name, cookie.value)
        return cdict

    def flush(self, now=None, session=False):
        """
        Remove expired cookies. If session is True, also remove all session cookies.

        :param now: reference date for expirations (defaults to the current date)
        :type now: datetime
        :type session: bool
        """
        if now is None:
            now = datetime.now()
        # we need a list copy since we remove from the iterable
        for cookie in list(self.iter()):
            if cookie.expires is None:
                # session cookie: only removed if requested
                outdated = session
            else:
                # non-session cookie: removed if expired before now
                outdated = cookie.expires < now
            if outdated:
                self.remove(cookie)

    def set(self, cookie):
        """
        Add or replace a Cookie in the jar.
        This is for normalized and checked cookies, no validation is done.
        Use from_response() to import cookies from a python-requests response.

        :type cookie: cookies.Cookie
        """
        # cookies are unique by domain, path and of course name
        assert len(cookie.domain)
        assert len(cookie.path)
        assert len(cookie.name)
        by_path = self.cookies.setdefault(cookie.domain, {})
        by_path.setdefault(cookie.path, {})[cookie.name] = cookie

    def iter(self, name=None, domain=None, path=None, secure=None):
        """
        Iterate matching cookies.
        You can restrict by name, domain, path or security.

        :type name: str
        :type domain: str
        :type path: str
        :type secure: bool

        :rtype: iter[:class:`cookies.Cookie`]
        """
        for cdomain, cpaths in self.cookies.items():
            # domain matches (all domains if None)
            if domain is not None and not self._domain_match(cdomain, domain):
                continue
            for cpath, cnames in cpaths.items():
                # path matches (all if None)
                if path is not None and not path.startswith(cpath):
                    continue
                for cname, cookie in cnames.items():
                    # only wanted name (all if None)
                    if name is not None and name != cname:
                        continue
                    # wanted security (all if None)
                    # cookie.secure can be "None" if not secure!
                    if secure is None \
                    or (secure is False and not cookie.secure) \
                    or (secure is True and cookie.secure):
                        yield cookie

    def all(self, name=None, domain=None, path=None, secure=None):
        """
        Like iter(), but sorts the cookies, from most precise to less precise.

        The criteria are, by decreasing importance: domain equal to the
        requested one, longest domain, longest path, secure flag.

        :rtype: list[:class:`cookies.Cookie`]
        """
        def precision(cookie):
            # tuples compare element by element, which gives the criteria
            # their relative importance
            exact_domain = bool(domain) and cookie.domain == domain
            return (exact_domain, len(cookie.domain), len(cookie.path),
                    bool(cookie.secure))

        return sorted(self.iter(name, domain, path, secure),
                      key=precision, reverse=True)

    def get(self, name=None, domain=None, path=None, secure=None):
        """
        Return the best cookie from all().
        Useful for changing the value or deleting a cookie.

        name, domain, path and secure are the same as iter().

        :rtype: :class:`cookies.Cookie` or None
        """
        cookies = self.all(name, domain, path, secure)
        return cookies[0] if cookies else None

    def remove(self, cookie):
        """
        Remove a cookie. The cookie argument must have the same domain, path and name.
        Return False if not present, True if just removed.

        :type cookie: :class:`cookies.Cookie`
        :rtype: bool
        """
        # cookies are unique by domain, path and of course name
        assert len(cookie.domain)
        assert len(cookie.path)
        assert len(cookie.name)
        # both levels default to an empty dict, so an unknown domain or an
        # unknown path just means "not present"
        names = self.cookies.get(cookie.domain, {}).get(cookie.path, {})
        if cookie.name in names:
            del names[cookie.name]
            return True
        return False

    def clear(self):
        """
        Remove all cookies.
        """
        self.cookies.clear()

    def build(self, name, value, url, path=None, wildcard=False):
        """
        Build a Cookie object for the current URL.
        The cookie is only returned, it is not added to the jar.

        The domain and path are guessed. If you want to set for the whole domain,
        take care of what you put in URL!
        build(k, v, 'http://example.com/hello/world') will only set cookie for the
        /hello/ path.
        An URL without any path is handled like the root of the site.

        `name` and `value` are required parameters of Cookie.__init__()

        You can force the `path` if you want.

        The `wildcard` parameter will add a period before the domain.

        The cookie is marked as secure if the URL scheme is https.

        Typical usage would be, inside a DomainBrowser:
            cookie = self.cookies.build(k, v, self.url)
            cookie = self.cookies.build(k, v, self.absurl('/'))
            cookie = self.cookies.build(k, v, self.BASEURL)

        And then:
            self.cookies.set(cookie)

        For more advanced usage, create a Cookie object manually, or
        alter the returned Cookie object before set().

        :type name: basestring
        :type value: basestring
        :type url: str
        :type path: str
        :type wildcard: bool
        :rtype cookie: :class:`cookies.Cookie`
        """
        cookie = Cookie(name, value)
        url = urlparse(url)
        if wildcard:
            cookie.domain = '.' + url.hostname
        else:
            cookie.domain = url.hostname
        if path is None:
            # keep the directory part only, with a trailing slash; an empty
            # URL path would give an empty cookie path, which set() refuses
            directory = posixpath.dirname(url.path or '/')
            cookie.path = posixpath.join(directory, '')
        else:
            cookie.path = path
        if url.scheme == 'https':
            cookie.secure = True
        return cookie