new api for content, encoding, and document building

2014-10-20 12:11:31 +02:00 · 2014-10-20 12:11:31 +02:00 · 237036ce3e
commit 237036ce3e
parent 9a3afd2dc1
1 changed files with 101 additions and 52 deletions
--- a/weboob/browser/pages.py
+++ b/weboob/browser/pages.py
@ -92,12 +92,28 @@ class Page(object):
    """
    Represents a page.

+    Encoding can be forced by setting the :attr:`ENCODING` class-wide
+    attribute, or by passing an `encoding` keyword argument, which overrides
+    :attr:`ENCODING`. Finally, it can be manually changed by assigning a new
+    value to :attr:`encoding` instance attribute. A unicode version of the
+    response content is accessible in :attr:`text`, decoded with specified
+    :attr:`encoding`.
+
    :param browser: browser used to go on the page
    :type browser: :class:`weboob.browser.browsers.Browser`
    :param response: response object
    :type response: :class:`Response`
    :param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`)
    :type params: :class:`dict`
+    :param encoding: optional parameter to force the encoding of the page, overrides :attr:`ENCODING`
+    :type encoding: :class:`basestring`
+
+    """
+
+    ENCODING = None
+    """
+    Force a page encoding.
+    It is recommended to use None for autodetection.
    """

    logged = False
@ -106,13 +122,59 @@ class Page(object):
    :class:`LoginBrowser` and the :func:`need_login` decorator.
    """

-    def __init__(self, browser, response, params=None):
+    def __init__(self, browser, response, params=None, encoding=None):
        self.browser = browser
        self.logger = getLogger(self.__class__.__name__.lower(), browser.logger)
        self.response = response
        self.url = self.response.url
        self.params = params

+        # Setup encoding and build document
+        self.forced_encoding = encoding or self.ENCODING
+        if self.forced_encoding:
+            self.response.encoding = self.forced_encoding
+        self.doc = self.build_doc(self.data)
+
+        # Last chance to change encoding, according to :meth:`detect_encoding`,
+        # which can be used to detect a document-level encoding declaration
+        encoding = self.detect_encoding()
+        if encoding != self.encoding:
+            self.response.encoding = encoding
+            self.doc = self.build_doc(self.data)
+
+    # Encoding issues are delegated to Response instance, implemented by
+    # requests module.
+
+    @property
+    def encoding(self):
+        return self.response.encoding
+
+    @encoding.setter
+    def encoding(self, value):
+        self.forced_encoding = True
+        self.response.encoding = value
+
+    @property
+    def content(self):
+        """
+        Raw content from response.
+        """
+        return self.response.content
+
+    @property
+    def text(self):
+        """
+        Content of the response, in unicode, decoded with :attr:`encoding`.
+        """
+        return self.response.text
+
+    @property
+    def data(self):
+        """
+        Data passed to :meth:`build_doc`.
+        """
+        return self.content
+
    def on_load(self):
        """
        Event called when browser loads this page.
@ -123,6 +185,22 @@ class Page(object):
        Event called when browser leaves this page.
        """

+    def build_doc(self, content):
+        """
+        Abstract method to be implemented by subclasses to build structured
+        data (HTML, Json, CSV...) from :attr:`data` property. It also can be
+        overriden in modules pages to preprocess or postprocess data. It must
+        return an object -- that will be assigned to :attr:`doc`.
+        """
+        raise NotImplemented
+
+    def detect_encoding(self):
+        """
+        Override this method to implement detection of document-level encoding
+        declaration, if any (eg. html5's <meta charset="some-charset">).
+        """
+        return None
+

 class FormNotFound(Exception):
    """
@ -266,17 +344,14 @@ class CsvPage(Page):
    This means the rows will be also available as dictionaries.
    """

-    def __init__(self, browser, response, *args, **kwargs):
-        super(CsvPage, self).__init__(browser, response, *args, **kwargs)
-        content = response.content
-        encoding = self.ENCODING
+    def build_doc(self, content):
+        encoding = self.encoding
        if encoding == 'utf-16le':
            content = content.decode('utf-16le')[1:].encode('utf-8')
            encoding = 'utf-8'
        if self.NEWLINES_HACK:
            content = content.replace('\r\n', '\n').replace('\r', '\n')
-        fp = BytesIO(content)
-        self.doc = self.parse(fp, encoding)
+        return self.parse(BytesIO(content), encoding)

    def parse(self, data, encoding=None):
        """
@ -322,10 +397,13 @@ class JsonPage(Page):
    Json Page.
    """

-    def __init__(self, browser, response, *args, **kwargs):
-        super(JsonPage, self).__init__(browser, response, *args, **kwargs)
+    @property
+    def data(self):
+        return self.response.text
+
+    def build_doc(self, text):
        from weboob.tools.json import json
-        self.doc = json.loads(response.text)
+        return json.loads(text)


 class XMLPage(Page):
@ -333,17 +411,10 @@ class XMLPage(Page):
    XML Page.
    """

-    ENCODING = None
-    """
-    Force a page encoding.
-    It is recommended to use None for autodetection.
-    """
-
-    def __init__(self, browser, response, *args, **kwargs):
-        super(XMLPage, self).__init__(browser, response, *args, **kwargs)
+    def build_doc(self, content):
        import lxml.etree as etree
-        parser = etree.XMLParser(encoding=self.ENCODING or response.encoding)
-        self.doc = etree.parse(BytesIO(response.content), parser)
+        parser = etree.XMLParser(encoding=self.encoding)
+        return etree.parse(BytesIO(content), parser)


 class RawPage(Page):
@ -351,9 +422,8 @@ class RawPage(Page):
    Raw page where the "doc" attribute is the content string.
    """

-    def __init__(self, browser, response, *args, **kwargs):
-        super(RawPage, self).__init__(browser, response, *args, **kwargs)
-        self.doc = response.content
+    def build_doc(self, content):
+        return content


 class HTMLPage(Page):
@ -376,25 +446,12 @@ class HTMLPage(Page):
    The class to instanciate when using :meth:`HTMLPage.get_form`. Default to :class:`Form`.
    """

-    ENCODING = None
-    """
-    Force a page encoding.
-    It is recommended to use None for autodetection.
-    """
-
    def __init__(self, *args, **kwargs):
-        encoding = kwargs.pop('encoding', self.ENCODING)
-
-        super(HTMLPage, self).__init__(*args, **kwargs)
-        self.doc = None
-        self.encoding = None
-
        import lxml.html as html
        ns = html.etree.FunctionNamespace(None)
        self.define_xpath_functions(ns)
-        self.build_doc(encoding)
-        if encoding is None:
-            self.check_encoding()
+
+        super(HTMLPage, self).__init__(*args, **kwargs)

    def define_xpath_functions(self, ns):
        """
@ -435,24 +492,17 @@ class HTMLPage(Page):
            return bool(context.context_node.xpath(xpath))
        ns['has-class'] = has_class

-    def build_doc(self, encoding=None):
+    def build_doc(self, content):
        """
        Method to build the lxml document from response and given encoding.
        """
-        if encoding is None:
-            encoding = self.response.encoding
-
        import lxml.html as html
-        parser = html.HTMLParser(encoding=encoding)
-        self.doc = html.parse(BytesIO(self.response.content), parser)
-        self.encoding = encoding
-        return self.doc
+        parser = html.HTMLParser(encoding=self.encoding)
+        return html.parse(BytesIO(content), parser)

-    def check_encoding(self):
+    def detect_encoding(self):
        """
-        Check in the document the "http-equiv" and "charset" meta nodes. If the
-        specified charset isn't the one given by Content-Type HTTP header,
-        parse document again with the right encoding.
+        Look for encoding in the document "http-equiv" and "charset" meta nodes.
        """
        encoding = self.encoding
        for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
@ -472,8 +522,7 @@ class HTMLPage(Page):
        except LookupError:
            encoding = 'windows-1252'

-        if encoding != self.encoding:
-            self.build_doc(encoding)
+        return encoding

    def get_form(self, xpath='//form', name=None, nr=None, submit=None):
        """