From 237036ce3e154c7ffd8d9b838f3936c0b8761a23 Mon Sep 17 00:00:00 2001
From: smurail <simon.murail@budget-insight.com>
Date: Mon, 20 Oct 2014 12:11:31 +0200
Subject: [PATCH] new api for content, encoding, and document building

---
 weboob/browser/pages.py | 153 ++++++++++++++++++++++++++--------------
 1 file changed, 101 insertions(+), 52 deletions(-)

diff --git a/weboob/browser/pages.py b/weboob/browser/pages.py
index 426f6ce9..27fcfcc8 100644
--- a/weboob/browser/pages.py
+++ b/weboob/browser/pages.py
@@ -92,12 +92,28 @@ class Page(object):
     """
     Represents a page.
 
+    Encoding can be forced by setting the :attr:`ENCODING` class-wide
+    attribute, or by passing an `encoding` keyword argument, which overrides
+    :attr:`ENCODING`. Finally, it can be manually changed by assigning a new
+    value to the :attr:`encoding` instance attribute. A unicode version of
+    the response content is accessible in :attr:`text`, decoded with the
+    specified :attr:`encoding`.
+
     :param browser: browser used to go on the page
     :type browser: :class:`weboob.browser.browsers.Browser`
     :param response: response object
     :type response: :class:`Response`
     :param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`)
     :type params: :class:`dict`
+    :param encoding: optional parameter to force the encoding of the page, overrides :attr:`ENCODING`
+    :type encoding: :class:`basestring`
+
+    """
+
+    ENCODING = None
+    """
+    Force a page encoding.
+    It is recommended to use None for autodetection.
     """
 
     logged = False
@@ -106,13 +122,59 @@ class Page(object):
     :class:`LoginBrowser` and the :func:`need_login` decorator.
     """
 
-    def __init__(self, browser, response, params=None):
+    def __init__(self, browser, response, params=None, encoding=None):
         self.browser = browser
         self.logger = getLogger(self.__class__.__name__.lower(), browser.logger)
         self.response = response
         self.url = self.response.url
         self.params = params
 
+        # Setup encoding and build document
+        self.forced_encoding = encoding or self.ENCODING
+        if self.forced_encoding:
+            self.response.encoding = self.forced_encoding
+        self.doc = self.build_doc(self.data)
+
+        # Unless an encoding was forced, let :meth:`detect_encoding` report a
+        # document-level declaration; a falsy result leaves the encoding as is
+        encoding = None if self.forced_encoding else self.detect_encoding()
+        if encoding and encoding != self.encoding:
+            self.response.encoding = encoding
+            self.doc = self.build_doc(self.data)
+
+    # Encoding issues are delegated to the Response instance, implemented by
+    # the requests module.
+
+    @property
+    def encoding(self):
+        return self.response.encoding
+
+    @encoding.setter
+    def encoding(self, value):
+        self.forced_encoding = True
+        self.response.encoding = value
+
+    @property
+    def content(self):
+        """
+        Raw content from response.
+        """
+        return self.response.content
+
+    @property
+    def text(self):
+        """
+        Content of the response, in unicode, decoded with :attr:`encoding`.
+        """
+        return self.response.text
+
+    @property
+    def data(self):
+        """
+        Data passed to :meth:`build_doc`.
+        """
+        return self.content
+
     def on_load(self):
         """
         Event called when browser loads this page.
@@ -123,6 +185,22 @@ class Page(object):
         Event called when browser leaves this page.
         """
 
+    def build_doc(self, content):
+        """
+        Abstract method to be implemented by subclasses to build structured
+        data (HTML, JSON, CSV...) from the :attr:`data` property. It can also
+        be overridden in module pages to preprocess or postprocess data. It
+        must return an object, which will be assigned to :attr:`doc`.
+        """
+        raise NotImplementedError()
+
+    def detect_encoding(self):
+        """
+        Override this method to implement detection of a document-level
+        encoding declaration, if any (e.g. HTML5's <meta charset="some-charset">).
+        """
+        return None
+
 
 class FormNotFound(Exception):
     """
@@ -266,17 +344,14 @@ class CsvPage(Page):
     This means the rows will be also available as dictionaries.
     """
 
-    def __init__(self, browser, response, *args, **kwargs):
-        super(CsvPage, self).__init__(browser, response, *args, **kwargs)
-        content = response.content
-        encoding = self.ENCODING
+    def build_doc(self, content):
+        encoding = self.encoding
         if encoding == 'utf-16le':
             content = content.decode('utf-16le')[1:].encode('utf-8')
             encoding = 'utf-8'
         if self.NEWLINES_HACK:
             content = content.replace('\r\n', '\n').replace('\r', '\n')
-        fp = BytesIO(content)
-        self.doc = self.parse(fp, encoding)
+        return self.parse(BytesIO(content), encoding)
 
     def parse(self, data, encoding=None):
         """
@@ -322,10 +397,13 @@ class JsonPage(Page):
     Json Page.
     """
 
-    def __init__(self, browser, response, *args, **kwargs):
-        super(JsonPage, self).__init__(browser, response, *args, **kwargs)
+    @property
+    def data(self):
+        return self.response.text
+
+    def build_doc(self, text):
         from weboob.tools.json import json
-        self.doc = json.loads(response.text)
+        return json.loads(text)
 
 
 class XMLPage(Page):
@@ -333,17 +411,10 @@ class XMLPage(Page):
     XML Page.
     """
 
-    ENCODING = None
-    """
-    Force a page encoding.
-    It is recommended to use None for autodetection.
-    """
-
-    def __init__(self, browser, response, *args, **kwargs):
-        super(XMLPage, self).__init__(browser, response, *args, **kwargs)
+    def build_doc(self, content):
         import lxml.etree as etree
-        parser = etree.XMLParser(encoding=self.ENCODING or response.encoding)
-        self.doc = etree.parse(BytesIO(response.content), parser)
+        parser = etree.XMLParser(encoding=self.encoding)
+        return etree.parse(BytesIO(content), parser)
 
 
 class RawPage(Page):
@@ -351,9 +422,8 @@ class RawPage(Page):
     Raw page where the "doc" attribute is the content string.
     """
 
-    def __init__(self, browser, response, *args, **kwargs):
-        super(RawPage, self).__init__(browser, response, *args, **kwargs)
-        self.doc = response.content
+    def build_doc(self, content):
+        return content
 
 
 class HTMLPage(Page):
@@ -376,25 +446,12 @@ class HTMLPage(Page):
     The class to instanciate when using :meth:`HTMLPage.get_form`. Default to :class:`Form`.
     """
 
-    ENCODING = None
-    """
-    Force a page encoding.
-    It is recommended to use None for autodetection.
-    """
-
     def __init__(self, *args, **kwargs):
-        encoding = kwargs.pop('encoding', self.ENCODING)
-
-        super(HTMLPage, self).__init__(*args, **kwargs)
-        self.doc = None
-        self.encoding = None
-
         import lxml.html as html
         ns = html.etree.FunctionNamespace(None)
         self.define_xpath_functions(ns)
-        self.build_doc(encoding)
-        if encoding is None:
-            self.check_encoding()
+
+        super(HTMLPage, self).__init__(*args, **kwargs)
 
     def define_xpath_functions(self, ns):
         """
@@ -435,24 +492,17 @@ class HTMLPage(Page):
             return bool(context.context_node.xpath(xpath))
         ns['has-class'] = has_class
 
-    def build_doc(self, encoding=None):
+    def build_doc(self, content):
         """
         Method to build the lxml document from response and given encoding.
         """
-        if encoding is None:
-            encoding = self.response.encoding
-
         import lxml.html as html
-        parser = html.HTMLParser(encoding=encoding)
-        self.doc = html.parse(BytesIO(self.response.content), parser)
-        self.encoding = encoding
-        return self.doc
+        parser = html.HTMLParser(encoding=self.encoding)
+        return html.parse(BytesIO(content), parser)
 
-    def check_encoding(self):
+    def detect_encoding(self):
         """
-        Check in the document the "http-equiv" and "charset" meta nodes. If the
-        specified charset isn't the one given by Content-Type HTTP header,
-        parse document again with the right encoding.
+        Look for encoding in the document "http-equiv" and "charset" meta nodes.
         """
         encoding = self.encoding
         for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
@@ -472,8 +522,7 @@ class HTMLPage(Page):
         except LookupError:
             encoding = 'windows-1252'
 
-        if encoding != self.encoding:
-            self.build_doc(encoding)
+        return encoding
 
     def get_form(self, xpath='//form', name=None, nr=None, submit=None):
         """