HTMLPage checks the charset declared inside the document and parses the document again if it differs from the one given in the Content-Type HTTP header

2014-10-08 14:07:35 +02:00 · 2014-10-08 14:07:35 +02:00 · 65ece349db
commit 65ece349db
parent dea6f1cf76
1 changed files with 40 additions and 3 deletions
--- a/weboob/browser/pages.py
+++ b/weboob/browser/pages.py
@ -21,6 +21,8 @@ from __future__ import absolute_import
 import warnings
 from io import BytesIO
 import codecs
 from cgi import parse_header
 import requests
@ -293,11 +295,46 @@ class HTMLPage(Page):
    """
    def __init__(self, browser, response, *args, **kwargs):
        # An explicit ``encoding`` keyword argument wins; otherwise fall back
        # to the class-level ENCODING, then to the encoding of the response.
        # It is popped so that it is not forwarded to the parent constructor.
        encoding = kwargs.pop('encoding', self.ENCODING or response.encoding)
        super(HTMLPage, self).__init__(browser, response, *args, **kwargs)
-        self.encoding = self.ENCODING or response.encoding
+
        # The custom XPath functions are registered on the namespace obtained
        # from FunctionNamespace(None); check_encoding() calls lower-case()
        # without a prefix, so they must be registered before it runs.
        import lxml.html as html
-        parser = html.HTMLParser(encoding=self.encoding)
+        ns = html.etree.FunctionNamespace(None)
-        self.doc = html.parse(BytesIO(response.content), parser)
+        self.define_xpath_functions(ns)
        # Parse once with the initial encoding, then let check_encoding()
        # parse again if the document itself declares another charset.
        self.build_doc(encoding)
        self.check_encoding()
    def define_xpath_functions(self, ns):
        """
        Register the custom XPath functions of this page on ``ns``.

        Currently a single function is defined, ``lower-case(arg)``, which
        returns its argument in lower case. When the argument is a sequence
        of strings (e.g. the result of ``@attr``), every item is lower-cased
        and the items are joined with a single space; an empty sequence
        gives an empty string.

        :param ns: mapping-like object in which functions are stored by name
        """
        def lower_case(context, args):
            # An XPath string argument (e.g. lower-case("ABC")) arrives as a
            # plain string; iterating over it would split it into characters
            # and return "a b c", so lower-case it as a whole instead.
            if hasattr(args, 'lower'):
                return args.lower()
            return ' '.join([s.lower() for s in args])

        ns['lower-case'] = lower_case
    def build_doc(self, encoding):
        """
        Parse the body of the response into ``self.doc``.

        The bytes of ``self.response.content`` are parsed with an lxml HTML
        parser configured for ``encoding``. Once parsing is done, the
        encoding which was used is stored in ``self.encoding``.

        :param encoding: name of the charset used to decode the document
        """
        from lxml import html

        html_parser = html.HTMLParser(encoding=encoding)
        stream = BytesIO(self.response.content)
        self.doc = html.parse(stream, parser=html_parser)
        self.encoding = encoding
    def check_encoding(self):
        """
        Parse the document again if it declares another charset.

        The charset is looked up in ``<meta http-equiv="content-type">`` and
        ``<meta charset>`` tags of ``<head>``; a ``<meta charset>`` tag takes
        precedence, and the last matching tag wins. When nothing is declared,
        the encoding used for the current parse is kept as candidate.

        Missing or unknown charsets, as well as ISO-8859-1 and its aliases,
        are replaced by ``windows-1252``.

        Labels are compared through their canonical codec name, so that a
        difference of case or alias only (``UTF-8`` / ``utf-8``,
        ``latin-1`` / ``iso-8859-1``) neither defeats the ISO-8859-1
        replacement nor triggers a useless second parse.
        """
        def codec_name(label):
            # Canonical name of the codec, or None if there is no usable one.
            if not label:
                return None
            try:
                return codecs.lookup(label).name
            except (LookupError, ValueError):
                return None

        encoding = self.encoding

        for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
            # meta http-equiv=content-type content=...
            _, params = parse_header(content)
            if 'charset' in params:
                encoding = params['charset'].strip("'\"")

        for charset in self.doc.xpath('//head/meta[@charset]/@charset'):
            # meta charset=...
            encoding = charset

        # Labels are case-insensitive and may carry stray whitespace.
        encoding = (encoding or '').strip().lower()
        declared = codec_name(encoding)

        # Pages announced as ISO-8859-1 (under any alias) are decoded as its
        # windows-1252 superset; it is also the fallback for missing or
        # unknown charsets.
        if declared is None or declared == 'iso8859-1':
            encoding = 'windows-1252'
            declared = codec_name(encoding)

        if declared != codec_name(self.encoding):
            self.build_doc(encoding)
    def get_form(self, xpath='//form', name=None, nr=None, submit=None):
        """