HTMLPage checks the charset declared inside the document and parses the document again if it is not the same as the one given by the Content-Type HTTP header
This commit is contained in:
parent
dea6f1cf76
commit
65ece349db
1 changed files with 40 additions and 3 deletions
|
|
@ -21,6 +21,8 @@ from __future__ import absolute_import
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
import codecs
|
||||||
|
from cgi import parse_header
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
@ -293,11 +295,46 @@ class HTMLPage(Page):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, browser, response, *args, **kwargs):
|
def __init__(self, browser, response, *args, **kwargs):
|
||||||
|
encoding = kwargs.pop('encoding', self.ENCODING or response.encoding)
|
||||||
|
|
||||||
super(HTMLPage, self).__init__(browser, response, *args, **kwargs)
|
super(HTMLPage, self).__init__(browser, response, *args, **kwargs)
|
||||||
self.encoding = self.ENCODING or response.encoding
|
|
||||||
import lxml.html as html
|
import lxml.html as html
|
||||||
parser = html.HTMLParser(encoding=self.encoding)
|
ns = html.etree.FunctionNamespace(None)
|
||||||
self.doc = html.parse(BytesIO(response.content), parser)
|
self.define_xpath_functions(ns)
|
||||||
|
self.build_doc(encoding)
|
||||||
|
self.check_encoding()
|
||||||
|
|
||||||
|
def define_xpath_functions(self, ns):
    """
    Register the custom XPath functions of this page in a function namespace.

    :param ns: mapping-like function namespace; the ``lower-case`` entry
               is set on it.
    """
    def lower_case(context, args):
        # Lower-case every item of the argument and join them with spaces.
        return ' '.join(item.lower() for item in args)

    ns['lower-case'] = lower_case
|
||||||
|
|
||||||
|
def build_doc(self, encoding):
    """
    Parse the body of the response as HTML using the given encoding.

    The resulting tree is stored in ``self.doc`` and, once parsing has
    succeeded, the encoding is recorded in ``self.encoding``.

    :param encoding: encoding name handed to the lxml HTML parser
    """
    from lxml import html as lxml_html

    html_parser = lxml_html.HTMLParser(encoding=encoding)
    body = BytesIO(self.response.content)
    self.doc = lxml_html.parse(body, html_parser)
    self.encoding = encoding
|
||||||
|
|
||||||
|
def check_encoding(self):
    """
    Parse the document again if it declares another charset than the one used.

    The candidate encoding starts as ``self.encoding`` and is overridden,
    in this order, by:

    * ``<meta http-equiv="content-type" content="...; charset=...">``
    * ``<meta charset="...">``

    ``iso-8859-1`` and a missing encoding are replaced by ``windows-1252``,
    as is any name unknown to :mod:`codecs`. When the resulting codec is
    not the one the document was parsed with, ``self.build_doc`` is called
    with the new encoding.

    Charset labels are case-insensitive, so every label is lower-cased
    before being compared, and the final comparison is done on the codec
    resolved by :func:`codecs.lookup` rather than on the raw spelling.
    This prevents ``ISO-8859-1`` from escaping the ``windows-1252``
    substitution and prevents ``UTF-8`` vs ``utf-8`` from triggering a
    useless second parse.
    """
    def normalize(label):
        # Keep None/empty values as they are: they are handled below.
        return label.strip().lower() if label else label

    encoding = normalize(self.encoding)
    for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
        # meta http-equiv=content-type content=...
        _, params = parse_header(content)
        if 'charset' in params:
            encoding = normalize(params['charset'].strip("'\""))

    for charset in self.doc.xpath('//head/meta[@charset]/@charset'):
        # meta charset=...
        encoding = normalize(charset)

    if encoding == 'iso-8859-1' or not encoding:
        encoding = 'windows-1252'
    try:
        wanted_codec = codecs.lookup(encoding).name
    except LookupError:
        # Unknown charset name declared by the page: use the fallback.
        encoding = 'windows-1252'
        wanted_codec = codecs.lookup(encoding).name

    # Resolve the encoding currently in use to its canonical codec name so
    # that aliases and case differences do not count as a change.
    try:
        current_codec = codecs.lookup(self.encoding).name if self.encoding else None
    except LookupError:
        current_codec = None

    if wanted_codec != current_codec:
        self.build_doc(encoding)
|
||||||
|
|
||||||
def get_form(self, xpath='//form', name=None, nr=None, submit=None):
|
def get_form(self, xpath='//form', name=None, nr=None, submit=None):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue