new api for content, encoding, and document building
This commit is contained in:
parent
9a3afd2dc1
commit
237036ce3e
1 changed files with 101 additions and 52 deletions
|
|
@ -92,12 +92,28 @@ class Page(object):
|
||||||
"""
|
"""
|
||||||
Represents a page.
|
Represents a page.
|
||||||
|
|
||||||
|
Encoding can be forced by setting the :attr:`ENCODING` class-wide
|
||||||
|
attribute, or by passing an `encoding` keyword argument, which overrides
|
||||||
|
:attr:`ENCODING`. Finally, it can be manually changed by assigning a new
|
||||||
|
value to :attr:`encoding` instance attribute. A unicode version of the
|
||||||
|
response content is accessible in :attr:`text`, decoded with specified
|
||||||
|
:attr:`encoding`.
|
||||||
|
|
||||||
:param browser: browser used to go on the page
|
:param browser: browser used to go on the page
|
||||||
:type browser: :class:`weboob.browser.browsers.Browser`
|
:type browser: :class:`weboob.browser.browsers.Browser`
|
||||||
:param response: response object
|
:param response: response object
|
||||||
:type response: :class:`Response`
|
:type response: :class:`Response`
|
||||||
:param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`)
|
:param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`)
|
||||||
:type params: :class:`dict`
|
:type params: :class:`dict`
|
||||||
|
:param encoding: optional parameter to force the encoding of the page, overrides :attr:`ENCODING`
|
||||||
|
:type encoding: :class:`basestring`
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
ENCODING = None
|
||||||
|
"""
|
||||||
|
Force a page encoding.
|
||||||
|
It is recommended to use None for autodetection.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logged = False
|
logged = False
|
||||||
|
|
@ -106,13 +122,59 @@ class Page(object):
|
||||||
:class:`LoginBrowser` and the :func:`need_login` decorator.
|
:class:`LoginBrowser` and the :func:`need_login` decorator.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, browser, response, params=None):
|
def __init__(self, browser, response, params=None, encoding=None):
|
||||||
self.browser = browser
|
self.browser = browser
|
||||||
self.logger = getLogger(self.__class__.__name__.lower(), browser.logger)
|
self.logger = getLogger(self.__class__.__name__.lower(), browser.logger)
|
||||||
self.response = response
|
self.response = response
|
||||||
self.url = self.response.url
|
self.url = self.response.url
|
||||||
self.params = params
|
self.params = params
|
||||||
|
|
||||||
|
# Setup encoding and build document
|
||||||
|
self.forced_encoding = encoding or self.ENCODING
|
||||||
|
if self.forced_encoding:
|
||||||
|
self.response.encoding = self.forced_encoding
|
||||||
|
self.doc = self.build_doc(self.data)
|
||||||
|
|
||||||
|
# Last chance to change encoding, according to :meth:`detect_encoding`,
|
||||||
|
# which can be used to detect a document-level encoding declaration
|
||||||
|
encoding = self.detect_encoding()
|
||||||
|
if encoding != self.encoding:
|
||||||
|
self.response.encoding = encoding
|
||||||
|
self.doc = self.build_doc(self.data)
|
||||||
|
|
||||||
|
# Encoding issues are delegated to Response instance, implemented by
|
||||||
|
# requests module.
|
||||||
|
|
||||||
|
@property
|
||||||
|
def encoding(self):
|
||||||
|
return self.response.encoding
|
||||||
|
|
||||||
|
@encoding.setter
|
||||||
|
def encoding(self, value):
|
||||||
|
self.forced_encoding = True
|
||||||
|
self.response.encoding = value
|
||||||
|
|
||||||
|
@property
|
||||||
|
def content(self):
|
||||||
|
"""
|
||||||
|
Raw content from response.
|
||||||
|
"""
|
||||||
|
return self.response.content
|
||||||
|
|
||||||
|
@property
|
||||||
|
def text(self):
|
||||||
|
"""
|
||||||
|
Content of the response, in unicode, decoded with :attr:`encoding`.
|
||||||
|
"""
|
||||||
|
return self.response.text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def data(self):
|
||||||
|
"""
|
||||||
|
Data passed to :meth:`build_doc`.
|
||||||
|
"""
|
||||||
|
return self.content
|
||||||
|
|
||||||
def on_load(self):
|
def on_load(self):
|
||||||
"""
|
"""
|
||||||
Event called when browser loads this page.
|
Event called when browser loads this page.
|
||||||
|
|
@ -123,6 +185,22 @@ class Page(object):
|
||||||
Event called when browser leaves this page.
|
Event called when browser leaves this page.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def build_doc(self, content):
|
||||||
|
"""
|
||||||
|
Abstract method to be implemented by subclasses to build structured
|
||||||
|
data (HTML, Json, CSV...) from :attr:`data` property. It also can be
|
||||||
|
overridden in modules pages to preprocess or postprocess data. It must
|
||||||
|
return an object -- that will be assigned to :attr:`doc`.
|
||||||
|
"""
|
||||||
|
raise NotImplemented
|
||||||
|
|
||||||
|
def detect_encoding(self):
|
||||||
|
"""
|
||||||
|
Override this method to implement detection of document-level encoding
|
||||||
|
declaration, if any (eg. html5's <meta charset="some-charset">).
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class FormNotFound(Exception):
|
class FormNotFound(Exception):
|
||||||
"""
|
"""
|
||||||
|
|
@ -266,17 +344,14 @@ class CsvPage(Page):
|
||||||
This means the rows will be also available as dictionaries.
|
This means the rows will be also available as dictionaries.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, browser, response, *args, **kwargs):
|
def build_doc(self, content):
|
||||||
super(CsvPage, self).__init__(browser, response, *args, **kwargs)
|
encoding = self.encoding
|
||||||
content = response.content
|
|
||||||
encoding = self.ENCODING
|
|
||||||
if encoding == 'utf-16le':
|
if encoding == 'utf-16le':
|
||||||
content = content.decode('utf-16le')[1:].encode('utf-8')
|
content = content.decode('utf-16le')[1:].encode('utf-8')
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
if self.NEWLINES_HACK:
|
if self.NEWLINES_HACK:
|
||||||
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
||||||
fp = BytesIO(content)
|
return self.parse(BytesIO(content), encoding)
|
||||||
self.doc = self.parse(fp, encoding)
|
|
||||||
|
|
||||||
def parse(self, data, encoding=None):
|
def parse(self, data, encoding=None):
|
||||||
"""
|
"""
|
||||||
|
|
@ -322,10 +397,13 @@ class JsonPage(Page):
|
||||||
Json Page.
|
Json Page.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, browser, response, *args, **kwargs):
|
@property
|
||||||
super(JsonPage, self).__init__(browser, response, *args, **kwargs)
|
def data(self):
|
||||||
|
return self.response.text
|
||||||
|
|
||||||
|
def build_doc(self, text):
|
||||||
from weboob.tools.json import json
|
from weboob.tools.json import json
|
||||||
self.doc = json.loads(response.text)
|
return json.loads(text)
|
||||||
|
|
||||||
|
|
||||||
class XMLPage(Page):
|
class XMLPage(Page):
|
||||||
|
|
@ -333,17 +411,10 @@ class XMLPage(Page):
|
||||||
XML Page.
|
XML Page.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ENCODING = None
|
def build_doc(self, content):
|
||||||
"""
|
|
||||||
Force a page encoding.
|
|
||||||
It is recommended to use None for autodetection.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, browser, response, *args, **kwargs):
|
|
||||||
super(XMLPage, self).__init__(browser, response, *args, **kwargs)
|
|
||||||
import lxml.etree as etree
|
import lxml.etree as etree
|
||||||
parser = etree.XMLParser(encoding=self.ENCODING or response.encoding)
|
parser = etree.XMLParser(encoding=self.encoding)
|
||||||
self.doc = etree.parse(BytesIO(response.content), parser)
|
return etree.parse(BytesIO(content), parser)
|
||||||
|
|
||||||
|
|
||||||
class RawPage(Page):
|
class RawPage(Page):
|
||||||
|
|
@ -351,9 +422,8 @@ class RawPage(Page):
|
||||||
Raw page where the "doc" attribute is the content string.
|
Raw page where the "doc" attribute is the content string.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, browser, response, *args, **kwargs):
|
def build_doc(self, content):
|
||||||
super(RawPage, self).__init__(browser, response, *args, **kwargs)
|
return content
|
||||||
self.doc = response.content
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLPage(Page):
|
class HTMLPage(Page):
|
||||||
|
|
@ -376,25 +446,12 @@ class HTMLPage(Page):
|
||||||
The class to instantiate when using :meth:`HTMLPage.get_form`. Defaults to :class:`Form`.
|
The class to instantiate when using :meth:`HTMLPage.get_form`. Defaults to :class:`Form`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ENCODING = None
|
|
||||||
"""
|
|
||||||
Force a page encoding.
|
|
||||||
It is recommended to use None for autodetection.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
encoding = kwargs.pop('encoding', self.ENCODING)
|
|
||||||
|
|
||||||
super(HTMLPage, self).__init__(*args, **kwargs)
|
|
||||||
self.doc = None
|
|
||||||
self.encoding = None
|
|
||||||
|
|
||||||
import lxml.html as html
|
import lxml.html as html
|
||||||
ns = html.etree.FunctionNamespace(None)
|
ns = html.etree.FunctionNamespace(None)
|
||||||
self.define_xpath_functions(ns)
|
self.define_xpath_functions(ns)
|
||||||
self.build_doc(encoding)
|
|
||||||
if encoding is None:
|
super(HTMLPage, self).__init__(*args, **kwargs)
|
||||||
self.check_encoding()
|
|
||||||
|
|
||||||
def define_xpath_functions(self, ns):
|
def define_xpath_functions(self, ns):
|
||||||
"""
|
"""
|
||||||
|
|
@ -435,24 +492,17 @@ class HTMLPage(Page):
|
||||||
return bool(context.context_node.xpath(xpath))
|
return bool(context.context_node.xpath(xpath))
|
||||||
ns['has-class'] = has_class
|
ns['has-class'] = has_class
|
||||||
|
|
||||||
def build_doc(self, encoding=None):
|
def build_doc(self, content):
|
||||||
"""
|
"""
|
||||||
Method to build the lxml document from response and given encoding.
|
Method to build the lxml document from response and given encoding.
|
||||||
"""
|
"""
|
||||||
if encoding is None:
|
|
||||||
encoding = self.response.encoding
|
|
||||||
|
|
||||||
import lxml.html as html
|
import lxml.html as html
|
||||||
parser = html.HTMLParser(encoding=encoding)
|
parser = html.HTMLParser(encoding=self.encoding)
|
||||||
self.doc = html.parse(BytesIO(self.response.content), parser)
|
return html.parse(BytesIO(content), parser)
|
||||||
self.encoding = encoding
|
|
||||||
return self.doc
|
|
||||||
|
|
||||||
def check_encoding(self):
|
def detect_encoding(self):
|
||||||
"""
|
"""
|
||||||
Check in the document the "http-equiv" and "charset" meta nodes. If the
|
Look for encoding in the document "http-equiv" and "charset" meta nodes.
|
||||||
specified charset isn't the one given by Content-Type HTTP header,
|
|
||||||
parse document again with the right encoding.
|
|
||||||
"""
|
"""
|
||||||
encoding = self.encoding
|
encoding = self.encoding
|
||||||
for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
|
for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
|
||||||
|
|
@ -472,8 +522,7 @@ class HTMLPage(Page):
|
||||||
except LookupError:
|
except LookupError:
|
||||||
encoding = 'windows-1252'
|
encoding = 'windows-1252'
|
||||||
|
|
||||||
if encoding != self.encoding:
|
return encoding
|
||||||
self.build_doc(encoding)
|
|
||||||
|
|
||||||
def get_form(self, xpath='//form', name=None, nr=None, submit=None):
|
def get_form(self, xpath='//form', name=None, nr=None, submit=None):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue