documentation

This commit is contained in:
Romain Bignon 2014-10-08 17:09:18 +02:00
commit 22c436bcb8
2 changed files with 109 additions and 14 deletions

View file

@ -89,9 +89,21 @@ class NextPage(Exception):
class Page(object): class Page(object):
""" """
Base page. Represents a page.
:param browser: browser used to go on the page
:type browser: :class:`weboob.browser.browsers.Browser`
:param response: response object
:type response: :class:`Response`
:param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`)
:type params: :class:`dict`
""" """
logged = False logged = False
"""
If True, the page is in a restrected area of the wesite. Useful with
:class:`LoginBrowser` and the :func:`need_login` decorator.
"""
def __init__(self, browser, response, params=None): def __init__(self, browser, response, params=None):
self.browser = browser self.browser = browser
@ -128,9 +140,15 @@ class Form(OrderedDict):
It is used as a dict with pre-filled values from HTML. You can set new It is used as a dict with pre-filled values from HTML. You can set new
values as strings by setting an item value. values as strings by setting an item value.
submit_el allows you to only consider one submit button (which is what It is recommended to not use this class by yourself, but call
browsers do). If set to None, it takes all of them, and if set to False, :meth:`HTMLPage.get_form`.
it takes none.
:param page: the page where the form is located
:type page: :class:`Page`
:param el: the form element on the page
:param submit_el: allows you to only consider one submit button (which is
what browsers do). If set to None, it takes all of them,
and if set to False, it takes none.
""" """
def __init__(self, page, el, submit_el=None): def __init__(self, page, el, submit_el=None):
@ -204,16 +222,35 @@ class Form(OrderedDict):
class CsvPage(Page): class CsvPage(Page):
DIALECT = 'excel' """
FMTPARAMS = {} Page which parses CSV files.
ENCODING = 'utf-8' """
NEWLINES_HACK = True
DIALECT = 'excel'
""" """
If True, will consider the first line as a header. Dialect given to the :mod:`csv` module.
This means the rows will be also available as dictionnaries.
""" """
FMTPARAMS = {}
"""
Parameters given to the :mod:`csv` module.
"""
ENCODING = 'utf-8'
"""
Encoding of the file.
"""
NEWLINES_HACK = True
"""
Convert all strange newlines to unix ones.
"""
HEADER = None HEADER = None
"""
If not None, will consider the line represented by this index as a header.
This means the rows will be also available as dictionaries.
"""
def __init__(self, browser, response, *args, **kwargs): def __init__(self, browser, response, *args, **kwargs):
super(CsvPage, self).__init__(browser, response, *args, **kwargs) super(CsvPage, self).__init__(browser, response, *args, **kwargs)
@ -228,6 +265,14 @@ class CsvPage(Page):
self.doc = self.parse(fp, encoding) self.doc = self.parse(fp, encoding)
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
"""
Method called by the constructor of :class:`CsvPage` to parse the document.
:param data: file stream
:type data: :class:`BytesIO`
:param encoding: if given, use it to decode cell strings
:type encoding: :class:`str`
"""
import csv import csv
reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS) reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS)
header = None header = None
@ -246,9 +291,12 @@ class CsvPage(Page):
for i, cell in enumerate(row): for i, cell in enumerate(row):
drow[header[i]] = cell drow[header[i]] = cell
drows.append(drow) drows.append(drow)
return drows if header is not None else row return drows if header is not None else rows
def decode_row(self, row, encoding): def decode_row(self, row, encoding):
"""
Method called by :meth:`CsvPage.parse` to decode a row using the given encoding.
"""
if encoding: if encoding:
return [unicode(cell, encoding) for cell in row] return [unicode(cell, encoding) for cell in row]
else: else:
@ -256,6 +304,10 @@ class CsvPage(Page):
class JsonPage(Page): class JsonPage(Page):
"""
Json Page.
"""
def __init__(self, browser, response, *args, **kwargs): def __init__(self, browser, response, *args, **kwargs):
super(JsonPage, self).__init__(browser, response, *args, **kwargs) super(JsonPage, self).__init__(browser, response, *args, **kwargs)
from weboob.tools.json import json from weboob.tools.json import json
@ -263,6 +315,10 @@ class JsonPage(Page):
class XMLPage(Page): class XMLPage(Page):
"""
XML Page.
"""
ENCODING = None ENCODING = None
""" """
Force a page encoding. Force a page encoding.
@ -277,6 +333,10 @@ class XMLPage(Page):
class RawPage(Page): class RawPage(Page):
"""
Raw page where the "doc" attribute is the content string.
"""
def __init__(self, browser, response, *args, **kwargs): def __init__(self, browser, response, *args, **kwargs):
super(RawPage, self).__init__(browser, response, *args, **kwargs) super(RawPage, self).__init__(browser, response, *args, **kwargs)
self.doc = response.content self.doc = response.content
@ -285,8 +345,22 @@ class RawPage(Page):
class HTMLPage(Page): class HTMLPage(Page):
""" """
HTML page. HTML page.
:param browser: browser used to go on the page
:type browser: :class:`weboob.browser.browsers.Browser`
:param response: response object
:type response: :class:`Response`
:param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`)
:type params: :class:`dict`
:param encoding: optional parameter to force the encoding of the page
:type encoding: :class:`basestring`
""" """
FORM_CLASS = Form FORM_CLASS = Form
"""
The class to instanciate when using :meth:`HTMLPage.get_form`. Default to :class:`Form`.
"""
ENCODING = None ENCODING = None
""" """
@ -294,27 +368,48 @@ class HTMLPage(Page):
It is recommended to use None for autodetection. It is recommended to use None for autodetection.
""" """
def __init__(self, browser, response, *args, **kwargs): def __init__(self, *args, **kwargs):
encoding = kwargs.pop('encoding', self.ENCODING or response.encoding) encoding = kwargs.pop('encoding', self.ENCODING)
super(HTMLPage, self).__init__(browser, response, *args, **kwargs) super(HTMLPage, self).__init__(*args, **kwargs)
self.doc = None
self.encoding = None
import lxml.html as html import lxml.html as html
ns = html.etree.FunctionNamespace(None) ns = html.etree.FunctionNamespace(None)
self.define_xpath_functions(ns) self.define_xpath_functions(ns)
self.build_doc(encoding) self.build_doc(encoding)
self.check_encoding() if encoding is None:
self.check_encoding()
def define_xpath_functions(self, ns): def define_xpath_functions(self, ns):
"""
Define XPath functions on the given lxml function namespace.
This method is called in constructor of :class:`HTMLPage` and can be
overloaded by children classes to add extra functions.
"""
ns['lower-case'] = lambda context, args: ' '.join([s.lower() for s in args]) ns['lower-case'] = lambda context, args: ' '.join([s.lower() for s in args])
def build_doc(self, encoding): def build_doc(self, encoding=None):
"""
Method to build the lxml document from response and given encoding.
"""
if encoding is None:
encoding = self.response.encoding
import lxml.html as html import lxml.html as html
parser = html.HTMLParser(encoding=encoding) parser = html.HTMLParser(encoding=encoding)
self.doc = html.parse(BytesIO(self.response.content), parser) self.doc = html.parse(BytesIO(self.response.content), parser)
self.encoding = encoding self.encoding = encoding
return self.doc
def check_encoding(self): def check_encoding(self):
"""
Check in the document the "http-equiv" and "charset" meta nodes. If the
specified charset isn't the one given by Content-Type HTTP header,
parse document again with the right encoding.
"""
encoding = self.encoding encoding = self.encoding
for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'): for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'):
# meta http-equiv=content-type content=... # meta http-equiv=content-type content=...

View file

@ -65,7 +65,7 @@ class WeboobSession(Session):
:class:`Session`. :class:`Session`.
:param request: :class:`Request` instance to prepare with this :param request: :class:`Request` instance to prepare with this
session's settings. session's settings.
""" """
cookies = request.cookies or {} cookies = request.cookies or {}