From 22c436bcb81542f229d30d0f0fb1bccd0008f74a Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Wed, 8 Oct 2014 17:09:18 +0200 Subject: [PATCH] documentation --- weboob/browser/pages.py | 127 ++++++++++++++++++++++++++++++++----- weboob/browser/sessions.py | 2 +- 2 files changed, 112 insertions(+), 17 deletions(-) diff --git a/weboob/browser/pages.py b/weboob/browser/pages.py index b568347c..4cf0436d 100644 --- a/weboob/browser/pages.py +++ b/weboob/browser/pages.py @@ -89,9 +89,21 @@ class NextPage(Exception): class Page(object): """ - Base page. + Represents a page. + + :param browser: browser used to go on the page + :type browser: :class:`weboob.browser.browsers.Browser` + :param response: response object + :type response: :class:`Response` + :param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`) + :type params: :class:`dict` """ + logged = False + """ + If True, the page is in a restricted area of the website. Useful with + :class:`LoginBrowser` and the :func:`need_login` decorator. + """ def __init__(self, browser, response, params=None): self.browser = browser @@ -128,9 +140,15 @@ class Form(OrderedDict): It is used as a dict with pre-filled values from HTML. You can set new values as strings by setting an item value. - submit_el allows you to only consider one submit button (which is what - browsers do). If set to None, it takes all of them, and if set to False, - it takes none. + It is recommended to not use this class by yourself, but call + :meth:`HTMLPage.get_form`. + + :param page: the page where the form is located + :type page: :class:`Page` + :param el: the form element on the page + :param submit_el: allows you to only consider one submit button (which is + what browsers do). If set to None, it takes all of them, + and if set to False, it takes none. 
""" def __init__(self, page, el, submit_el=None): @@ -204,16 +222,35 @@ class Form(OrderedDict): class CsvPage(Page): - DIALECT = 'excel' - FMTPARAMS = {} - ENCODING = 'utf-8' - NEWLINES_HACK = True + """ + Page which parses CSV files. + """ + DIALECT = 'excel' """ - If True, will consider the first line as a header. - This means the rows will be also available as dictionnaries. + Dialect given to the :mod:`csv` module. """ + + FMTPARAMS = {} + """ + Parameters given to the :mod:`csv` module. + """ + + ENCODING = 'utf-8' + """ + Encoding of the file. + """ + + NEWLINES_HACK = True + """ + Convert all strange newlines to unix ones. + """ + HEADER = None + """ + If not None, will consider the line represented by this index as a header. + This means the rows will be also available as dictionaries. + """ def __init__(self, browser, response, *args, **kwargs): super(CsvPage, self).__init__(browser, response, *args, **kwargs) @@ -228,6 +265,14 @@ class CsvPage(Page): self.doc = self.parse(fp, encoding) def parse(self, data, encoding=None): + """ + Method called by the constructor of :class:`CsvPage` to parse the document. + + :param data: file stream + :type data: :class:`BytesIO` + :param encoding: if given, use it to decode cell strings + :type encoding: :class:`str` + """ import csv reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS) header = None @@ -246,9 +291,12 @@ class CsvPage(Page): for i, cell in enumerate(row): drow[header[i]] = cell drows.append(drow) - return drows if header is not None else row + return drows if header is not None else rows def decode_row(self, row, encoding): + """ + Method called by :meth:`CsvPage.parse` to decode a row using the given encoding. + """ if encoding: return [unicode(cell, encoding) for cell in row] else: @@ -256,6 +304,10 @@ class CsvPage(Page): class JsonPage(Page): + """ + Json Page. 
+ """ + def __init__(self, browser, response, *args, **kwargs): super(JsonPage, self).__init__(browser, response, *args, **kwargs) from weboob.tools.json import json @@ -263,6 +315,10 @@ class JsonPage(Page): class XMLPage(Page): + """ + XML Page. + """ + ENCODING = None """ Force a page encoding. @@ -277,6 +333,10 @@ class XMLPage(Page): class RawPage(Page): + """ + Raw page where the "doc" attribute is the content string. + """ + def __init__(self, browser, response, *args, **kwargs): super(RawPage, self).__init__(browser, response, *args, **kwargs) self.doc = response.content @@ -285,8 +345,22 @@ class RawPage(Page): class HTMLPage(Page): """ HTML page. + + :param browser: browser used to go on the page + :type browser: :class:`weboob.browser.browsers.Browser` + :param response: response object + :type response: :class:`Response` + :param params: optional dictionary containing parameters given to the page (see :class:`weboob.browser.url.URL`) + :type params: :class:`dict` + :param encoding: optional parameter to force the encoding of the page + :type encoding: :class:`basestring` + """ + FORM_CLASS = Form + """ + The class to instantiate when using :meth:`HTMLPage.get_form`. Defaults to :class:`Form`. + """ ENCODING = None """ @@ -294,27 +368,48 @@ class HTMLPage(Page): It is recommended to use None for autodetection. 
""" - def __init__(self, browser, response, *args, **kwargs): - encoding = kwargs.pop('encoding', self.ENCODING or response.encoding) + def __init__(self, *args, **kwargs): + encoding = kwargs.pop('encoding', self.ENCODING) - super(HTMLPage, self).__init__(browser, response, *args, **kwargs) + super(HTMLPage, self).__init__(*args, **kwargs) + self.doc = None + self.encoding = None import lxml.html as html ns = html.etree.FunctionNamespace(None) self.define_xpath_functions(ns) self.build_doc(encoding) - self.check_encoding() + if encoding is None: + self.check_encoding() def define_xpath_functions(self, ns): + """ + Define XPath functions on the given lxml function namespace. + + This method is called in constructor of :class:`HTMLPage` and can be + overloaded by children classes to add extra functions. + """ ns['lower-case'] = lambda context, args: ' '.join([s.lower() for s in args]) - def build_doc(self, encoding): + def build_doc(self, encoding=None): + """ + Method to build the lxml document from response and given encoding. + """ + if encoding is None: + encoding = self.response.encoding + import lxml.html as html parser = html.HTMLParser(encoding=encoding) self.doc = html.parse(BytesIO(self.response.content), parser) self.encoding = encoding + return self.doc def check_encoding(self): + """ + Check in the document the "http-equiv" and "charset" meta nodes. If the + specified charset isn't the one given by Content-Type HTTP header, + parse document again with the right encoding. + """ encoding = self.encoding for content in self.doc.xpath('//head/meta[lower-case(@http-equiv)="content-type"]/@content'): # meta http-equiv=content-type content=... diff --git a/weboob/browser/sessions.py b/weboob/browser/sessions.py index 8a0e40cb..f24ba267 100644 --- a/weboob/browser/sessions.py +++ b/weboob/browser/sessions.py @@ -65,7 +65,7 @@ class WeboobSession(Session): :class:`Session`. :param request: :class:`Request` instance to prepare with this - session's settings. 
+ session's settings. """ cookies = request.cookies or {}