improve documentation of browser2
This commit is contained in:
parent
5f59c130b3
commit
07f6507096
4 changed files with 115 additions and 45 deletions
|
|
@ -120,10 +120,25 @@ class BaseBrowser(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PROFILE = Firefox()
|
PROFILE = Firefox()
|
||||||
|
"""
|
||||||
|
Default profile used by browser to navigate on websites.
|
||||||
|
"""
|
||||||
|
|
||||||
TIMEOUT = 10.0
|
TIMEOUT = 10.0
|
||||||
|
"""
|
||||||
|
Default timeout during requests.
|
||||||
|
"""
|
||||||
|
|
||||||
REFRESH_MAX = 0.0
|
REFRESH_MAX = 0.0
|
||||||
|
"""
|
||||||
|
When handling a Refresh header, the browser considers it only if the sleep
|
||||||
|
time is less than this value.
|
||||||
|
"""
|
||||||
|
|
||||||
VERIFY = True
|
VERIFY = True
|
||||||
|
"""
|
||||||
|
Check SSL certificates.
|
||||||
|
"""
|
||||||
|
|
||||||
PROXIES = None
|
PROXIES = None
|
||||||
|
|
||||||
|
|
@ -222,7 +237,7 @@ class BaseBrowser(object):
|
||||||
|
|
||||||
def location(self, url, **kwargs):
|
def location(self, url, **kwargs):
|
||||||
"""
|
"""
|
||||||
Like open() but also changes the current URL and response.
|
Like :meth:`open` but also changes the current URL and response.
|
||||||
This is the most common method to request web pages.
|
This is the most common method to request web pages.
|
||||||
|
|
||||||
Other than that, has the exact same behavior of open().
|
Other than that, has the exact same behavior of open().
|
||||||
|
|
@ -393,7 +408,10 @@ class BaseBrowser(object):
|
||||||
|
|
||||||
|
|
||||||
class UrlNotAllowed(Exception):
|
class UrlNotAllowed(Exception):
|
||||||
pass
|
"""
|
||||||
|
Raised by :class:`DomainBrowser` when `RESTRICT_URL` is set and trying to go
|
||||||
|
on an url not matching `BASEURL`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class DomainBrowser(BaseBrowser):
|
class DomainBrowser(BaseBrowser):
|
||||||
|
|
@ -410,6 +428,7 @@ class DomainBrowser(BaseBrowser):
|
||||||
See absurl().
|
See absurl().
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
RESTRICT_URL = False
|
||||||
"""
|
"""
|
||||||
URLs allowed to load.
|
URLs allowed to load.
|
||||||
This can be used to force SSL (if the BASEURL is SSL) or any other leakage.
|
This can be used to force SSL (if the BASEURL is SSL) or any other leakage.
|
||||||
|
|
@ -417,7 +436,6 @@ class DomainBrowser(BaseBrowser):
|
||||||
Set it to a list of allowed URLs if you have multiple allowed URLs.
|
Set it to a list of allowed URLs if you have multiple allowed URLs.
|
||||||
More complex behavior is possible by overloading url_allowed()
|
More complex behavior is possible by overloading url_allowed()
|
||||||
"""
|
"""
|
||||||
RESTRICT_URL = False
|
|
||||||
|
|
||||||
def url_allowed(self, url):
|
def url_allowed(self, url):
|
||||||
"""
|
"""
|
||||||
|
|
@ -458,6 +476,10 @@ class DomainBrowser(BaseBrowser):
|
||||||
return urljoin(base, uri)
|
return urljoin(base, uri)
|
||||||
|
|
||||||
def open(self, req, *args, **kwargs):
|
def open(self, req, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Like :meth:`BaseBrowser.open` but handles urls without domains, using
|
||||||
|
the :attr:`BASEURL` attribute.
|
||||||
|
"""
|
||||||
uri = req.url if isinstance(req, requests.Request) else req
|
uri = req.url if isinstance(req, requests.Request) else req
|
||||||
|
|
||||||
url = self.absurl(uri)
|
url = self.absurl(uri)
|
||||||
|
|
|
||||||
|
|
@ -133,17 +133,18 @@ class TableCell(_Filter):
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
class table(TableElement):
|
>>> from weboob.capabilities.bank import Transaction
|
||||||
head_xpath = '//table/thead/th'
|
>>> from .page import TableElement, ItemElement
|
||||||
item_xpath = '//table/tbody/tr'
|
>>> class table(TableElement):
|
||||||
|
... head_xpath = '//table/thead/th'
|
||||||
col_date = u'Date'
|
... item_xpath = '//table/tbody/tr'
|
||||||
col_label = [u'Name', u'Label']
|
... col_date = u'Date'
|
||||||
|
... col_label = [u'Name', u'Label']
|
||||||
class item(ItemElement):
|
... class item(ItemElement):
|
||||||
klass = Object
|
... klass = Transaction
|
||||||
obj_date = Date(TableCell('date'))
|
... obj_date = Date(TableCell('date'))
|
||||||
obj_label = CleanText(TableCell('label'))
|
... obj_label = CleanText(TableCell('label'))
|
||||||
|
...
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *names, **kwargs):
|
def __init__(self, *names, **kwargs):
|
||||||
|
|
|
||||||
|
|
@ -39,11 +39,15 @@ from .filters import _Filter, CleanText, AttributeNotFound, XPathNotFound
|
||||||
|
|
||||||
|
|
||||||
class UrlNotResolvable(Exception):
|
class UrlNotResolvable(Exception):
|
||||||
pass
|
"""
|
||||||
|
Raised when trying to locate on a URL instance whose url pattern is not resolvable as a real url.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class DataError(Exception):
|
class DataError(Exception):
|
||||||
pass
|
"""
|
||||||
|
Data returned from pages is incoherent.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class URL(object):
|
class URL(object):
|
||||||
|
|
@ -128,6 +132,12 @@ class URL(object):
|
||||||
return r.page or r
|
return r.page or r
|
||||||
|
|
||||||
def build(self, **kwargs):
|
def build(self, **kwargs):
|
||||||
|
"""
|
||||||
|
Build an url with the given arguments from URL's regexps.
|
||||||
|
|
||||||
|
:rtype: :class:`str`
|
||||||
|
:raises: :class:`UrlNotResolvable` if unable to resolve a correct url with the given arguments.
|
||||||
|
"""
|
||||||
patterns = []
|
patterns = []
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
patterns += normalize(url)
|
patterns += normalize(url)
|
||||||
|
|
@ -142,6 +152,9 @@ class URL(object):
|
||||||
raise UrlNotResolvable('Unable to resolve URL with %r. Available are %s' % (kwargs, ', '.join([pattern for pattern, _ in patterns])))
|
raise UrlNotResolvable('Unable to resolve URL with %r. Available are %s' % (kwargs, ', '.join([pattern for pattern, _ in patterns])))
|
||||||
|
|
||||||
def match(self, url, base=None):
|
def match(self, url, base=None):
|
||||||
|
"""
|
||||||
|
Check if the given url matches this object.
|
||||||
|
"""
|
||||||
if base is None:
|
if base is None:
|
||||||
assert self.browser is not None
|
assert self.browser is not None
|
||||||
base = self.browser.BASEURL
|
base = self.browser.BASEURL
|
||||||
|
|
@ -165,6 +178,9 @@ class URL(object):
|
||||||
return self.klass(self.browser, response, m.groupdict())
|
return self.klass(self.browser, response, m.groupdict())
|
||||||
|
|
||||||
def id2url(self, func):
|
def id2url(self, func):
|
||||||
|
r"""
|
||||||
|
Helper decorator to get an URL if the given first parameter is an ID.
|
||||||
|
"""
|
||||||
def inner(browser, id_or_url, *args, **kwargs):
|
def inner(browser, id_or_url, *args, **kwargs):
|
||||||
if re.match('^https?://.*', id_or_url):
|
if re.match('^https?://.*', id_or_url):
|
||||||
if not self.match(id_or_url, browser.BASEURL):
|
if not self.match(id_or_url, browser.BASEURL):
|
||||||
|
|
@ -203,11 +219,17 @@ class PagesBrowser(DomainBrowser):
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
class MyBrowser(PagesBrowser):
|
>>> class HomePage(BasePage):
|
||||||
BASEURL = 'http://example.org'
|
... pass
|
||||||
|
...
|
||||||
home = URL('/(index\.html)?', HomePage)
|
>>> class ListPage(BasePage):
|
||||||
list = URL('/list\.html', ListPage)
|
... pass
|
||||||
|
...
|
||||||
|
>>> class MyBrowser(PagesBrowser):
|
||||||
|
... BASEURL = 'http://example.org'
|
||||||
|
... home = URL('/(index\.html)?', HomePage)
|
||||||
|
... list = URL('/list\.html', ListPage)
|
||||||
|
...
|
||||||
|
|
||||||
You can then use URL instances to go on pages.
|
You can then use URL instances to go on pages.
|
||||||
"""
|
"""
|
||||||
|
|
@ -232,6 +254,12 @@ class PagesBrowser(DomainBrowser):
|
||||||
url.browser = self
|
url.browser = self
|
||||||
|
|
||||||
def open(self, *args, **kwargs):
|
def open(self, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Same method as
|
||||||
|
:meth:`weboob.tools.browser2.browser.DomainBrowser.open`, but the
|
||||||
|
response contains an attribute `page` if the url matches any
|
||||||
|
:class:`URL` object.
|
||||||
|
"""
|
||||||
response = super(PagesBrowser, self).open(*args, **kwargs)
|
response = super(PagesBrowser, self).open(*args, **kwargs)
|
||||||
response.page = None
|
response.page = None
|
||||||
|
|
||||||
|
|
@ -248,6 +276,12 @@ class PagesBrowser(DomainBrowser):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def location(self, *args, **kwargs):
|
def location(self, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Same method as
|
||||||
|
:meth:`weboob.tools.browser2.browser.BaseBrowser.location`, but if the
|
||||||
|
url matches any :class:`URL` object, an attribute `page` is added to
|
||||||
|
response, and the attribute :attr:`PagesBrowser.page` is set.
|
||||||
|
"""
|
||||||
if self.page is not None:
|
if self.page is not None:
|
||||||
# Call leave hook.
|
# Call leave hook.
|
||||||
self.page.on_leave()
|
self.page.on_leave()
|
||||||
|
|
@ -269,10 +303,10 @@ class PagesBrowser(DomainBrowser):
|
||||||
r"""
|
r"""
|
||||||
This helper function can be used to handle pagination pages easily.
|
This helper function can be used to handle pagination pages easily.
|
||||||
|
|
||||||
When the called function raises an exception `NextPage`, it goes on the
|
When the called function raises an exception :class:`NextPage`, it goes
|
||||||
wanted page and recall the function.
|
on the wanted page and calls the function again.
|
||||||
|
|
||||||
NextPage constructor can take an url or a Request object.
|
:class:`NextPage` constructor can take an url or a Request object.
|
||||||
|
|
||||||
>>> class Page(HTMLPage):
|
>>> class Page(HTMLPage):
|
||||||
... def iter_values(self):
|
... def iter_values(self):
|
||||||
|
|
@ -303,10 +337,10 @@ def pagination(func):
|
||||||
r"""
|
r"""
|
||||||
This helper decorator can be used to handle pagination pages easily.
|
This helper decorator can be used to handle pagination pages easily.
|
||||||
|
|
||||||
When the called function raises an exception `NextPage`, it goes on the
|
When the called function raises an exception :class:`NextPage`, it goes on
|
||||||
wanted page and recall the function.
|
the wanted page and calls the function again.
|
||||||
|
|
||||||
NextPage constructor can take an url or a Request object.
|
:class:`NextPage` constructor can take an url or a Request object.
|
||||||
|
|
||||||
>>> class Page(HTMLPage):
|
>>> class Page(HTMLPage):
|
||||||
... @pagination
|
... @pagination
|
||||||
|
|
@ -325,8 +359,7 @@ def pagination(func):
|
||||||
>>> list(b.page.iter_values())
|
>>> list(b.page.iter_values())
|
||||||
['One', 'Two', 'Three', 'Four']
|
['One', 'Two', 'Three', 'Four']
|
||||||
"""
|
"""
|
||||||
def inner(self, *args, **kwargs):
|
def inner(page, *args, **kwargs):
|
||||||
page = self
|
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
for r in func(page, *args, **kwargs):
|
for r in func(page, *args, **kwargs):
|
||||||
|
|
@ -344,7 +377,7 @@ class NextPage(Exception):
|
||||||
Exception used for example in a BasePage to tell PagesBrowser.pagination to
|
Exception used for example in a BasePage to tell PagesBrowser.pagination to
|
||||||
go on the next page.
|
go on the next page.
|
||||||
|
|
||||||
See PagesBrowser.pagination.
|
See :meth:`PagesBrowser.pagination` or decorator :func:`pagination`.
|
||||||
"""
|
"""
|
||||||
def __init__(self, request):
|
def __init__(self, request):
|
||||||
super(NextPage, self).__init__()
|
super(NextPage, self).__init__()
|
||||||
|
|
@ -395,13 +428,19 @@ class BasePage(object):
|
||||||
self.params = params
|
self.params = params
|
||||||
|
|
||||||
def on_load(self):
|
def on_load(self):
|
||||||
pass
|
"""
|
||||||
|
Event called when browser loads this page.
|
||||||
|
"""
|
||||||
|
|
||||||
def on_leave(self):
|
def on_leave(self):
|
||||||
pass
|
"""
|
||||||
|
Event called when browser leaves this page.
|
||||||
|
"""
|
||||||
|
|
||||||
class FormNotFound(Exception):
|
class FormNotFound(Exception):
|
||||||
pass
|
"""
|
||||||
|
Raised when :meth:`HTMLPage.get_form` can't find a form.
|
||||||
|
"""
|
||||||
|
|
||||||
class Form(OrderedDict):
|
class Form(OrderedDict):
|
||||||
"""
|
"""
|
||||||
|
|
@ -489,13 +528,19 @@ class HTMLPage(BasePage):
|
||||||
parser = html.HTMLParser(encoding=response.encoding)
|
parser = html.HTMLParser(encoding=response.encoding)
|
||||||
self.doc = html.parse(StringIO(response.content), parser)
|
self.doc = html.parse(StringIO(response.content), parser)
|
||||||
|
|
||||||
def get_form(self, xpath=None, name=None, nr=None):
|
def get_form(self, xpath='//form', name=None, nr=None):
|
||||||
"""
|
"""
|
||||||
Get a Form object from a xpath selector.
|
Get a :class:`Form` object from a selector.
|
||||||
"""
|
|
||||||
if xpath is None:
|
|
||||||
xpath = '//form'
|
|
||||||
|
|
||||||
|
:param xpath: xpath string to select forms
|
||||||
|
:type xpath: :class:`str`
|
||||||
|
:param name: if supplied, select a form with the given name
|
||||||
|
:type name: :class:`str`
|
||||||
|
:param nr: if supplied, take the n-th selected form
|
||||||
|
:type nr: :class:`int`
|
||||||
|
:rtype: :class:`Form`
|
||||||
|
:raises: :class:`FormNotFound` if no form is found
|
||||||
|
"""
|
||||||
i = 0
|
i = 0
|
||||||
for el in self.doc.xpath(xpath):
|
for el in self.doc.xpath(xpath):
|
||||||
if name is not None and el.attrib.get('name', '') != name:
|
if name is not None and el.attrib.get('name', '') != name:
|
||||||
|
|
@ -617,12 +662,14 @@ class ListElement(AbstractElement):
|
||||||
|
|
||||||
|
|
||||||
class SkipItem(Exception):
|
class SkipItem(Exception):
|
||||||
pass
|
"""
|
||||||
|
Raise this exception in an :class:`ItemElement` subclass to skip an item.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class _ItemElementMeta(type):
|
class _ItemElementMeta(type):
|
||||||
"""
|
"""
|
||||||
Private meta-class used to keep order of obj_* attributes in ItemElement.
|
Private meta-class used to keep order of obj_* attributes in :class:`ItemElement`.
|
||||||
"""
|
"""
|
||||||
def __new__(mcs, name, bases, attrs):
|
def __new__(mcs, name, bases, attrs):
|
||||||
_attrs = []
|
_attrs = []
|
||||||
|
|
|
||||||
|
|
@ -102,11 +102,11 @@ class FrenchTransaction(Transaction):
|
||||||
PATTERN class attribute) with a list containing tuples of regexp
|
PATTERN class attribute) with a list containing tuples of regexp
|
||||||
and the associated type, for example::
|
and the associated type, for example::
|
||||||
|
|
||||||
PATTERNS = [(re.compile('^VIR(EMENT)? (?P<text>.*)'), FrenchTransaction.TYPE_TRANSFER),
|
>>> PATTERNS = [(re.compile('^VIR(EMENT)? (?P<text>.*)'), FrenchTransaction.TYPE_TRANSFER),
|
||||||
(re.compile('^PRLV (?P<text>.*)'), FrenchTransaction.TYPE_ORDER),
|
... (re.compile('^PRLV (?P<text>.*)'), FrenchTransaction.TYPE_ORDER),
|
||||||
(re.compile('^(?P<text>.*) CARTE \d+ PAIEMENT CB (?P<dd>\d{2})(?P<mm>\d{2}) ?(.*)$'),
|
... (re.compile('^(?P<text>.*) CARTE \d+ PAIEMENT CB (?P<dd>\d{2})(?P<mm>\d{2}) ?(.*)$'),
|
||||||
FrenchTransaction.TYPE_CARD)
|
... FrenchTransaction.TYPE_CARD)
|
||||||
]
|
... ]
|
||||||
|
|
||||||
In regexps, you can define these patterns:
|
In regexps, you can define these patterns:
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue