From 8ea92461dc9eca3b8791745e25ab416e28eaa90a Mon Sep 17 00:00:00 2001
From: Romain Bignon
Date: Sun, 9 Mar 2014 15:40:25 +0100
Subject: [PATCH] add PagesBrowser, ListElement, ItemElement, TableElement, and filters

---
 weboob/tools/browser2/__init__.py |   7 +
 weboob/tools/browser2/filters.py  | 149 +++++++
 weboob/tools/browser2/page.py     | 649 ++++++++++++++++++++++++++++++
 3 files changed, 805 insertions(+)
 create mode 100644 weboob/tools/browser2/filters.py
 create mode 100644 weboob/tools/browser2/page.py

diff --git a/weboob/tools/browser2/__init__.py b/weboob/tools/browser2/__init__.py
index b51e0587..9798d3ab 100644
--- a/weboob/tools/browser2/__init__.py
+++ b/weboob/tools/browser2/__init__.py
@@ -16,3 +16,10 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+from .browser import BaseBrowser, DomainBrowser, Wget, Firefox, UrlNotAllowed
+from .page import PagesBrowser, BasePage, URL, HTMLPage, LoginBrowser, need_login
+
+
+__all__ = ['BaseBrowser', 'DomainBrowser', 'Wget', 'Firefox', 'UrlNotAllowed',
+           'PagesBrowser', 'BasePage', 'URL', 'HTMLPage', 'LoginBrowser', 'need_login']
diff --git a/weboob/tools/browser2/filters.py b/weboob/tools/browser2/filters.py
new file mode 100644
index 00000000..9cad7579
--- /dev/null
+++ b/weboob/tools/browser2/filters.py
@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2014 Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import absolute_import
+
+from decimal import Decimal
+import re
+
+
+class Filter(object):
+    """
+    Class used to filter on a HTML element given as call parameter to return
+    matching elements.
+
+    Filters can be chained, so the parameter supplied to the constructor can
+    be either an xpath selector string, or another filter called before.
+
+    >>> from lxml.html import etree
+    >>> f = CleanDecimal(CleanText('//p'))
+    >>> f(etree.fromstring('<html><body><p>blah: <span>229,90</span></p></body></html>'))
+    Decimal('229.90')
+    """
+
+    def __init__(self, selector=None):
+        self.selector = selector
+
+    def __call__(self, item):
+        # A string selector is an xpath evaluated on the item, a callable
+        # (typically another filter) is called with the item, and anything
+        # else is used as the value itself.
+        if isinstance(self.selector, basestring):
+            value = item.xpath(self.selector)
+        elif callable(self.selector):
+            value = self.selector(item)
+        else:
+            value = self.selector
+
+        return self.filter(value)
+
+    def filter(self, value):
+        """
+        This method has to be overridden by children classes.
+        """
+        return value
+
+class Env(Filter):
+    """
+    Filter to get environment value of the item.
+
+    It is used for example to get page parameters, or when there is a parse()
+    method on ItemElement.
+    """
+    def __init__(self, name):
+        self.name = name
+
+    def __call__(self, item):
+        return item.env[self.name]
+
+class TableCell(Filter):
+    """
+    Used with TableElement, it gets the cell value from its name.
+
+    Several names can be given: the first one known by the table is used, and
+    KeyError is raised if none of them is.
+
+    For example:
+
+        class table(TableElement):
+            head_xpath = '//table/thead/th'
+            item_xpath = '//table/tbody/tr'
+            columns = {'date':   u'Date',
+                       'label':  [u'Name', 'Label'],
+                      }
+
+            class item(ItemElement):
+                klass = Object
+                obj_date = Date(TableCell('date'))
+                obj_label = CleanText(TableCell('label'))
+    """
+
+    def __init__(self, *names):
+        self.names = names
+
+    def __call__(self, item):
+        for name in self.names:
+            idx = item.parent.get_colnum(name)
+            if idx is not None:
+                # Column numbers start at 0 but xpath positions start at 1.
+                return item.xpath('./td[%s]' % (idx + 1))
+        raise KeyError('Unable to find column %s' % ' or '.join(self.names))
+
+class CleanText(Filter):
+    """
+    Get a cleaned text from an element.
+
+    It replaces all tabs and multiple spaces with one space and strips the
+    result string.
+    """
+    def filter(self, txt):
+        if isinstance(txt, (tuple, list)):
+            txt = ' '.join(map(self.clean, txt))
+
+        return self.clean(txt)
+
+    @classmethod
+    def clean(cls, txt):
+        if not isinstance(txt, basestring):
+            txt = [t.strip() for t in txt.itertext()]
+            txt = u' '.join(txt)                 # 'foo   bar'
+        txt = re.sub(u'[\s\xa0\t]+', u' ', txt)   # 'foo bar'
+        return txt.strip()
+
+class CleanDecimal(CleanText):
+    """
+    Get a cleaned Decimal value from an element.
+
+    Dots are dropped as thousands separators and the comma is taken as the
+    decimal mark, so u'1.229,90' gives Decimal('1229.90').
+    """
+    def filter(self, text):
+        text = super(CleanDecimal, self).filter(text)
+        text = text.replace('.','').replace(',','.')
+        return Decimal(re.sub(u'[^\d\-\.]', '', text))
+
+class Link(Filter):
+    """
+    Get the link uri of an element.
+
+    If the tag is not found, an exception IndexError is raised. If it has no
+    href attribute, an empty string is returned.
+    """
+    def filter(self, el):
+        return el[0].attrib.get('href', '')
diff --git a/weboob/tools/browser2/page.py b/weboob/tools/browser2/page.py
new file mode 100644
index 00000000..fe27ae41
--- /dev/null
+++ b/weboob/tools/browser2/page.py
@@ -0,0 +1,649 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2014 Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import absolute_import
+
+import requests
+import re
+from copy import deepcopy
+from functools import wraps
+from cStringIO import StringIO
+
+from weboob.tools.ordereddict import OrderedDict
+from weboob.tools.regex_helper import normalize
+from weboob.tools.parsers.lxmlparser import LxmlHtmlParser
+from weboob.tools.log import getLogger
+
+from .browser import DomainBrowser
+from .filters import Filter, CleanText
+
+
+class URL(object):
+    """
+    A description of an URL on the PagesBrowser website.
+
+    It takes one or several regexps to match urls, and an optional BasePage
+    class which is instantiated by PagesBrowser.open if the page matches a regex.
+
+    Instances remember their creation order, so that PagesBrowser can try
+    them in the order they were declared.
+    """
+    _creation_counter = 0
+
+    def __init__(self, *args):
+        self.urls = []
+        self.klass = None
+        self.browser = None
+        # Strings are url regexps, a class is the page class to instantiate.
+        for arg in args:
+            if isinstance(arg, basestring):
+                self.urls.append(arg)
+            if isinstance(arg, type):
+                self.klass = arg
+
+        self._creation_counter = URL._creation_counter
+        URL._creation_counter += 1
+
+    def is_here(self):
+        """
+        Returns True if the current page of browser matches this URL.
+        """
+        return bool(self.browser.page and isinstance(self.browser.page, self.klass))
+
+    def stay_or_go(self, **kwargs):
+        """
+        Request to go on this url only if we aren't already here.
+
+        Arguments are optional parameters for url.
+
+        >>> url = URL('http://example.org/(?P<pagename>.+)\.html')
+        >>> url.stay_or_go(pagename='index')
+        """
+        if self.is_here():
+            return self.browser.page
+
+        return self.go(**kwargs)
+
+    def go(self, **kwargs):
+        """
+        Request to go on this url.
+
+        Arguments are optional parameters for url.
+
+        >>> url = URL('http://example.org/(?P<pagename>.+)\.html')
+        >>> url.go(pagename='index')
+        """
+        patterns = []
+        for url in self.urls:
+            patterns += normalize(url)
+
+        # Only the first pattern is ever used, as the loop returns on its
+        # first iteration.
+        for pattern, args in patterns:
+            url = pattern % kwargs
+            return self.browser.location(url)
+
+    def handle(self, response):
+        """
+        Handle a HTTP response to get an instance of the klass if it matches.
+
+        Regexps are matched with re.match(), so they are only anchored at the
+        beginning of the url. Returns None if none matches or without klass.
+        """
+        if self.klass is None:
+            return None
+
+        for regex in self.urls:
+            # A regexp starting with '/' is relative to the browser BASEURL.
+            if regex.startswith('/'):
+                regex = self.browser.BASEURL + regex
+            m = re.match(regex, response.url)
+            if m:
+                return self.klass(self.browser, response, m.groupdict())
+
+
+class _PagesBrowserMeta(type):
+    """
+    Private meta-class used to keep order of URLs instances of PagesBrowser.
+    """
+    def __new__(cls, name, bases, attrs):
+        urls = [(url_name, attrs.pop(url_name)) for url_name, obj in attrs.items() if isinstance(obj, URL)]
+        urls.sort(key=lambda x: x[1]._creation_counter)
+
+        new_class = super(_PagesBrowserMeta, cls).__new__(cls, name, bases, attrs)
+        # An ordered mapping is required: PagesBrowser.open() tries URLs in
+        # turn and the first one which matches wins.
+        if new_class._urls is None:
+            new_class._urls = OrderedDict()
+        else:
+            new_class._urls = deepcopy(new_class._urls)
+        new_class._urls.update(urls)
+        return new_class
+
+class PagesBrowser(DomainBrowser):
+    """
+    A browser which works with pages and keeps state of navigation.
+
+    To use it, you have to derive it and to create URL objects as class
+    attributes. When open() or location() are called, if the url matches
+    one of URL objects, it returns a BasePage object. In case of location(), it
+    stores it in self.page.
+
+    Example:
+
+        class MyBrowser(PagesBrowser):
+            BASEURL = 'http://example.org'
+
+            home = URL('/(index\.html)?', HomePage)
+            list = URL('/list\.html', ListPage)
+
+    You can then use URL instances to go on pages.
+    """
+
+
+    _urls = None
+    __metaclass__ = _PagesBrowserMeta
+
+    def __getattr__(self, name):
+        # The metaclass removes URL objects from the class attributes, so
+        # give access to them by name here.
+        if self._urls is not None and name in self._urls:
+            return self._urls[name]
+        else:
+            raise AttributeError("'%s' object has no attribute '%s'" % (
+                self.__class__.__name__, name))
+
+    def __init__(self, *args, **kwargs):
+        super(PagesBrowser, self).__init__(*args, **kwargs)
+
+        self.page = None
+        # Work on copies of the URL objects, bound to this browser instance.
+        self._urls = deepcopy(self._urls)
+        for url in self._urls.itervalues():
+            url.browser = self
+
+    def open(self, *args, **kwargs):
+        """
+        Call the parent open() and give the response to the first URL able to
+        handle it. Returns the page it built, or the response if none matches.
+        """
+        response = super(PagesBrowser, self).open(*args, **kwargs)
+
+        # Try to handle the response page with an URL instance.
+        for url in self._urls.itervalues():
+            page = url.handle(response)
+            if page is not None:
+                self.logger.debug('Handle %s with %s' % (response.url, page.__class__.__name__))
+                return page
+
+        self.logger.debug('Unable to handle %s' % response.url)
+        return response
+
+    def location(self, *args, **kwargs):
+        """
+        Same as open(), but the result also becomes the current state of the
+        browser: self.page (None if no URL matches), self.response and self.url.
+        """
+        if self.page is not None:
+            # Call leave hook.
+            self.page.on_leave()
+
+        page = self.open(*args, **kwargs)
+
+        # If open() returns a BasePage instance, store it as the current page.
+        if isinstance(page, BasePage):
+            response = page.response
+            self.page = page
+        else:
+            response = page
+            self.page = None
+
+        self.response = response
+        self.url = response.url
+
+        if self.page is not None:
+            # Call load hook.
+            self.page.on_load()
+
+        return page
+
+    def pagination(self, func, *args, **kwargs):
+        """
+        This helper function can be used to handle pagination pages easily.
+
+        When the called function raises an exception `NextPage`, it goes on the
+        wanted page and calls the function again.
+
+        NextPage constructor can take an url or a Request object.
+
+        >>> class Page(HTMLPage):
+        ...     def iter_values(self):
+        ...         for el in self.doc.xpath('//li'):
+        ...             yield el.text
+        ...         for next in self.doc.xpath('//a'):
+        ...             raise NextPage(next.attrib['href'])
+        ...
+        >>> class Browser(PagesBrowser):
+        ...     BASEURL = 'http://people.symlink.me'
+        ...     list = URL('/~rom1/projects/weboob/list-(?P<pagenum>\d+).html', Page)
+        ...
+        >>> b = Browser()
+        >>> b.list.go(pagenum=1)
+        >>> list(b.pagination(lambda: b.page.iter_values()))
+        ['One', 'Two', 'Three', 'Four']
+        """
+        while True:
+            try:
+                for r in func(*args, **kwargs):
+                    yield r
+            except NextPage as e:
+                self.location(e.request)
+            else:
+                return
+
+class NextPage(Exception):
+    """
+    Exception used for example in a BasePage to tell PagesBrowser.pagination to
+    go on the next page.
+
+    See PagesBrowser.pagination.
+    """
+    def __init__(self, request):
+        self.request = request
+
+
+def need_login(func):
+    """
+    Decorator used to require to be logged to access to this function.
+    """
+    @wraps(func)
+    def inner(browser, *args, **kwargs):
+        if browser.page is None or not browser.page.logged:
+            browser.do_login()
+        return func(browser, *args, **kwargs)
+
+    return inner
+
+
+class LoginBrowser(PagesBrowser):
+    """
+    A browser which supports login.
+    """
+    def __init__(self, username, password, *args, **kwargs):
+        super(LoginBrowser, self).__init__(*args, **kwargs)
+        self.username = username
+        self.password = password
+
+    def do_login(self):
+        """
+        Abstract method to implement to login on website.
+
+        It is called when a login is needed.
+        """
+        raise NotImplementedError()
+
+
+class BasePage(object):
+    """
+    Base page.
+
+    It wraps the response of a request which matched an URL of the browser;
+    params holds the named groups captured by the url regexp. The logged
+    attribute is checked by need_login() to know if a login is required.
+    """
+    logged = False
+
+    def __init__(self, browser, response, params):
+        self.browser = browser
+        self.logger = getLogger(self.__class__.__name__.lower(), browser.logger)
+        self.response = response
+        self.url = self.response.url
+        self.params = params
+
+    def on_load(self):
+        """
+        Hook called by PagesBrowser.location() when this page becomes current.
+        """
+        pass
+
+    def on_leave(self):
+        """
+        Hook called by PagesBrowser.location() before leaving this page.
+        """
+        pass
+
+class FormNotFound(Exception):
+    """
+    Raised by HTMLPage.get_form() when no form matches.
+    """
+
+class Form(OrderedDict):
+    """
+    Represents a form of an HTML page.
+
+    It is used as a dict with pre-filled values from HTML. You can set new
+    values as strings by setting an item value.
+    """
+
+    def __init__(self, page, el):
+        super(Form, self).__init__()
+        self.page = page
+        self.el = el
+        self.method = el.attrib.get('method', 'GET')
+        self.url = el.attrib.get('action', page.url)
+
+        # Inputs without a name attribute are ignored.
+        for inp in el.xpath('.//input'):
+            try:
+                name = inp.attrib['name']
+            except KeyError:
+                continue
+            value = inp.attrib.get('value', u'')
+            self[name] = value
+
+    @property
+    def request(self):
+        """
+        Get the Request object from the form.
+        """
+        # Fields of a GET form belong to the query string, not to the body.
+        if self.method.lower() == 'get':
+            return requests.Request(self.method, self.url, params=self)
+        return requests.Request(self.method, self.url, data=self)
+
+    def submit(self):
+        """
+        Submit the form and tell browser to be located to the new page.
+        """
+        return self.page.browser.location(self.request)
+
+
+class HTMLPage(BasePage):
+    """
+    HTML page.
+
+    The content of the response is parsed by LxmlHtmlParser into self.doc.
+    """
+    FORM_CLASS = Form
+
+    def __init__(self, browser, response, *args, **kwargs):
+        super(HTMLPage, self).__init__(browser, response, *args, **kwargs)
+        parser = LxmlHtmlParser()
+        self.doc = parser.parse(StringIO(response.content), response.encoding)
+
+    def get_form(self, xpath=None, name=None, nr=None):
+        """
+        Get a Form object from a xpath selector.
+
+        If name is given, only forms with this name attribute are considered.
+        If nr is given, it is the index (starting at 0) of the wanted form
+        among the considered ones. FormNotFound is raised if no form matches.
+        """
+        if xpath is None:
+            xpath = '//form'
+
+        i = 0
+        for el in self.doc.xpath(xpath):
+            if name is not None and el.attrib.get('name', '') != name:
+                continue
+            if nr is not None and i != nr:
+                i += 1
+                continue
+
+            return self.FORM_CLASS(self, el)
+
+        raise FormNotFound()
+
+
+def method(klass):
+    """
+    Class-decorator to call it as a method.
+
+    The decorated element class is instantiated with the object the method is
+    called on (a page) and the instance is called with the given arguments.
+    """
+    def inner(self, *args, **kwargs):
+        return klass(self)(*args, **kwargs)
+    return inner
+
+
+class AbstractElement(object):
+    """
+    Base class of elements, which extract data from a part of a page.
+
+    An element works on self.el: the given node, else the one of its parent,
+    else the whole document of the page. Its environment (self.env) is a copy
+    of the one of its parent, or of the page parameters if it has no parent.
+    """
+    def __init__(self, page, parent=None, el=None):
+        self.page = page
+        self.parent = parent
+        if el is not None:
+            self.el = el
+        elif parent is not None:
+            self.el = parent.el
+        else:
+            self.el = page.doc
+
+        if parent is not None:
+            self.env = deepcopy(parent.env)
+        else:
+            self.env = deepcopy(page.params)
+
+    def use_selector(self, func):
+        """
+        Get the value of a Filter (called with this element), of any other
+        callable (called without argument), or the object itself otherwise.
+        """
+        if isinstance(func, Filter):
+            value = func(self)
+        elif callable(func):
+            value = func()
+        else:
+            value = func
+
+        return value
+
+    def xpath(self, *args, **kwargs):
+        return self.el.xpath(*args, **kwargs)
+
+
+class ListElement(AbstractElement):
+    """
+    Element which yields the objects built by its nested element classes.
+
+    If item_xpath is set, nested elements are run on each matching node,
+    otherwise on the element itself. With flush_at_end, nothing is yielded
+    until all nodes are handled, then the stored objects (those with an id)
+    are yielded. If a next_page attribute exists and gives a value, NextPage
+    is raised with it once everything has been yielded.
+    """
+    item_xpath = None
+    flush_at_end = False
+
+    def __init__(self, *args, **kwargs):
+        super(ListElement, self).__init__(*args, **kwargs)
+
+        # Keep insertion order, so that flush_at_end yields in page order.
+        self.objects = OrderedDict()
+
+    def __call__(self):
+        return self.__iter__()
+
+    def parse(self, el):
+        pass
+
+    def __iter__(self):
+        self.parse(self.el)
+
+        if self.item_xpath is not None:
+            for el in self.el.xpath(self.item_xpath):
+                for obj in self.handle_element(el):
+                    if not self.flush_at_end:
+                        yield obj
+        else:
+            for obj in self.handle_element(self.el):
+                if not self.flush_at_end:
+                    yield obj
+
+        if self.flush_at_end:
+            for obj in self.objects.itervalues():
+                yield obj
+
+        self.check_next_page()
+
+    def check_next_page(self):
+        if not hasattr(self, 'next_page'):
+            return
+
+        next_page = getattr(self, 'next_page')
+        # IndexError is what a filter like Link raises when nothing is found:
+        # there is no next page then.
+        try:
+            value = self.use_selector(next_page)
+        except IndexError:
+            return
+
+        if value is None:
+            return
+
+        raise NextPage(value)
+
+
+    def store(self, obj):
+        """
+        Remember the object by its id (if it has one), refusing duplicates.
+        """
+        if obj.id:
+            if obj.id in self.objects:
+                raise ValueError('There are two objects with the same ID! %s' % obj.id)
+            self.objects[obj.id] = obj
+        return obj
+
+    def handle_element(self, el):
+        # Run every nested element class on the node; type(self) is excluded
+        # because dir() also lists __class__.
+        for attrname in dir(self):
+            attr = getattr(self, attrname)
+            if isinstance(attr, type) and issubclass(attr, AbstractElement) and attr != type(self):
+                for obj in attr(self.page, self, el):
+                    yield self.store(obj)
+
+class SkipItem(Exception):
+    """
+    Can be raised while an ItemElement is built to not yield its object.
+    """
+
+class ItemElement(AbstractElement):
+    """
+    Element which builds one object of class klass.
+
+    Each obj_NAME attribute is a selector (a Filter, a callable or a value)
+    whose result is set as the NAME attribute of the object. If __filter__ is
+    defined, nodes for which it returns a false value are skipped.
+    """
+    klass = None
+    __filter__ = None
+
+    class Index(object):
+        pass
+
+    def __init__(self, *args, **kwargs):
+        super(ItemElement, self).__init__(*args, **kwargs)
+        self.obj = None
+
+    def parse(self, obj):
+        pass
+
+    def build_object(self):
+        return self.klass()
+
+    def __call__(self, obj=None):
+        """
+        Build and return the object, or None if the item is skipped.
+
+        If obj is given, it is filled instead of a newly created object.
+        """
+        if obj is not None:
+            self.obj = obj
+
+        for obj in self:
+            return obj
+
+    def __iter__(self):
+        if self.__filter__ is not None:
+            try:
+                skip = not self.__filter__(self.el)
+            except TypeError:
+                # __filter__ may be a plain function taking only the node:
+                # called as a bound method it raises TypeError, so retry with
+                # the underlying function.
+                skip = not self.__filter__.im_func(self.el)
+            if skip:
+                return
+
+        try:
+            if self.obj is None:
+                self.obj = self.build_object()
+            self.parse(self.el)
+            for attr in dir(self):
+                m = re.match('obj_(.*)', attr)
+                if m:
+                    self.handle_attr(m.group(1), getattr(self, attr))
+        except SkipItem:
+            return
+
+        yield self.obj
+
+    def handle_attr(self, key, func):
+        value = self.use_selector(func)
+        setattr(self.obj, key, value)
+
+
+class TableElement(ListElement):
+    """
+    ListElement for tables, which finds columns from the header titles.
+
+    head_xpath selects the header cells, and columns maps each column name to
+    its title or to a list of possible titles. Titles found in the page are
+    cleaned with cleaner before being compared.
+    """
+    head_xpath = None
+    columns = None
+    cleaner = CleanText
+
+    def __init__(self, *args, **kwargs):
+        super(TableElement, self).__init__(*args, **kwargs)
+
+        self._cols = {}
+
+        for colnum, el in enumerate(self.el.xpath(self.head_xpath)):
+            title = self.cleaner.clean(el)
+            for name, titles in self.columns.iteritems():
+                # A lone title has to match exactly, not as a substring.
+                if isinstance(titles, basestring):
+                    titles = [titles]
+                if title in titles:
+                    self._cols[name] = colnum
+
+    def get_colnum(self, name):
+        """
+        Get the index (starting at 0) of the column with this name, or None.
+        """
+        return self._cols.get(name, None)