diff --git a/modules/alloresto/pages.py b/modules/alloresto/pages.py index 3000a156..48367911 100644 --- a/modules/alloresto/pages.py +++ b/modules/alloresto/pages.py @@ -21,7 +21,8 @@ import datetime from decimal import Decimal -from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, ItemElement +from weboob.tools.browser2.page import HTMLPage, LoggedPage, method +from weboob.tools.browser2.elements import ItemElement from weboob.tools.browser2.filters import CleanDecimal, CleanText, Filter, TableCell from weboob.capabilities.bank import Account from weboob.tools.capabilities.bank.transactions import FrenchTransaction as Transaction diff --git a/modules/banqueaccord/pages.py b/modules/banqueaccord/pages.py index f0cdc54b..41f52ad6 100644 --- a/modules/banqueaccord/pages.py +++ b/modules/banqueaccord/pages.py @@ -24,7 +24,8 @@ import re from cStringIO import StringIO from weboob.capabilities.bank import Account -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, LoggedPage +from weboob.tools.browser2.page import HTMLPage, method, LoggedPage +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import ParseError, CleanText, Regexp, Attr, CleanDecimal, Env from weboob.tools.captcha.virtkeyboard import MappedVirtKeyboard, VirtKeyboardError from weboob.tools.capabilities.bank.transactions import FrenchTransaction diff --git a/modules/biplan/pages.py b/modules/biplan/pages.py index 90f92050..a9f5e508 100644 --- a/modules/biplan/pages.py +++ b/modules/biplan/pages.py @@ -23,7 +23,8 @@ from datetime import datetime, time import weboob.tools.date as date_util from .calendar import BiplanCalendarEventConcert, BiplanCalendarEventTheatre -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement +from weboob.tools.browser2.elements import ItemElement, SkipItem, ListElement +from weboob.tools.browser2.page import HTMLPage, method from weboob.tools.browser2.filters import Filter, Link, CleanText, Env, Regexp, CombineDate, CleanHTML diff --git a/modules/carrefourbanque/pages.py b/modules/carrefourbanque/pages.py index dc89c1c3..6301423f 100644 --- a/modules/carrefourbanque/pages.py +++ b/modules/carrefourbanque/pages.py @@ -20,7 +20,8 @@ import re -from weboob.tools.browser2.page import HTMLPage, ListElement, ItemElement, method, LoggedPage +from weboob.tools.browser2.page import HTMLPage, method, LoggedPage +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import Regexp, CleanText, CleanDecimal, Format, Link from weboob.capabilities.bank import Account diff --git a/modules/cci/pages.py b/modules/cci/pages.py index 88b2cbbd..5173d3c8 100644 --- a/modules/cci/pages.py +++ b/modules/cci/pages.py @@ -17,7 +17,8 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, TableElement +from weboob.tools.browser2.page import HTMLPage, method +from weboob.tools.browser2.elements import ItemElement, TableElement from weboob.tools.browser2.filters import Filter, Link, CleanText, Format, Env, DateTime, CleanHTML, TableCell, Join from weboob.capabilities.job import BaseJobAdvert diff --git a/modules/creditmutuel/pages.py b/modules/creditmutuel/pages.py index 22cfda26..d63666f2 100644 --- a/modules/creditmutuel/pages.py +++ b/modules/creditmutuel/pages.py @@ -27,7 +27,8 @@ from decimal import Decimal import re from dateutil.relativedelta import relativedelta -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, SkipItem, FormNotFound, LoggedPage +from weboob.tools.browser2.page import HTMLPage, method, FormNotFound, LoggedPage +from weboob.tools.browser2.elements import ListElement, ItemElement, SkipItem from weboob.tools.browser2.filters import Filter, Env, CleanText, CleanDecimal, Link, Field, TableCell from weboob.tools.exceptions import BrowserIncorrectPassword from weboob.capabilities import NotAvailable diff --git a/modules/dresdenwetter/pages.py b/modules/dresdenwetter/pages.py index 32010409..0d211fcd 100644 --- a/modules/dresdenwetter/pages.py +++ b/modules/dresdenwetter/pages.py @@ -17,7 +17,8 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement +from weboob.tools.browser2.page import HTMLPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanText, Regexp, Field, Filter from weboob.capabilities.gauge import GaugeMeasure, GaugeSensor from weboob.capabilities.base import NotAvailable diff --git a/modules/feedly/pages.py b/modules/feedly/pages.py index 803184d0..8de43bab 100644 --- a/modules/feedly/pages.py +++ b/modules/feedly/pages.py @@ -21,7 +21,8 @@ from datetime import datetime from weboob.capabilities.messages import Message from weboob.capabilities.collection import Collection -from weboob.tools.browser2.page import JsonPage, ListElement, method, ItemElement +from weboob.tools.browser2.page import JsonPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanText, Dict, Format, CleanHTML __all__ = ['TokenPage', 'ContentsPage', 'PreferencesPage'] diff --git a/modules/francetelevisions/pages.py b/modules/francetelevisions/pages.py index 1c81725b..c01d8ccd 100644 --- a/modules/francetelevisions/pages.py +++ b/modules/francetelevisions/pages.py @@ -22,7 +22,8 @@ from weboob.capabilities.video import BaseVideo from datetime import timedelta -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, JsonPage +from weboob.tools.browser2.page import HTMLPage, method, JsonPage +from weboob.tools.browser2.elements import ItemElement, ListElement from weboob.tools.browser2.filters import Filter, Link, CleanText, Regexp, Attr, Format, DateTime, Env, Dict, Duration, XPath diff --git a/modules/freemobile/pages/history.py b/modules/freemobile/pages/history.py index 19910e0e..1eab151e 100644 --- a/modules/freemobile/pages/history.py +++ b/modules/freemobile/pages/history.py @@ -24,7 +24,8 @@ import lxml.html as html from datetime import datetime from decimal import Decimal -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, LoggedPage +from weboob.tools.browser2.page import HTMLPage, method, LoggedPage +from weboob.tools.browser2.elements import ItemElement, ListElement from weboob.tools.browser2.filters import Date, CleanText, Attr, Filter,\ CleanDecimal, Regexp, Field, DateTime, Format, Env from weboob.capabilities.bill import Detail, Bill diff --git a/modules/freemobile/pages/homepage.py b/modules/freemobile/pages/homepage.py index fec6d14a..72b34f12 100644 --- a/modules/freemobile/pages/homepage.py +++ b/modules/freemobile/pages/homepage.py @@ -19,7 +19,8 @@ from .history import BadUTF8Page from weboob.capabilities.bill import Subscription -from weboob.tools.browser2.page import method, ListElement, ItemElement +from weboob.tools.browser2.page import method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanText, Attr, Field, Format, Filter __all__ = ['HomePage'] diff --git a/modules/groupamaes/pages.py b/modules/groupamaes/pages.py index 55b2eca3..f7e3c4fe 100644 --- a/modules/groupamaes/pages.py +++ b/modules/groupamaes/pages.py @@ -18,7 +18,8 @@ # along with weboob. If not, see . -from weboob.tools.browser2.page import HTMLPage, method, TableElement, ItemElement, LoggedPage +from weboob.tools.browser2.page import HTMLPage, method, LoggedPage +from weboob.tools.browser2.elements import TableElement, ItemElement from weboob.tools.browser2.filters import CleanText, CleanDecimal, TableCell, Date from weboob.capabilities.bank import Account, Transaction from weboob.tools.date import LinearDateGuesser diff --git a/modules/hsbc/pages.py b/modules/hsbc/pages.py index 09e437b9..42309e3e 100644 --- a/modules/hsbc/pages.py +++ b/modules/hsbc/pages.py @@ -25,7 +25,8 @@ from weboob.capabilities.bank import Account from weboob.tools.capabilities.bank.transactions import FrenchTransaction from weboob.tools.exceptions import BrowserIncorrectPassword -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, SkipItem, LoggedPage, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement, SkipItem +from weboob.tools.browser2.page import HTMLPage, method, LoggedPage, pagination from weboob.tools.browser2.filters import Filter, Env, CleanText, CleanDecimal, Link, Field, DateGuesser, TableCell diff --git a/modules/hybride/pages.py b/modules/hybride/pages.py index 1a71f54e..b6ea3f8a 100644 --- a/modules/hybride/pages.py +++ b/modules/hybride/pages.py @@ -23,7 +23,8 @@ from .calendar import HybrideCalendarEvent import weboob.tools.date as date_util import re -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement +from weboob.tools.browser2.page import HTMLPage, method +from weboob.tools.browser2.elements import ItemElement, SkipItem, ListElement from weboob.tools.browser2.filters import Filter, Link, CleanText, Env diff --git a/modules/indeed/pages.py b/modules/indeed/pages.py index 88d13720..a73ddd5d 100644 --- a/modules/indeed/pages.py +++ b/modules/indeed/pages.py @@ -19,7 +19,8 @@ from datetime import timedelta, datetime import re -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, pagination +from weboob.tools.browser2.page import HTMLPage, method, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import Filter, CleanText, Regexp, Format, Env, CleanHTML, Attr from weboob.capabilities.job import BaseJobAdvert diff --git a/modules/ing/pages/accounts_list.py b/modules/ing/pages/accounts_list.py index 4daf512c..58dcb483 100644 --- a/modules/ing/pages/accounts_list.py +++ b/modules/ing/pages/accounts_list.py @@ -24,7 +24,8 @@ import re from weboob.capabilities.bank import Account from weboob.capabilities.base import NotAvailable -from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, ListElement, ItemElement +from weboob.tools.browser2.page import HTMLPage, LoggedPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import Attr, CleanText, CleanDecimal, Filter, Field, MultiFilter, Date, Lower from weboob.tools.capabilities.bank.transactions import FrenchTransaction diff --git a/modules/ing/pages/bills.py b/modules/ing/pages/bills.py index 0491ee69..bafecaeb 100644 --- a/modules/ing/pages/bills.py +++ b/modules/ing/pages/bills.py @@ -20,7 +20,8 @@ from weboob.capabilities.bill import Bill, Subscription from weboob.tools.browser2 import HTMLPage, LoggedPage from weboob.tools.browser2.filters import Filter, Attr, CleanText, Format, Field, Env -from weboob.tools.browser2.page import ListElement, ItemElement, method, pagination +from weboob.tools.browser2.page import method, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement __all__ = ['BillsPage'] diff --git a/modules/ing/pages/titre.py b/modules/ing/pages/titre.py index a1463bec..aa5913b8 100644 --- a/modules/ing/pages/titre.py +++ b/modules/ing/pages/titre.py @@ -21,7 +21,8 @@ from decimal import Decimal from weboob.capabilities.bank import Investment -from weboob.tools.browser2.page import RawPage, HTMLPage, method, ListElement, ItemElement +from weboob.tools.browser2.page import RawPage, HTMLPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanDecimal, CleanText, Date from weboob.tools.capabilities.bank.transactions import FrenchTransaction diff --git a/modules/ing/pages/transfer.py b/modules/ing/pages/transfer.py index c09821dd..0ca8da6c 100644 --- a/modules/ing/pages/transfer.py +++ b/modules/ing/pages/transfer.py @@ -18,7 +18,8 @@ # along with weboob. If not, see . from weboob.capabilities.bank import Recipient, AccountNotFound, Transfer -from weboob.tools.browser2.page import HTMLPage, LoggedPage, ListElement, ItemElement, method +from weboob.tools.browser2.page import HTMLPage, LoggedPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanText, CleanDecimal, Attr, Format from .login import INGVirtKeyboard diff --git a/modules/oney/pages.py b/modules/oney/pages.py index 0d557f23..5e4d6577 100644 --- a/modules/oney/pages.py +++ b/modules/oney/pages.py @@ -25,7 +25,8 @@ import requests from weboob.tools.capabilities.bank.transactions import FrenchTransaction from weboob.tools.captcha.virtkeyboard import MappedVirtKeyboard, VirtKeyboardError -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, LoggedPage, pagination +from weboob.tools.browser2.page import HTMLPage, method, LoggedPage, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import Env, CleanDecimal, ParseError __all__ = ['LoginPage', 'IndexPage', 'OperationsPage'] diff --git a/modules/pastealacon/browser.py b/modules/pastealacon/browser.py index db0a9741..b672052b 100644 --- a/modules/pastealacon/browser.py +++ b/modules/pastealacon/browser.py @@ -21,7 +21,8 @@ import re from weboob.capabilities.paste import BasePaste, PasteNotFound from weboob.tools.browser2.filters import CleanText, DateTime, Env, RawText, Regexp -from weboob.tools.browser2.page import HTMLPage, ItemElement, method, PagesBrowser, URL +from weboob.tools.browser2.page import HTMLPage, method, PagesBrowser, URL +from weboob.tools.browser2.elements import ItemElement from weboob.tools.exceptions import BrowserHTTPNotFound diff --git a/modules/pastebin/browser.py b/modules/pastebin/browser.py index 835c4747..da97b8f4 100644 --- a/modules/pastebin/browser.py +++ b/modules/pastebin/browser.py @@ -23,7 +23,8 @@ import re from weboob.capabilities.paste import BasePaste, PasteNotFound from weboob.tools.browser2 import HTMLPage, LoginBrowser, need_login, URL from weboob.tools.browser2.filters import Attr, Base, CleanText, DateTime, Env, Filter, FilterError, RawText -from weboob.tools.browser2.page import ItemElement, method, RawPage +from weboob.tools.browser2.page import method, RawPage +from weboob.tools.browser2.elements import ItemElement from weboob.tools.exceptions import BrowserHTTPNotFound, BrowserIncorrectPassword, BrowserUnavailable diff --git a/modules/poivy/pages.py b/modules/poivy/pages.py index 7f29d0bf..ab85e294 100644 --- a/modules/poivy/pages.py +++ b/modules/poivy/pages.py @@ -18,7 +18,8 @@ # along with weboob. If not, see . from weboob.tools.exceptions import BrowserBanned -from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, ListElement, ItemElement, pagination +from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanText, CleanDecimal, Field, Attr, DateTime, Link, Format from weboob.capabilities.bill import Subscription, Detail diff --git a/modules/regionsjob/pages.py b/modules/regionsjob/pages.py index b0ee3199..78519d35 100644 --- a/modules/regionsjob/pages.py +++ b/modules/regionsjob/pages.py @@ -17,7 +17,8 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement +from weboob.tools.browser2.page import HTMLPage, method +from weboob.tools.browser2.elements import ItemElement, SkipItem, ListElement from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime from weboob.tools.date import LinearDateGuesser from weboob.capabilities.job import BaseJobAdvert diff --git a/modules/sachsen/pages.py b/modules/sachsen/pages.py index 2767a19b..826a387f 100644 --- a/modules/sachsen/pages.py +++ b/modules/sachsen/pages.py @@ -17,7 +17,8 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement +from weboob.tools.browser2.page import HTMLPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import Env, CleanText, Regexp, Field, DateTime, Map, Attr from weboob.capabilities.gauge import Gauge, GaugeMeasure, GaugeSensor from weboob.capabilities.base import NotAvailable, NotLoaded diff --git a/modules/senscritique/pages.py b/modules/senscritique/pages.py index 5f66d41a..a165e1c1 100644 --- a/modules/senscritique/pages.py +++ b/modules/senscritique/pages.py @@ -22,7 +22,8 @@ from .calendar import SensCritiquenCalendarEvent from datetime import date, datetime, time, timedelta -from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, JsonPage +from weboob.tools.browser2.page import HTMLPage, method, JsonPage +from weboob.tools.browser2.elements import ItemElement, ListElement from weboob.tools.browser2.filters import Filter, Link, CleanText, Regexp, Attr, Join, Format diff --git a/modules/twitter/pages.py b/modules/twitter/pages.py index 848bdd77..2164353b 100644 --- a/modules/twitter/pages.py +++ b/modules/twitter/pages.py @@ -23,7 +23,8 @@ from io import StringIO import lxml.html as html import urllib -from weboob.tools.browser2.page import HTMLPage, JsonPage, method, ListElement, ItemElement, FormNotFound, pagination +from weboob.tools.browser2.page import HTMLPage, JsonPage, method, FormNotFound, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import CleanText, Format, Link, Regexp, Env, DateTime, Attr, Filter from weboob.capabilities.messages import Thread, Message from weboob.capabilities.base import BaseObject diff --git a/modules/vlille/pages.py b/modules/vlille/pages.py index afab9dc0..2cd3c86f 100644 --- a/modules/vlille/pages.py +++ b/modules/vlille/pages.py @@ -18,7 +18,8 @@ # along with weboob. If not, see . -from weboob.tools.browser2.page import HTMLPage, XMLPage, method, ListElement, ItemElement, TableElement +from weboob.tools.browser2.page import HTMLPage, XMLPage, method +from weboob.tools.browser2.elements import ListElement, ItemElement, TableElement from weboob.tools.browser2.filters import CleanText, TableCell, Filter from weboob.capabilities.gauge import Gauge, GaugeMeasure, GaugeSensor diff --git a/modules/youjizz/pages/index.py b/modules/youjizz/pages/index.py index 280761b7..6f401ec6 100644 --- a/modules/youjizz/pages/index.py +++ b/modules/youjizz/pages/index.py @@ -19,7 +19,8 @@ from weboob.tools.browser2 import HTMLPage -from weboob.tools.browser2.page import ListElement, method, ItemElement, pagination +from weboob.tools.browser2.page import method, pagination +from weboob.tools.browser2.elements import ListElement, ItemElement from weboob.tools.browser2.filters import Link, CleanText, Duration, Regexp, CSS from weboob.capabilities.base import NotAvailable from weboob.capabilities.image import BaseImage diff --git a/modules/youjizz/pages/video.py b/modules/youjizz/pages/video.py index b64f0265..1b59b5ca 100644 --- a/modules/youjizz/pages/video.py +++ b/modules/youjizz/pages/video.py @@ -20,8 +20,8 @@ import re -from weboob.tools.browser2 import HTMLPage -from weboob.tools.browser2.page import method, ItemElement +from weboob.tools.browser2.page import method, HTMLPage +from weboob.tools.browser2.elements import ItemElement from weboob.tools.browser2.filters import CleanText, Env, Duration from weboob.capabilities.video import BaseVideo from weboob.tools.misc import to_unicode diff --git a/weboob/tools/browser2/elements.py b/weboob/tools/browser2/elements.py new file mode 100644 index 00000000..97d545c3 --- /dev/null +++ b/weboob/tools/browser2/elements.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2014 Romain Bignon +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . + +import re +import sys +from copy import deepcopy +from .filters import _Filter, CleanText, AttributeNotFound, XPathNotFound +from weboob.tools.log import getLogger +from weboob.tools.browser2.page import NextPage + +class DataError(Exception): + """ + Returned data from pages are incoherent. + """ + + +class AbstractElement(object): + def __init__(self, page, parent=None, el=None): + self.page = page + self.parent = parent + if el is not None: + self.el = el + elif parent is not None: + self.el = parent.el + else: + self.el = page.doc + + if parent is not None: + self.env = deepcopy(parent.env) + else: + self.env = deepcopy(page.params) + + def use_selector(self, func): + if isinstance(func, _Filter): + value = func(self) + elif callable(func): + value = func() + else: + value = deepcopy(func) + + return value + + def parse(self, obj): + pass + + def cssselect(self, *args, **kwargs): + return self.el.cssselect(*args, **kwargs) + + def xpath(self, *args, **kwargs): + return self.el.xpath(*args, **kwargs) + + +class SkipItem(Exception): + """ + Raise this exception in an :class:`ItemElement` subclass to skip an item. + """ + + +class _ItemElementMeta(type): + """ + Private meta-class used to keep order of obj_* attributes in :class:`ItemElement`. + """ + def __new__(mcs, name, bases, attrs): + _attrs = [] + for base in bases: + if hasattr(base, '_attrs'): + _attrs += base._attrs + + filters = [(re.sub('^obj_', '', attr_name), attrs[attr_name]) for attr_name, obj in attrs.items() if attr_name.startswith('obj_')] + # constants first, then filters, then methods + filters.sort(key=lambda x: x[1]._creation_counter if hasattr(x[1], '_creation_counter') else (sys.maxint if callable(x[1]) else 0)) + + new_class = super(_ItemElementMeta, mcs).__new__(mcs, name, bases, attrs) + new_class._attrs = _attrs + [f[0] for f in filters] + return new_class + + +class ItemElement(AbstractElement): + __metaclass__ = _ItemElementMeta + + _attrs = None + klass = None + condition = None + validate = None + + class Index(object): + pass + + def __init__(self, *args, **kwargs): + super(ItemElement, self).__init__(*args, **kwargs) + self.obj = None + + def build_object(self): + if self.klass is None: + return + return self.klass() + + def __call__(self, obj=None): + if obj is not None: + self.obj = obj + + for obj in self: + return obj + + def __iter__(self): + if self.condition is not None and not self.condition(): + return + + try: + if self.obj is None: + self.obj = self.build_object() + self.parse(self.el) + for attr in self._attrs: + self.handle_attr(attr, getattr(self, 'obj_%s' % attr)) + except SkipItem: + return + + if self.validate is not None and not self.validate(self.obj): + return + + yield self.obj + + def handle_attr(self, key, func): + value = self.use_selector(func) + setattr(self.obj, key, value) + + +class ListElement(AbstractElement): + item_xpath = None + flush_at_end = False + ignore_duplicate = False + + def __init__(self, *args, **kwargs): + super(ListElement, self).__init__(*args, **kwargs) + self.logger = getLogger(self.__class__.__name__.lower()) + self.objects = {} + + def __call__(self, *args, **kwargs): + for key, value in kwargs.iteritems(): + self.env[key] = value + + return self.__iter__() + + def __iter__(self): + self.parse(self.el) + + if self.item_xpath is not None: + for el in self.el.xpath(self.item_xpath): + for obj in self.handle_element(el): + if not self.flush_at_end: + yield obj + else: + for obj in self.handle_element(self.el): + if not self.flush_at_end: + yield obj + + if self.flush_at_end: + for obj in self.objects.itervalues(): + yield obj + + self.check_next_page() + + def check_next_page(self): + if not hasattr(self, 'next_page'): + return + + next_page = getattr(self, 'next_page') + try: + value = self.use_selector(next_page) + except (AttributeNotFound, XPathNotFound): + return + + if value is None: + return + + raise NextPage(value) + + + def store(self, obj): + if obj.id: + if obj.id in self.objects: + if self.ignore_duplicate: + self.logger.warning('There are two objects with the same ID! %s' % obj.id) + return + else: + raise DataError('There are two objects with the same ID! %s' % obj.id) + self.objects[obj.id] = obj + return obj + + def handle_element(self, el): + for attrname in dir(self): + attr = getattr(self, attrname) + if isinstance(attr, type) and issubclass(attr, AbstractElement) and attr != type(self): + for obj in attr(self.page, self, el): + obj = self.store(obj) + if obj: + yield obj + + +class TableElement(ListElement): + head_xpath = None + cleaner = CleanText + + def __init__(self, *args, **kwargs): + super(TableElement, self).__init__(*args, **kwargs) + + self._cols = {} + + columns = {} + for attrname in dir(self): + m = re.match('col_(.*)', attrname) + if m: + cols = getattr(self, attrname) + if not isinstance(cols, (list,tuple)): + cols = [cols] + columns[m.group(1)] = [s.lower() for s in cols] + + for colnum, el in enumerate(self.el.xpath(self.head_xpath)): + title = self.cleaner.clean(el).lower() + for name, titles in columns.iteritems(): + if title in titles: + self._cols[name] = colnum + + def get_colnum(self, name): + return self._cols.get(name, None) diff --git a/weboob/tools/browser2/page.py b/weboob/tools/browser2/page.py index 703e2069..eba4c2a5 100644 --- a/weboob/tools/browser2/page.py +++ b/weboob/tools/browser2/page.py @@ -24,15 +24,11 @@ try: except ImportError: from urllib import unquote import re -import sys from copy import deepcopy from io import BytesIO import requests -import lxml.html as html -import lxml.etree as etree -from weboob.tools.json import json from weboob.tools.ordereddict import OrderedDict from weboob.tools.regex_helper import normalize from weboob.tools.compat import basestring @@ -40,7 +36,6 @@ from weboob.tools.compat import basestring from weboob.tools.log import getLogger from .browser import DomainBrowser -from .filters import _Filter, CleanText, AttributeNotFound, XPathNotFound class UrlNotResolvable(Exception): @@ -49,12 +44,6 @@ class UrlNotResolvable(Exception): """ -class DataError(Exception): - """ - Returned data from pages are incoherent. - """ - - class URL(object): """ A description of an URL on the PagesBrowser website. @@ -538,6 +527,7 @@ class Form(OrderedDict): class JsonPage(BasePage): def __init__(self, browser, response, *args, **kwargs): super(JsonPage, self).__init__(browser, response, *args, **kwargs) + from weboob.tools.json import json self.doc = json.loads(response.text) @@ -550,6 +540,7 @@ class XMLPage(BasePage): def __init__(self, browser, response, *args, **kwargs): super(XMLPage, self).__init__(browser, response, *args, **kwargs) + import lxml.etree as etree parser = etree.XMLParser(encoding=self.ENCODING or response.encoding) self.doc = etree.parse(BytesIO(response.content), parser) @@ -575,6 +566,7 @@ class HTMLPage(BasePage): def __init__(self, browser, response, *args, **kwargs): super(HTMLPage, self).__init__(browser, response, *args, **kwargs) self.encoding = self.ENCODING or response.encoding + import lxml.html as html parser = html.HTMLParser(encoding=self.encoding) self.doc = html.parse(BytesIO(response.content), parser) @@ -613,228 +605,6 @@ def method(klass): return inner -class AbstractElement(object): - def __init__(self, page, parent=None, el=None): - self.page = page - self.parent = parent - if el is not None: - self.el = el - elif parent is not None: - self.el = parent.el - else: - self.el = page.doc - - if parent is not None: - self.env = deepcopy(parent.env) - else: - self.env = deepcopy(page.params) - - def use_selector(self, func): - if isinstance(func, _Filter): - value = func(self) - elif callable(func): - value = func() - else: - value = deepcopy(func) - - return value - - def parse(self, obj): - pass - - def cssselect(self, *args, **kwargs): - return self.el.cssselect(*args, **kwargs) - - def xpath(self, *args, **kwargs): - return self.el.xpath(*args, **kwargs) - - -class ListElement(AbstractElement): - item_xpath = None - flush_at_end = False - ignore_duplicate = False - - def __init__(self, *args, **kwargs): - super(ListElement, self).__init__(*args, **kwargs) - self.logger = getLogger(self.__class__.__name__.lower()) - self.objects = OrderedDict() - - def __call__(self, *args, **kwargs): - for key, value in kwargs.iteritems(): - self.env[key] = value - - return self.__iter__() - - def find_elements(self): - """ - Get the nodes that will have to be processed. - This method can be overridden if xpath filters are not - sufficient. - """ - if self.item_xpath is not None: - for el in self.el.xpath(self.item_xpath): - yield el - else: - yield self.el - - def __iter__(self): - self.parse(self.el) - - for el in self.find_elements(): - for obj in self.handle_element(el): - if not self.flush_at_end: - yield obj - - if self.flush_at_end: - for obj in self.flush(): - yield obj - - self.check_next_page() - - def flush(self): - for obj in self.objects.itervalues(): - yield obj - - def check_next_page(self): - if not hasattr(self, 'next_page'): - return - - next_page = getattr(self, 'next_page') - try: - value = self.use_selector(next_page) - except (AttributeNotFound, XPathNotFound): - return - - if value is None: - return - - raise NextPage(value) - - - def store(self, obj): - if obj.id: - if obj.id in self.objects: - if self.ignore_duplicate: - self.logger.warning('There are two objects with the same ID! %s' % obj.id) - return - else: - raise DataError('There are two objects with the same ID! %s' % obj.id) - self.objects[obj.id] = obj - return obj - - def handle_element(self, el): - for attrname in dir(self): - attr = getattr(self, attrname) - if isinstance(attr, type) and issubclass(attr, AbstractElement) and attr != type(self): - for obj in attr(self.page, self, el): - obj = self.store(obj) - if obj: - yield obj - - -class SkipItem(Exception): - """ - Raise this exception in an :class:`ItemElement` subclass to skip an item. - """ - - -class _ItemElementMeta(type): - """ - Private meta-class used to keep order of obj_* attributes in :class:`ItemElement`. - """ - def __new__(mcs, name, bases, attrs): - _attrs = [] - for base in bases: - if hasattr(base, '_attrs'): - _attrs += base._attrs - - filters = [(re.sub('^obj_', '', attr_name), attrs[attr_name]) for attr_name, obj in attrs.items() if attr_name.startswith('obj_')] - # constants first, then filters, then methods - filters.sort(key=lambda x: x[1]._creation_counter if hasattr(x[1], '_creation_counter') else (sys.maxint if callable(x[1]) else 0)) - - new_class = super(_ItemElementMeta, mcs).__new__(mcs, name, bases, attrs) - new_class._attrs = _attrs + [f[0] for f in filters] - return new_class - - -class ItemElement(AbstractElement): - __metaclass__ = _ItemElementMeta - - _attrs = None - klass = None - condition = None - validate = None - - class Index(object): - pass - - def __init__(self, *args, **kwargs): - super(ItemElement, self).__init__(*args, **kwargs) - self.obj = None - - def build_object(self): - if self.klass is None: - return - return self.klass() - - def __call__(self, obj=None): - if obj is not None: - self.obj = obj - - for obj in self: - return obj - - def __iter__(self): - if self.condition is not None and not self.condition(): - return - - try: - if self.obj is None: - self.obj = self.build_object() - self.parse(self.el) - for attr in self._attrs: - self.handle_attr(attr, getattr(self, 'obj_%s' % attr)) - except SkipItem: - return - - if self.validate is not None and not self.validate(self.obj): - return - - yield self.obj - - def handle_attr(self, key, func): - value = self.use_selector(func) - setattr(self.obj, key, value) - - -class TableElement(ListElement): - head_xpath = None - cleaner = CleanText - - def __init__(self, *args, **kwargs): - super(TableElement, self).__init__(*args, **kwargs) - - self._cols = {} - - columns = {} - for attrname in dir(self): - m = re.match('col_(.*)', attrname) - if m: - cols = getattr(self, attrname) - if not isinstance(cols, (list,tuple)): - cols = [cols] - columns[m.group(1)] = [s.lower() for s in cols] - - for colnum, el in enumerate(self.el.xpath(self.head_xpath)): - title = self.cleaner.clean(el).lower() - for name, titles in columns.iteritems(): - if title in titles: - self._cols[name] = colnum - - def get_colnum(self, name): - return self._cols.get(name, None) - - class LoggedPage(object): """ A page that only logged users can reach. If we did not get a redirection diff --git a/weboob/tools/capabilities/bank/transactions.py b/weboob/tools/capabilities/bank/transactions.py index a7cf9b48..6ed55a34 100644 --- a/weboob/tools/capabilities/bank/transactions.py +++ b/weboob/tools/capabilities/bank/transactions.py @@ -28,7 +28,7 @@ from weboob.tools.misc import to_unicode from weboob.tools.log import getLogger from weboob.tools.exceptions import ParseError -from weboob.tools.browser2.page import TableElement, ItemElement +from weboob.tools.browser2.elements import TableElement, ItemElement from weboob.tools.browser2.filters import Filter, CleanText, CleanDecimal, TableCell