Move ItemElement, ListElement and TableElement out of page.py

One of the goals is to avoid importing all the modules needed by filters
just by loading the page file.

For the same reason, move the parser imports inside the page class definitions.
This commit is contained in:
Florent 2014-07-08 19:59:37 +02:00
commit 76cb004eb4
33 changed files with 305 additions and 265 deletions

View file

@ -21,7 +21,8 @@
import datetime
from decimal import Decimal
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, ItemElement
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method
from weboob.tools.browser2.elements import ItemElement
from weboob.tools.browser2.filters import CleanDecimal, CleanText, Filter, TableCell
from weboob.capabilities.bank import Account
from weboob.tools.capabilities.bank.transactions import FrenchTransaction as Transaction

View file

@ -24,7 +24,8 @@ import re
from cStringIO import StringIO
from weboob.capabilities.bank import Account
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, LoggedPage
from weboob.tools.browser2.page import HTMLPage, method, LoggedPage
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import ParseError, CleanText, Regexp, Attr, CleanDecimal, Env
from weboob.tools.captcha.virtkeyboard import MappedVirtKeyboard, VirtKeyboardError
from weboob.tools.capabilities.bank.transactions import FrenchTransaction

View file

@ -23,7 +23,8 @@ from datetime import datetime, time
import weboob.tools.date as date_util
from .calendar import BiplanCalendarEventConcert, BiplanCalendarEventTheatre
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
from weboob.tools.browser2.elements import ItemElement, SkipItem, ListElement
from weboob.tools.browser2.page import HTMLPage, method
from weboob.tools.browser2.filters import Filter, Link, CleanText, Env, Regexp, CombineDate, CleanHTML

View file

@ -20,7 +20,8 @@
import re
from weboob.tools.browser2.page import HTMLPage, ListElement, ItemElement, method, LoggedPage
from weboob.tools.browser2.page import HTMLPage, method, LoggedPage
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import Regexp, CleanText, CleanDecimal, Format, Link
from weboob.capabilities.bank import Account

View file

@ -17,7 +17,8 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, TableElement
from weboob.tools.browser2.page import HTMLPage, method
from weboob.tools.browser2.elements import ItemElement, TableElement
from weboob.tools.browser2.filters import Filter, Link, CleanText, Format, Env, DateTime, CleanHTML, TableCell, Join
from weboob.capabilities.job import BaseJobAdvert

View file

@ -27,7 +27,8 @@ from decimal import Decimal
import re
from dateutil.relativedelta import relativedelta
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, SkipItem, FormNotFound, LoggedPage
from weboob.tools.browser2.page import HTMLPage, method, FormNotFound, LoggedPage
from weboob.tools.browser2.elements import ListElement, ItemElement, SkipItem
from weboob.tools.browser2.filters import Filter, Env, CleanText, CleanDecimal, Link, Field, TableCell
from weboob.tools.exceptions import BrowserIncorrectPassword
from weboob.capabilities import NotAvailable

View file

@ -17,7 +17,8 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement
from weboob.tools.browser2.page import HTMLPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanText, Regexp, Field, Filter
from weboob.capabilities.gauge import GaugeMeasure, GaugeSensor
from weboob.capabilities.base import NotAvailable

View file

@ -21,7 +21,8 @@ from datetime import datetime
from weboob.capabilities.messages import Message
from weboob.capabilities.collection import Collection
from weboob.tools.browser2.page import JsonPage, ListElement, method, ItemElement
from weboob.tools.browser2.page import JsonPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanText, Dict, Format, CleanHTML
__all__ = ['TokenPage', 'ContentsPage', 'PreferencesPage']

View file

@ -22,7 +22,8 @@ from weboob.capabilities.video import BaseVideo
from datetime import timedelta
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, JsonPage
from weboob.tools.browser2.page import HTMLPage, method, JsonPage
from weboob.tools.browser2.elements import ItemElement, ListElement
from weboob.tools.browser2.filters import Filter, Link, CleanText, Regexp, Attr, Format, DateTime, Env, Dict, Duration, XPath

View file

@ -24,7 +24,8 @@ import lxml.html as html
from datetime import datetime
from decimal import Decimal
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, LoggedPage
from weboob.tools.browser2.page import HTMLPage, method, LoggedPage
from weboob.tools.browser2.elements import ItemElement, ListElement
from weboob.tools.browser2.filters import Date, CleanText, Attr, Filter,\
CleanDecimal, Regexp, Field, DateTime, Format, Env
from weboob.capabilities.bill import Detail, Bill

View file

@ -19,7 +19,8 @@
from .history import BadUTF8Page
from weboob.capabilities.bill import Subscription
from weboob.tools.browser2.page import method, ListElement, ItemElement
from weboob.tools.browser2.page import method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanText, Attr, Field, Format, Filter
__all__ = ['HomePage']

View file

@ -18,7 +18,8 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2.page import HTMLPage, method, TableElement, ItemElement, LoggedPage
from weboob.tools.browser2.page import HTMLPage, method, LoggedPage
from weboob.tools.browser2.elements import TableElement, ItemElement
from weboob.tools.browser2.filters import CleanText, CleanDecimal, TableCell, Date
from weboob.capabilities.bank import Account, Transaction
from weboob.tools.date import LinearDateGuesser

View file

@ -25,7 +25,8 @@ from weboob.capabilities.bank import Account
from weboob.tools.capabilities.bank.transactions import FrenchTransaction
from weboob.tools.exceptions import BrowserIncorrectPassword
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, SkipItem, LoggedPage, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement, SkipItem
from weboob.tools.browser2.page import HTMLPage, method, LoggedPage, pagination
from weboob.tools.browser2.filters import Filter, Env, CleanText, CleanDecimal, Link, Field, DateGuesser, TableCell

View file

@ -23,7 +23,8 @@ from .calendar import HybrideCalendarEvent
import weboob.tools.date as date_util
import re
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
from weboob.tools.browser2.page import HTMLPage, method
from weboob.tools.browser2.elements import ItemElement, SkipItem, ListElement
from weboob.tools.browser2.filters import Filter, Link, CleanText, Env

View file

@ -19,7 +19,8 @@
from datetime import timedelta, datetime
import re
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, pagination
from weboob.tools.browser2.page import HTMLPage, method, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import Filter, CleanText, Regexp, Format, Env, CleanHTML, Attr
from weboob.capabilities.job import BaseJobAdvert

View file

@ -24,7 +24,8 @@ import re
from weboob.capabilities.bank import Account
from weboob.capabilities.base import NotAvailable
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, ListElement, ItemElement
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import Attr, CleanText, CleanDecimal, Filter, Field, MultiFilter, Date, Lower
from weboob.tools.capabilities.bank.transactions import FrenchTransaction

View file

@ -20,7 +20,8 @@
from weboob.capabilities.bill import Bill, Subscription
from weboob.tools.browser2 import HTMLPage, LoggedPage
from weboob.tools.browser2.filters import Filter, Attr, CleanText, Format, Field, Env
from weboob.tools.browser2.page import ListElement, ItemElement, method, pagination
from weboob.tools.browser2.page import method, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement
__all__ = ['BillsPage']

View file

@ -21,7 +21,8 @@
from decimal import Decimal
from weboob.capabilities.bank import Investment
from weboob.tools.browser2.page import RawPage, HTMLPage, method, ListElement, ItemElement
from weboob.tools.browser2.page import RawPage, HTMLPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanDecimal, CleanText, Date
from weboob.tools.capabilities.bank.transactions import FrenchTransaction

View file

@ -18,7 +18,8 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.bank import Recipient, AccountNotFound, Transfer
from weboob.tools.browser2.page import HTMLPage, LoggedPage, ListElement, ItemElement, method
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanText, CleanDecimal, Attr, Format
from .login import INGVirtKeyboard

View file

@ -25,7 +25,8 @@ import requests
from weboob.tools.capabilities.bank.transactions import FrenchTransaction
from weboob.tools.captcha.virtkeyboard import MappedVirtKeyboard, VirtKeyboardError
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, LoggedPage, pagination
from weboob.tools.browser2.page import HTMLPage, method, LoggedPage, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import Env, CleanDecimal, ParseError
__all__ = ['LoginPage', 'IndexPage', 'OperationsPage']

View file

@ -21,7 +21,8 @@ import re
from weboob.capabilities.paste import BasePaste, PasteNotFound
from weboob.tools.browser2.filters import CleanText, DateTime, Env, RawText, Regexp
from weboob.tools.browser2.page import HTMLPage, ItemElement, method, PagesBrowser, URL
from weboob.tools.browser2.page import HTMLPage, method, PagesBrowser, URL
from weboob.tools.browser2.elements import ItemElement
from weboob.tools.exceptions import BrowserHTTPNotFound

View file

@ -23,7 +23,8 @@ import re
from weboob.capabilities.paste import BasePaste, PasteNotFound
from weboob.tools.browser2 import HTMLPage, LoginBrowser, need_login, URL
from weboob.tools.browser2.filters import Attr, Base, CleanText, DateTime, Env, Filter, FilterError, RawText
from weboob.tools.browser2.page import ItemElement, method, RawPage
from weboob.tools.browser2.page import method, RawPage
from weboob.tools.browser2.elements import ItemElement
from weboob.tools.exceptions import BrowserHTTPNotFound, BrowserIncorrectPassword, BrowserUnavailable

View file

@ -18,7 +18,8 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.exceptions import BrowserBanned
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, ListElement, ItemElement, pagination
from weboob.tools.browser2.page import HTMLPage, LoggedPage, method, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanText, CleanDecimal, Field, Attr, DateTime, Link, Format
from weboob.capabilities.bill import Subscription, Detail

View file

@ -17,7 +17,8 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
from weboob.tools.browser2.page import HTMLPage, method
from weboob.tools.browser2.elements import ItemElement, SkipItem, ListElement
from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime
from weboob.tools.date import LinearDateGuesser
from weboob.capabilities.job import BaseJobAdvert

View file

@ -17,7 +17,8 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement
from weboob.tools.browser2.page import HTMLPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import Env, CleanText, Regexp, Field, DateTime, Map, Attr
from weboob.capabilities.gauge import Gauge, GaugeMeasure, GaugeSensor
from weboob.capabilities.base import NotAvailable, NotLoaded

View file

@ -22,7 +22,8 @@ from .calendar import SensCritiquenCalendarEvent
from datetime import date, datetime, time, timedelta
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, JsonPage
from weboob.tools.browser2.page import HTMLPage, method, JsonPage
from weboob.tools.browser2.elements import ItemElement, ListElement
from weboob.tools.browser2.filters import Filter, Link, CleanText, Regexp, Attr, Join, Format

View file

@ -23,7 +23,8 @@ from io import StringIO
import lxml.html as html
import urllib
from weboob.tools.browser2.page import HTMLPage, JsonPage, method, ListElement, ItemElement, FormNotFound, pagination
from weboob.tools.browser2.page import HTMLPage, JsonPage, method, FormNotFound, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import CleanText, Format, Link, Regexp, Env, DateTime, Attr, Filter
from weboob.capabilities.messages import Thread, Message
from weboob.capabilities.base import BaseObject

View file

@ -18,7 +18,8 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2.page import HTMLPage, XMLPage, method, ListElement, ItemElement, TableElement
from weboob.tools.browser2.page import HTMLPage, XMLPage, method
from weboob.tools.browser2.elements import ListElement, ItemElement, TableElement
from weboob.tools.browser2.filters import CleanText, TableCell, Filter
from weboob.capabilities.gauge import Gauge, GaugeMeasure, GaugeSensor

View file

@ -19,7 +19,8 @@
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import ListElement, method, ItemElement, pagination
from weboob.tools.browser2.page import method, pagination
from weboob.tools.browser2.elements import ListElement, ItemElement
from weboob.tools.browser2.filters import Link, CleanText, Duration, Regexp, CSS
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.image import BaseImage

View file

@ -20,8 +20,8 @@
import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import method, ItemElement
from weboob.tools.browser2.page import method, HTMLPage
from weboob.tools.browser2.elements import ItemElement
from weboob.tools.browser2.filters import CleanText, Env, Duration
from weboob.capabilities.video import BaseVideo
from weboob.tools.misc import to_unicode

View file

@ -0,0 +1,241 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2014 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import sys
from copy import deepcopy

from weboob.tools.log import getLogger
from weboob.tools.ordereddict import OrderedDict
from weboob.tools.browser2.page import NextPage

from .filters import _Filter, CleanText, AttributeNotFound, XPathNotFound
class DataError(Exception):
    """Raised when the data returned by pages is inconsistent."""
class AbstractElement(object):
    """
    Common base class for elements that extract data from a page.

    An element wraps a node (``el``) and owns an ``env`` mapping which is a
    deep copy of its parent's ``env`` or, when it has no parent, of the
    page's ``params``.
    """

    def __init__(self, page, parent=None, el=None):
        """
        :param page: page this element belongs to
        :param parent: optional enclosing element
        :param el: optional node to work on
        """
        self.page = page
        self.parent = parent

        # Node resolution order: explicit node, parent's node, whole document.
        if el is None:
            el = page.doc if parent is None else parent.el
        self.el = el

        # Work on a private copy so changes never leak to the parent or page.
        env_source = page.params if parent is None else parent.env
        self.env = deepcopy(env_source)

    def use_selector(self, func):
        """
        Resolve a selector to a value.

        Filters are called with this element, other callables are called
        without argument, and anything else is returned as a deep copy.
        """
        if isinstance(func, _Filter):
            return func(self)
        if callable(func):
            return func()
        return deepcopy(func)

    def parse(self, obj):
        """Hook called before extraction; does nothing by default."""
        pass

    def cssselect(self, *args, **kwargs):
        """Forward to ``cssselect`` on the wrapped node."""
        return self.el.cssselect(*args, **kwargs)

    def xpath(self, *args, **kwargs):
        """Forward to ``xpath`` on the wrapped node."""
        return self.el.xpath(*args, **kwargs)
class SkipItem(Exception):
    """
    Exception to raise from an :class:`ItemElement` subclass when the
    current item must be skipped.
    """
class _ItemElementMeta(type):
"""
Private meta-class used to keep order of obj_* attributes in :class:`ItemElement`.
"""
def __new__(mcs, name, bases, attrs):
_attrs = []
for base in bases:
if hasattr(base, '_attrs'):
_attrs += base._attrs
filters = [(re.sub('^obj_', '', attr_name), attrs[attr_name]) for attr_name, obj in attrs.items() if attr_name.startswith('obj_')]
# constants first, then filters, then methods
filters.sort(key=lambda x: x[1]._creation_counter if hasattr(x[1], '_creation_counter') else (sys.maxint if callable(x[1]) else 0))
new_class = super(_ItemElementMeta, mcs).__new__(mcs, name, bases, attrs)
new_class._attrs = _attrs + [f[0] for f in filters]
return new_class
class ItemElement(AbstractElement):
    """
    Element producing a single object.

    The object is created by :meth:`build_object` (unless one is supplied
    when calling the element) and each name listed in ``_attrs`` is set on it
    from the matching ``obj_<name>`` selector.
    """
    __metaclass__ = _ItemElementMeta

    # Names of the obj_* attributes, filled in by the meta-class.
    _attrs = None
    # Class instantiated by build_object().
    klass = None
    # Optional callable; when it returns a false value nothing is produced.
    condition = None
    # Optional callable receiving the built object; a false result drops it.
    validate = None

    class Index(object):
        pass

    def __init__(self, *args, **kwargs):
        super(ItemElement, self).__init__(*args, **kwargs)
        self.obj = None

    def build_object(self):
        """Return a new ``klass`` instance, or None when no class is set."""
        return None if self.klass is None else self.klass()

    def __call__(self, obj=None):
        """
        Fill and return the object.

        :param obj: existing object to fill instead of building a new one
        :returns: the filled object, or None when the item is not produced
        """
        if obj is not None:
            self.obj = obj
        return next(iter(self), None)

    def __iter__(self):
        """Yield the filled object, or nothing if it is skipped or invalid."""
        if self.condition is not None:
            if not self.condition():
                return

        try:
            if self.obj is None:
                self.obj = self.build_object()
            self.parse(self.el)
            for name in self._attrs:
                selector = getattr(self, 'obj_%s' % name)
                self.handle_attr(name, selector)
        except SkipItem:
            return

        if self.validate is None or self.validate(self.obj):
            yield self.obj

    def handle_attr(self, key, func):
        """Resolve selector ``func`` and store the result as ``self.obj.<key>``."""
        setattr(self.obj, key, self.use_selector(func))
class ListElement(AbstractElement):
    """
    Element producing several objects, by running its nested element classes
    on each node matched by ``item_xpath`` (or on its own node when no xpath
    is set).
    """
    # XPath selecting the nodes to process; None means "use self.el".
    item_xpath = None
    # When true, objects are only yielded once every node has been handled.
    flush_at_end = False
    # When true, an object whose id was already seen is dropped with a
    # warning instead of raising DataError.
    ignore_duplicate = False

    def __init__(self, *args, **kwargs):
        super(ListElement, self).__init__(*args, **kwargs)
        self.logger = getLogger(self.__class__.__name__.lower())
        # Ordered so that flush() yields objects in the order they were stored.
        self.objects = OrderedDict()

    def __call__(self, *args, **kwargs):
        """Store keyword arguments in ``env`` and return the object iterator."""
        for key, value in kwargs.items():
            self.env[key] = value

        return self.__iter__()

    def find_elements(self):
        """
        Get the nodes that will have to be processed.
        This method can be overridden if xpath filters are not
        sufficient.
        """
        if self.item_xpath is not None:
            for el in self.el.xpath(self.item_xpath):
                yield el
        else:
            yield self.el

    def __iter__(self):
        self.parse(self.el)

        for el in self.find_elements():
            for obj in self.handle_element(el):
                if not self.flush_at_end:
                    yield obj

        if self.flush_at_end:
            for obj in self.flush():
                yield obj

        self.check_next_page()

    def flush(self):
        """Yield every stored object; used when ``flush_at_end`` is set."""
        for obj in self.objects.values():
            yield obj

    def check_next_page(self):
        """
        Raise :class:`NextPage` if the optional ``next_page`` selector
        resolves to a value other than None.
        """
        if not hasattr(self, 'next_page'):
            return

        next_page = getattr(self, 'next_page')
        try:
            value = self.use_selector(next_page)
        except (AttributeNotFound, XPathNotFound):
            # No next page link could be found.
            return

        if value is None:
            return

        raise NextPage(value)

    def store(self, obj):
        """
        Remember ``obj`` under its id and return it.

        Objects with a false id are returned without being stored.

        :returns: the object, or None for an ignored duplicate
        :raises DataError: on a duplicate id when ``ignore_duplicate`` is false
        """
        if obj.id:
            if obj.id in self.objects:
                if self.ignore_duplicate:
                    self.logger.warning('There are two objects with the same ID! %s' % obj.id)
                    return
                else:
                    raise DataError('There are two objects with the same ID! %s' % obj.id)
            self.objects[obj.id] = obj
        return obj

    def handle_element(self, el):
        """Run every nested element class on ``el`` and yield the stored objects."""
        for attrname in dir(self):
            attr = getattr(self, attrname)
            # Only nested AbstractElement classes are run; our own class is skipped.
            if isinstance(attr, type) and issubclass(attr, AbstractElement) and attr != type(self):
                for obj in attr(self.page, self, el):
                    obj = self.store(obj)
                    if obj:
                        yield obj
class TableElement(ListElement):
    """
    List element for tables: maps each ``col_<name>`` attribute (a title or
    a list/tuple of accepted titles) to the index of the matching header
    cell found with ``head_xpath``.
    """
    # XPath selecting the header cells.
    head_xpath = None
    # Filter class whose ``clean`` turns a header node into text.
    cleaner = CleanText

    def __init__(self, *args, **kwargs):
        super(TableElement, self).__init__(*args, **kwargs)

        self._cols = {}

        # Collect the lowercased titles accepted for each declared column.
        wanted = {}
        for attrname in dir(self):
            match = re.match('col_(.*)', attrname)
            if match is None:
                continue
            labels = getattr(self, attrname)
            if not isinstance(labels, (list, tuple)):
                labels = [labels]
            wanted[match.group(1)] = [label.lower() for label in labels]

        # Match each header cell against the accepted titles.
        for index, header in enumerate(self.el.xpath(self.head_xpath)):
            heading = self.cleaner.clean(header).lower()
            for name, labels in wanted.iteritems():
                if heading in labels:
                    self._cols[name] = index

    def get_colnum(self, name):
        """Return the header index found for column ``name``, or None."""
        return self._cols.get(name)

View file

@ -24,15 +24,11 @@ try:
except ImportError:
from urllib import unquote
import re
import sys
from copy import deepcopy
from io import BytesIO
import requests
import lxml.html as html
import lxml.etree as etree
from weboob.tools.json import json
from weboob.tools.ordereddict import OrderedDict
from weboob.tools.regex_helper import normalize
from weboob.tools.compat import basestring
@ -40,7 +36,6 @@ from weboob.tools.compat import basestring
from weboob.tools.log import getLogger
from .browser import DomainBrowser
from .filters import _Filter, CleanText, AttributeNotFound, XPathNotFound
class UrlNotResolvable(Exception):
@ -49,12 +44,6 @@ class UrlNotResolvable(Exception):
"""
class DataError(Exception):
"""
Returned data from pages are incoherent.
"""
class URL(object):
"""
A description of an URL on the PagesBrowser website.
@ -538,6 +527,7 @@ class Form(OrderedDict):
class JsonPage(BasePage):
def __init__(self, browser, response, *args, **kwargs):
super(JsonPage, self).__init__(browser, response, *args, **kwargs)
from weboob.tools.json import json
self.doc = json.loads(response.text)
@ -550,6 +540,7 @@ class XMLPage(BasePage):
def __init__(self, browser, response, *args, **kwargs):
super(XMLPage, self).__init__(browser, response, *args, **kwargs)
import lxml.etree as etree
parser = etree.XMLParser(encoding=self.ENCODING or response.encoding)
self.doc = etree.parse(BytesIO(response.content), parser)
@ -575,6 +566,7 @@ class HTMLPage(BasePage):
def __init__(self, browser, response, *args, **kwargs):
super(HTMLPage, self).__init__(browser, response, *args, **kwargs)
self.encoding = self.ENCODING or response.encoding
import lxml.html as html
parser = html.HTMLParser(encoding=self.encoding)
self.doc = html.parse(BytesIO(response.content), parser)
@ -613,228 +605,6 @@ def method(klass):
return inner
class AbstractElement(object):
def __init__(self, page, parent=None, el=None):
self.page = page
self.parent = parent
if el is not None:
self.el = el
elif parent is not None:
self.el = parent.el
else:
self.el = page.doc
if parent is not None:
self.env = deepcopy(parent.env)
else:
self.env = deepcopy(page.params)
def use_selector(self, func):
if isinstance(func, _Filter):
value = func(self)
elif callable(func):
value = func()
else:
value = deepcopy(func)
return value
def parse(self, obj):
pass
def cssselect(self, *args, **kwargs):
return self.el.cssselect(*args, **kwargs)
def xpath(self, *args, **kwargs):
return self.el.xpath(*args, **kwargs)
class ListElement(AbstractElement):
item_xpath = None
flush_at_end = False
ignore_duplicate = False
def __init__(self, *args, **kwargs):
super(ListElement, self).__init__(*args, **kwargs)
self.logger = getLogger(self.__class__.__name__.lower())
self.objects = OrderedDict()
def __call__(self, *args, **kwargs):
for key, value in kwargs.iteritems():
self.env[key] = value
return self.__iter__()
def find_elements(self):
"""
Get the nodes that will have to be processed.
This method can be overridden if xpath filters are not
sufficient.
"""
if self.item_xpath is not None:
for el in self.el.xpath(self.item_xpath):
yield el
else:
yield self.el
def __iter__(self):
self.parse(self.el)
for el in self.find_elements():
for obj in self.handle_element(el):
if not self.flush_at_end:
yield obj
if self.flush_at_end:
for obj in self.flush():
yield obj
self.check_next_page()
def flush(self):
for obj in self.objects.itervalues():
yield obj
def check_next_page(self):
if not hasattr(self, 'next_page'):
return
next_page = getattr(self, 'next_page')
try:
value = self.use_selector(next_page)
except (AttributeNotFound, XPathNotFound):
return
if value is None:
return
raise NextPage(value)
def store(self, obj):
if obj.id:
if obj.id in self.objects:
if self.ignore_duplicate:
self.logger.warning('There are two objects with the same ID! %s' % obj.id)
return
else:
raise DataError('There are two objects with the same ID! %s' % obj.id)
self.objects[obj.id] = obj
return obj
def handle_element(self, el):
for attrname in dir(self):
attr = getattr(self, attrname)
if isinstance(attr, type) and issubclass(attr, AbstractElement) and attr != type(self):
for obj in attr(self.page, self, el):
obj = self.store(obj)
if obj:
yield obj
class SkipItem(Exception):
"""
Raise this exception in an :class:`ItemElement` subclass to skip an item.
"""
class _ItemElementMeta(type):
"""
Private meta-class used to keep order of obj_* attributes in :class:`ItemElement`.
"""
def __new__(mcs, name, bases, attrs):
_attrs = []
for base in bases:
if hasattr(base, '_attrs'):
_attrs += base._attrs
filters = [(re.sub('^obj_', '', attr_name), attrs[attr_name]) for attr_name, obj in attrs.items() if attr_name.startswith('obj_')]
# constants first, then filters, then methods
filters.sort(key=lambda x: x[1]._creation_counter if hasattr(x[1], '_creation_counter') else (sys.maxint if callable(x[1]) else 0))
new_class = super(_ItemElementMeta, mcs).__new__(mcs, name, bases, attrs)
new_class._attrs = _attrs + [f[0] for f in filters]
return new_class
class ItemElement(AbstractElement):
__metaclass__ = _ItemElementMeta
_attrs = None
klass = None
condition = None
validate = None
class Index(object):
pass
def __init__(self, *args, **kwargs):
super(ItemElement, self).__init__(*args, **kwargs)
self.obj = None
def build_object(self):
if self.klass is None:
return
return self.klass()
def __call__(self, obj=None):
if obj is not None:
self.obj = obj
for obj in self:
return obj
def __iter__(self):
if self.condition is not None and not self.condition():
return
try:
if self.obj is None:
self.obj = self.build_object()
self.parse(self.el)
for attr in self._attrs:
self.handle_attr(attr, getattr(self, 'obj_%s' % attr))
except SkipItem:
return
if self.validate is not None and not self.validate(self.obj):
return
yield self.obj
def handle_attr(self, key, func):
value = self.use_selector(func)
setattr(self.obj, key, value)
class TableElement(ListElement):
head_xpath = None
cleaner = CleanText
def __init__(self, *args, **kwargs):
super(TableElement, self).__init__(*args, **kwargs)
self._cols = {}
columns = {}
for attrname in dir(self):
m = re.match('col_(.*)', attrname)
if m:
cols = getattr(self, attrname)
if not isinstance(cols, (list,tuple)):
cols = [cols]
columns[m.group(1)] = [s.lower() for s in cols]
for colnum, el in enumerate(self.el.xpath(self.head_xpath)):
title = self.cleaner.clean(el).lower()
for name, titles in columns.iteritems():
if title in titles:
self._cols[name] = colnum
def get_colnum(self, name):
return self._cols.get(name, None)
class LoggedPage(object):
"""
A page that only logged users can reach. If we did not get a redirection

View file

@ -28,7 +28,7 @@ from weboob.tools.misc import to_unicode
from weboob.tools.log import getLogger
from weboob.tools.exceptions import ParseError
from weboob.tools.browser2.page import TableElement, ItemElement
from weboob.tools.browser2.elements import TableElement, ItemElement
from weboob.tools.browser2.filters import Filter, CleanText, CleanDecimal, TableCell