add exceptions, add _Filter.default_or_raise, and other fixes

This commit is contained in:
Romain Bignon 2014-03-25 22:28:09 +01:00
commit 698ed5fb06
2 changed files with 73 additions and 48 deletions

View file

@ -23,18 +23,55 @@ from dateutil.parser import parse as parse_date
import datetime
from decimal import Decimal, InvalidOperation
import re
from weboob.capabilities.base import empty
_NO_DEFAULT = object()
# Root of the parsing exception hierarchy (FilterError derives from it).
# The DateTime and Time filters raise it directly, through default_or_raise,
# when their input text cannot be parsed and no default was supplied.
class ParseError(Exception):
pass
# Common base class for the errors filters raise when a lookup fails:
# missing XPath match, table column, attribute, regexp match or map item.
class FilterError(ParseError):
pass
# Raised by Attr.filter when the selector matched no element (indexing the
# result with [0] fails) and the filter has no default value.
class XPathNotFound(FilterError):
pass
# Raised by TableCell when none of the requested column names can be found
# and the filter has no default value.
class ColumnNotFound(FilterError):
pass
# Raised by Attr.filter when the matched element does not carry the requested
# attribute and the filter has no default value.
class AttributeNotFound(FilterError):
pass
# Raised by Regexp.filter when the pattern does not match the input text and
# the filter has no default value.
class RegexpError(FilterError):
pass
# Raised by Map.filter when the input text is not a key of the mapping dict
# and the filter has no default value.
class ItemNotFound(FilterError):
pass
class _Filter(object):
_creation_counter = 0
def __init__(self):
def __init__(self, default=_NO_DEFAULT):
self.default = default
self._creation_counter = _Filter._creation_counter
_Filter._creation_counter += 1
def default_or_raise(self, exception):
if self.default is not _NO_DEFAULT:
return self.default
else:
raise exception
class Filter(_Filter):
"""
@ -50,8 +87,8 @@ class Filter(_Filter):
Decimal('229.90')
"""
def __init__(self, selector=None):
super(Filter, self).__init__()
def __init__(self, selector=None, default=_NO_DEFAULT):
super(Filter, self).__init__(default=default)
self.selector = selector
@classmethod
@ -108,9 +145,8 @@ class TableCell(_Filter):
"""
def __init__(self, *names, **kwargs):
super(TableCell, self).__init__()
super(TableCell, self).__init__(**kwargs)
self.names = names
self.default = kwargs.pop('default', _NO_DEFAULT)
def __call__(self, item):
for name in self.names:
@ -118,9 +154,7 @@ class TableCell(_Filter):
if idx is not None:
return item.xpath('./td[%s]' % (idx + 1))
if self.default is not _NO_DEFAULT:
return self.default
raise KeyError('Unable to find column %s' % ' or '.join(self.names))
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
class CleanText(Filter):
@ -162,9 +196,8 @@ class CleanDecimal(CleanText):
Get a cleaned Decimal value from an element.
"""
def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT):
super(CleanDecimal, self).__init__(selector)
super(CleanDecimal, self).__init__(selector, default=default)
self.replace_dots = replace_dots
self.default = default
def filter(self, text):
text = super(CleanDecimal, self).filter(text)
@ -173,30 +206,21 @@ class CleanDecimal(CleanText):
try:
return Decimal(re.sub(ur'[^\d\-\.]', '', text))
except InvalidOperation as e:
if self.default is not _NO_DEFAULT:
return Decimal(self.default)
else:
raise InvalidOperation(e)
return self.default_or_raise(e)
class Attr(Filter):
def __init__(self, selector, attr, default=_NO_DEFAULT):
super(Attr, self).__init__(selector)
super(Attr, self).__init__(selector, default=default)
self.attr = attr
self.default = default
def filter(self, el):
try:
return el[0].attrib[self.attr]
except IndexError:
if self.default is not _NO_DEFAULT:
return self.default
raise ValueError('Unable to find link %s' % self.selector)
return self.default_or_raise(XPathNotFound('Unable to find link %s' % self.selector))
except KeyError:
if self.default is not _NO_DEFAULT:
return self.default
else:
raise KeyError('Link %s does not has attribute %s' % (el[0], self.attr))
return self.default_or_raise(AttributeNotFound('Link %s does not has attribute %s' % (el[0], self.attr)))
class Link(Attr):
@ -206,7 +230,7 @@ class Link(Attr):
If the <a> tag is not found, an XPathNotFound exception is raised, unless a default value was given.
"""
def __init__(self, selector, default=_NO_DEFAULT):
super(Link, self).__init__(selector, 'href', default)
super(Link, self).__init__(selector, 'href', default=default)
class Field(_Filter):
@ -231,11 +255,10 @@ class Regexp(Filter):
u'1988-08-13'
"""
def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT):
super(Regexp, self).__init__(selector)
super(Regexp, self).__init__(selector, default=default)
self.pattern = pattern
self.regex = re.compile(pattern, flags)
self.template = template
self.default = default
def filter(self, txt):
if isinstance(txt, (tuple,list)):
@ -243,10 +266,7 @@ class Regexp(Filter):
mobj = self.regex.search(txt)
if not mobj:
if self.default is not _NO_DEFAULT:
return self.default
else:
raise KeyError('Unable to match %s in %r' % (self.pattern, txt))
return self.default_or_raise(RegexpError('Unable to match %s in %r' % (self.pattern, txt)))
if self.template is None:
return next(g for g in mobj.groups() if g is not None)
@ -256,25 +276,24 @@ class Regexp(Filter):
class Map(Filter):
def __init__(self, selector, map_dict, default=_NO_DEFAULT):
super(Map, self).__init__(selector)
super(Map, self).__init__(selector, default=default)
self.map_dict = map_dict
self.default = default
def filter(self, txt):
try:
return self.map_dict[txt]
except KeyError:
if self.default is not _NO_DEFAULT:
return self.default
else:
raise KeyError('Unable to handle %r' % txt)
return self.default_or_raise(ItemNotFound('Unable to handle %r on %r' % (txt, self.map_dict)))
class DateTime(Filter):
def filter(self, txt):
if empty(txt):
return txt
return parse_date(txt)
try:
return parse_date(txt)
except ValueError as e:
return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
class Date(DateTime):
@ -290,8 +309,7 @@ class Time(Filter):
kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'}
def __init__(self, selector, default=_NO_DEFAULT):
super(Time, self).__init__(selector)
self.default = default
super(Time, self).__init__(selector, default=default)
def filter(self, txt):
m = self.regexp.search(txt)
@ -301,10 +319,7 @@ class Time(Filter):
kwargs[key] = int(m.groupdict()[index] or 0)
return self.klass(**kwargs)
if self.default is not _NO_DEFAULT:
return self.default
else:
raise ValueError('Unable to find time in %r' % txt)
return self.default_or_raise(ParseError('Unable to find time in %r' % txt))
class Duration(Time):

View file

@ -19,6 +19,7 @@
from __future__ import absolute_import
from urllib import unquote
import requests
import re
import sys
@ -33,13 +34,17 @@ from weboob.tools.regex_helper import normalize
from weboob.tools.log import getLogger
from .browser import DomainBrowser
from .filters import _Filter, CleanText
from .filters import _Filter, CleanText, AttributeNotFound, XPathNotFound
# NOTE(review): the raise sites are outside this excerpt; presumably signals
# that a URL object could not build or resolve a concrete URL -- confirm.
class UrlNotResolvable(Exception):
pass
# Raised by ListElement.store when two stored objects share the same ID.
class DataError(Exception):
pass
class URL(object):
"""
A description of an URL on the PagesBrowser website.
@ -68,12 +73,17 @@ class URL(object):
If arguments are provided, and only then, they are checked against the arguments
that were used to build the current page URL.
"""
assert self.klass is not None, "You can use this method only if the is a BasePage class handler."
if len(kwargs):
params = self.match(self.browser.absurl(self.build(**kwargs), base=True)).groupdict()
else:
params = None
return self.browser.page and self.klass and isinstance(self.browser.page, self.klass) \
and (params is None or params == self.browser.page.params)
# XXX use unquote on current params values because if there are spaces
# or special characters in them, they are encoded only in the page params but not in kwargs.
return self.browser.page and isinstance(self.browser.page, self.klass) \
and (params is None or params == dict([(k,unquote(v)) for k,v in self.browser.page.params.iteritems()]))
def stay_or_go(self, **kwargs):
"""
@ -525,7 +535,7 @@ class ListElement(AbstractElement):
next_page = getattr(self, 'next_page')
try:
value = self.use_selector(next_page)
except IndexError:
except (AttributeNotFound, XPathNotFound):
return
if value is None:
@ -537,7 +547,7 @@ class ListElement(AbstractElement):
def store(self, obj):
if obj.id:
if obj.id in self.objects:
raise ValueError('There are two objects with the same ID! %s' % obj.id)
raise DataError('There are two objects with the same ID! %s' % obj.id)
self.objects[obj.id] = obj
return obj