browser2 filters: Force unicode, little style fixes

lxml on Python 2 tends to return str instead of unicode when the
content is pure ASCII.
Force unicode results in the filters to work around this behaviour.
This commit is contained in:
Laurent Bachelier 2014-06-04 00:58:35 +02:00
commit 04cec70e1f

View file

@ -19,17 +19,17 @@
from __future__ import absolute_import from __future__ import absolute_import
from dateutil.parser import parse as parse_date
import datetime import datetime
from decimal import Decimal, InvalidOperation
import re import re
import lxml.html as html from decimal import Decimal, InvalidOperation
import lxml.html as html
from dateutil.parser import parse as parse_date
from weboob.capabilities.base import empty
from weboob.tools.compat import basestring
from weboob.tools.exceptions import ParseError from weboob.tools.exceptions import ParseError
from weboob.tools.misc import html2text from weboob.tools.misc import html2text
from weboob.tools.compat import basestring
from weboob.capabilities.base import empty
_NO_DEFAULT = object() _NO_DEFAULT = object()
@ -117,6 +117,7 @@ class Env(_Filter):
It is used for example to get page parameters, or when there is a parse() It is used for example to get page parameters, or when there is a parse()
method on ItemElement. method on ItemElement.
""" """
def __init__(self, name): def __init__(self, name):
super(Env, self).__init__() super(Env, self).__init__()
self.name = name self.name = name
@ -157,6 +158,7 @@ class TableCell(_Filter):
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names))) return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
class Dict(Filter): class Dict(Filter):
@classmethod @classmethod
def select(cls, selector, item): def select(cls, selector, item):
@ -181,10 +183,11 @@ class Dict(Filter):
def filter(self, txt): def filter(self, txt):
return txt return txt
class CleanHTML(Filter): class CleanHTML(Filter):
def filter(self, txt): def filter(self, txt):
if isinstance(txt, (tuple,list)): if isinstance(txt, (tuple, list)):
return ' '.join([self.clean(item) for item in txt]) return u' '.join([self.clean(item) for item in txt])
return self.clean(txt) return self.clean(txt)
@classmethod @classmethod
@ -194,7 +197,7 @@ class CleanHTML(Filter):
class RawText(Filter): class RawText(Filter):
def filter(self, el): def filter(self, el):
if isinstance(el, (tuple,list)): if isinstance(el, (tuple, list)):
return u' '.join([self.filter(e) for e in el]) return u' '.join([self.filter(e) for e in el])
if el.text is None: if el.text is None:
@ -211,6 +214,7 @@ class CleanText(Filter):
string. string.
Second, it replaces all symbols given in second argument. Second, it replaces all symbols given in second argument.
""" """
def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs): def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs):
super(CleanText, self).__init__(selector, **kwargs) super(CleanText, self).__init__(selector, **kwargs)
self.symbols = symbols self.symbols = symbols
@ -218,8 +222,8 @@ class CleanText(Filter):
self.childs = childs self.childs = childs
def filter(self, txt): def filter(self, txt):
if isinstance(txt, (tuple,list)): if isinstance(txt, (tuple, list)):
txt = ' '.join([self.clean(item, childs=self.childs) for item in txt]) txt = u' '.join([self.clean(item, childs=self.childs) for item in txt])
txt = self.clean(txt, childs=self.childs) txt = self.clean(txt, childs=self.childs)
txt = self.remove(txt, self.symbols) txt = self.remove(txt, self.symbols)
@ -259,6 +263,7 @@ class CleanDecimal(CleanText):
""" """
Get a cleaned Decimal value from an element. Get a cleaned Decimal value from an element.
""" """
def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT): def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT):
super(CleanDecimal, self).__init__(selector, default=default) super(CleanDecimal, self).__init__(selector, default=default)
self.replace_dots = replace_dots self.replace_dots = replace_dots
@ -266,7 +271,7 @@ class CleanDecimal(CleanText):
def filter(self, text): def filter(self, text):
text = super(CleanDecimal, self).filter(text) text = super(CleanDecimal, self).filter(text)
if self.replace_dots: if self.replace_dots:
text = text.replace('.','').replace(',','.') text = text.replace('.', '').replace(',', '.')
try: try:
return Decimal(re.sub(r'[^\d\-\.]', '', text)) return Decimal(re.sub(r'[^\d\-\.]', '', text))
except InvalidOperation as e: except InvalidOperation as e:
@ -293,6 +298,7 @@ class Link(Attr):
If the <a> tag is not found, an exception IndexError is raised. If the <a> tag is not found, an exception IndexError is raised.
""" """
def __init__(self, selector, default=_NO_DEFAULT): def __init__(self, selector, default=_NO_DEFAULT):
super(Link, self).__init__(selector, 'href', default=default) super(Link, self).__init__(selector, 'href', default=default)
@ -301,6 +307,7 @@ class Field(_Filter):
""" """
Get the attribute of object. Get the attribute of object.
""" """
def __init__(self, name): def __init__(self, name):
super(Field, self).__init__() super(Field, self).__init__()
self.name = name self.name = name
@ -318,6 +325,7 @@ class Regexp(Filter):
>>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>')) >>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>'))
u'1988-08-13' u'1988-08-13'
""" """
def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT): def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT):
super(Regexp, self).__init__(selector, default=default) super(Regexp, self).__init__(selector, default=default)
self.pattern = pattern self.pattern = pattern
@ -325,8 +333,8 @@ class Regexp(Filter):
self.template = template self.template = template
def filter(self, txt): def filter(self, txt):
if isinstance(txt, (tuple,list)): if isinstance(txt, (tuple, list)):
txt = ' '.join([t.strip() for t in txt.itertext()]) txt = u' '.join([t.strip() for t in txt.itertext()])
mobj = self.regex.search(txt) mobj = self.regex.search(txt)
if not mobj: if not mobj:
@ -339,6 +347,7 @@ class Regexp(Filter):
class Map(Filter): class Map(Filter):
def __init__(self, selector, map_dict, default=_NO_DEFAULT): def __init__(self, selector, map_dict, default=_NO_DEFAULT):
super(Map, self).__init__(selector, default=default) super(Map, self).__init__(selector, default=default)
self.map_dict = map_dict self.map_dict = map_dict
@ -361,8 +370,8 @@ class DateTime(Filter):
return self.default_or_raise(ParseError('Unable to parse %r' % txt)) return self.default_or_raise(ParseError('Unable to parse %r' % txt))
try: try:
if self.translations: if self.translations:
for search, repl in self.translations: for search, repl in self.translations:
txt = search.sub(repl, txt) txt = search.sub(repl, txt)
return parse_date(txt, dayfirst=self.dayfirst) return parse_date(txt, dayfirst=self.dayfirst)
except ValueError as e: except ValueError as e:
return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e))) return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
@ -398,7 +407,7 @@ class DateGuesser(Filter):
if len(values) == 2: if len(values) == 2:
day, month = map(int, values) day, month = map(int, values)
else: else:
raise ParseError('Unable to take (day,month) tuple from %r' % values) raise ParseError('Unable to take (day, month) tuple from %r' % values)
return date_guesser.guess_date(day, month, **self.kwargs) return date_guesser.guess_date(day, month, **self.kwargs)