browser2 filters: Force unicode, little style fixes

lxml for Python 2 tends to return str instead of
unicode when the contents are pure ASCII.
Force unicode in the affected filters to work around this.
This commit is contained in:
Laurent Bachelier 2014-06-04 00:58:35 +02:00
commit 04cec70e1f

View file

@ -19,17 +19,17 @@
from __future__ import absolute_import
from dateutil.parser import parse as parse_date
import datetime
from decimal import Decimal, InvalidOperation
import re
import lxml.html as html
from decimal import Decimal, InvalidOperation
import lxml.html as html
from dateutil.parser import parse as parse_date
from weboob.capabilities.base import empty
from weboob.tools.compat import basestring
from weboob.tools.exceptions import ParseError
from weboob.tools.misc import html2text
from weboob.tools.compat import basestring
from weboob.capabilities.base import empty
_NO_DEFAULT = object()
@ -117,6 +117,7 @@ class Env(_Filter):
It is used for example to get page parameters, or when there is a parse()
method on ItemElement.
"""
def __init__(self, name):
super(Env, self).__init__()
self.name = name
@ -157,6 +158,7 @@ class TableCell(_Filter):
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
class Dict(Filter):
@classmethod
def select(cls, selector, item):
@ -181,10 +183,11 @@ class Dict(Filter):
def filter(self, txt):
return txt
class CleanHTML(Filter):
def filter(self, txt):
if isinstance(txt, (tuple,list)):
return ' '.join([self.clean(item) for item in txt])
if isinstance(txt, (tuple, list)):
return u' '.join([self.clean(item) for item in txt])
return self.clean(txt)
@classmethod
@ -194,7 +197,7 @@ class CleanHTML(Filter):
class RawText(Filter):
def filter(self, el):
if isinstance(el, (tuple,list)):
if isinstance(el, (tuple, list)):
return u' '.join([self.filter(e) for e in el])
if el.text is None:
@ -211,6 +214,7 @@ class CleanText(Filter):
string.
Second, it replaces all symbols given in second argument.
"""
def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs):
super(CleanText, self).__init__(selector, **kwargs)
self.symbols = symbols
@ -218,8 +222,8 @@ class CleanText(Filter):
self.childs = childs
def filter(self, txt):
if isinstance(txt, (tuple,list)):
txt = ' '.join([self.clean(item, childs=self.childs) for item in txt])
if isinstance(txt, (tuple, list)):
txt = u' '.join([self.clean(item, childs=self.childs) for item in txt])
txt = self.clean(txt, childs=self.childs)
txt = self.remove(txt, self.symbols)
@ -259,6 +263,7 @@ class CleanDecimal(CleanText):
"""
Get a cleaned Decimal value from an element.
"""
def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT):
super(CleanDecimal, self).__init__(selector, default=default)
self.replace_dots = replace_dots
@ -266,7 +271,7 @@ class CleanDecimal(CleanText):
def filter(self, text):
text = super(CleanDecimal, self).filter(text)
if self.replace_dots:
text = text.replace('.','').replace(',','.')
text = text.replace('.', '').replace(',', '.')
try:
return Decimal(re.sub(r'[^\d\-\.]', '', text))
except InvalidOperation as e:
@ -293,6 +298,7 @@ class Link(Attr):
If the <a> tag is not found, an exception IndexError is raised.
"""
def __init__(self, selector, default=_NO_DEFAULT):
super(Link, self).__init__(selector, 'href', default=default)
@ -301,6 +307,7 @@ class Field(_Filter):
"""
Get the attribute of object.
"""
def __init__(self, name):
super(Field, self).__init__()
self.name = name
@ -318,6 +325,7 @@ class Regexp(Filter):
>>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>'))
u'1988-08-13'
"""
def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT):
super(Regexp, self).__init__(selector, default=default)
self.pattern = pattern
@ -325,8 +333,8 @@ class Regexp(Filter):
self.template = template
def filter(self, txt):
if isinstance(txt, (tuple,list)):
txt = ' '.join([t.strip() for t in txt.itertext()])
if isinstance(txt, (tuple, list)):
txt = u' '.join([t.strip() for t in txt.itertext()])
mobj = self.regex.search(txt)
if not mobj:
@ -339,6 +347,7 @@ class Regexp(Filter):
class Map(Filter):
def __init__(self, selector, map_dict, default=_NO_DEFAULT):
super(Map, self).__init__(selector, default=default)
self.map_dict = map_dict
@ -361,8 +370,8 @@ class DateTime(Filter):
return self.default_or_raise(ParseError('Unable to parse %r' % txt))
try:
if self.translations:
for search, repl in self.translations:
txt = search.sub(repl, txt)
for search, repl in self.translations:
txt = search.sub(repl, txt)
return parse_date(txt, dayfirst=self.dayfirst)
except ValueError as e:
return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
@ -398,7 +407,7 @@ class DateGuesser(Filter):
if len(values) == 2:
day, month = map(int, values)
else:
raise ParseError('Unable to take (day,month) tuple from %r' % values)
raise ParseError('Unable to take (day, month) tuple from %r' % values)
return date_guesser.guess_date(day, month, **self.kwargs)