browser2 filters: Force unicode, minor style fixes

lxml on Python 2 tends to return str instead of unicode when the content is pure ASCII. Work around this by joining with unicode literals (u' ') so that joined results are always unicode.
2014-06-04 00:58:35 +02:00 · 2014-06-04 00:58:35 +02:00 · 04cec70e1f
commit 04cec70e1f
parent 560d36840d
1 changed files with 25 additions and 16 deletions
--- a/weboob/tools/browser2/filters.py
+++ b/weboob/tools/browser2/filters.py
@ -19,17 +19,17 @@
 from __future__ import absolute_import
 from dateutil.parser import parse as parse_date
 import datetime
 from decimal import Decimal, InvalidOperation
 import re
-import lxml.html as html
+from decimal import Decimal, InvalidOperation
 import lxml.html as html
 from dateutil.parser import parse as parse_date
 from weboob.capabilities.base import empty
 from weboob.tools.compat import basestring
 from weboob.tools.exceptions import ParseError
 from weboob.tools.misc import html2text
 from weboob.tools.compat import basestring
 from weboob.capabilities.base import empty
 _NO_DEFAULT = object()
@ -117,6 +117,7 @@ class Env(_Filter):
    It is used for example to get page parameters, or when there is a parse()
    method on ItemElement.
    """
    def __init__(self, name):
        super(Env, self).__init__()
        self.name = name
@ -157,6 +158,7 @@ class TableCell(_Filter):
        return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
 class Dict(Filter):
    @classmethod
    def select(cls, selector, item):
@ -181,10 +183,11 @@ class Dict(Filter):
    def filter(self, txt):
        return txt
 class CleanHTML(Filter):
    def filter(self, txt):
-        if isinstance(txt, (tuple,list)):
+        if isinstance(txt, (tuple, list)):
-            return ' '.join([self.clean(item) for item in txt])
+            return u' '.join([self.clean(item) for item in txt])
        return self.clean(txt)
    @classmethod
@ -194,7 +197,7 @@ class CleanHTML(Filter):
 class RawText(Filter):
    def filter(self, el):
-        if isinstance(el, (tuple,list)):
+        if isinstance(el, (tuple, list)):
            return u' '.join([self.filter(e) for e in el])
        if el.text is None:
@ -211,6 +214,7 @@ class CleanText(Filter):
    string.
    Second, it replaces all symbols given in second argument.
    """
    def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs):
        super(CleanText, self).__init__(selector, **kwargs)
        self.symbols = symbols
@ -218,8 +222,8 @@ class CleanText(Filter):
        self.childs = childs
    def filter(self, txt):
-        if isinstance(txt, (tuple,list)):
+        if isinstance(txt, (tuple, list)):
-            txt = ' '.join([self.clean(item, childs=self.childs) for item in txt])
+            txt = u' '.join([self.clean(item, childs=self.childs) for item in txt])
        txt = self.clean(txt, childs=self.childs)
        txt = self.remove(txt, self.symbols)
@ -259,6 +263,7 @@ class CleanDecimal(CleanText):
    """
    Get a cleaned Decimal value from an element.
    """
    def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT):
        super(CleanDecimal, self).__init__(selector, default=default)
        self.replace_dots = replace_dots
@ -266,7 +271,7 @@ class CleanDecimal(CleanText):
    def filter(self, text):
        text = super(CleanDecimal, self).filter(text)
        if self.replace_dots:
-            text = text.replace('.','').replace(',','.')
+            text = text.replace('.', '').replace(',', '.')
        try:
            return Decimal(re.sub(r'[^\d\-\.]', '', text))
        except InvalidOperation as e:
@ -293,6 +298,7 @@ class Link(Attr):
    If the <a> tag is not found, an exception IndexError is raised.
    """
    def __init__(self, selector, default=_NO_DEFAULT):
        super(Link, self).__init__(selector, 'href', default=default)
@ -301,6 +307,7 @@ class Field(_Filter):
    """
    Get the attribute of object.
    """
    def __init__(self, name):
        super(Field, self).__init__()
        self.name = name
@ -318,6 +325,7 @@ class Regexp(Filter):
    >>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>'))
    u'1988-08-13'
    """
    def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT):
        super(Regexp, self).__init__(selector, default=default)
        self.pattern = pattern
@ -325,8 +333,8 @@ class Regexp(Filter):
        self.template = template
    def filter(self, txt):
-        if isinstance(txt, (tuple,list)):
+        if isinstance(txt, (tuple, list)):
-            txt = ' '.join([t.strip() for t in txt.itertext()])
+            txt = u' '.join([t.strip() for t in txt.itertext()])
        mobj = self.regex.search(txt)
        if not mobj:
@ -339,6 +347,7 @@ class Regexp(Filter):
 class Map(Filter):
    def __init__(self, selector, map_dict, default=_NO_DEFAULT):
        super(Map, self).__init__(selector, default=default)
        self.map_dict = map_dict
@ -361,8 +370,8 @@ class DateTime(Filter):
            return self.default_or_raise(ParseError('Unable to parse %r' % txt))
        try:
            if self.translations:
-                 for search, repl in self.translations:
+                for search, repl in self.translations:
-                     txt = search.sub(repl, txt)
+                    txt = search.sub(repl, txt)
            return parse_date(txt, dayfirst=self.dayfirst)
        except ValueError as e:
            return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
@ -398,7 +407,7 @@ class DateGuesser(Filter):
        if len(values) == 2:
            day, month = map(int, values)
        else:
-            raise ParseError('Unable to take (day,month) tuple from %r' % values)
+            raise ParseError('Unable to take (day, month) tuple from %r' % values)
        return date_guesser.guess_date(day, month, **self.kwargs)