browser2 filters: Force unicode, little style fixes
lxml on Python 2 tends to return str instead of unicode when the contents are pure ASCII. Force unicode results in the filters (e.g. by joining with u' ') to work around this.
This commit is contained in:
parent
560d36840d
commit
04cec70e1f
1 changed file with 25 additions and 16 deletions
|
|
@ -19,17 +19,17 @@
|
||||||
|
|
||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
from dateutil.parser import parse as parse_date
|
|
||||||
import datetime
|
import datetime
|
||||||
from decimal import Decimal, InvalidOperation
|
|
||||||
import re
|
import re
|
||||||
import lxml.html as html
|
from decimal import Decimal, InvalidOperation
|
||||||
|
|
||||||
|
import lxml.html as html
|
||||||
|
from dateutil.parser import parse as parse_date
|
||||||
|
|
||||||
|
from weboob.capabilities.base import empty
|
||||||
|
from weboob.tools.compat import basestring
|
||||||
from weboob.tools.exceptions import ParseError
|
from weboob.tools.exceptions import ParseError
|
||||||
from weboob.tools.misc import html2text
|
from weboob.tools.misc import html2text
|
||||||
from weboob.tools.compat import basestring
|
|
||||||
from weboob.capabilities.base import empty
|
|
||||||
|
|
||||||
|
|
||||||
_NO_DEFAULT = object()
|
_NO_DEFAULT = object()
|
||||||
|
|
||||||
|
|
@ -117,6 +117,7 @@ class Env(_Filter):
|
||||||
It is used for example to get page parameters, or when there is a parse()
|
It is used for example to get page parameters, or when there is a parse()
|
||||||
method on ItemElement.
|
method on ItemElement.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
super(Env, self).__init__()
|
super(Env, self).__init__()
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
@ -157,6 +158,7 @@ class TableCell(_Filter):
|
||||||
|
|
||||||
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
|
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
|
||||||
|
|
||||||
|
|
||||||
class Dict(Filter):
|
class Dict(Filter):
|
||||||
@classmethod
|
@classmethod
|
||||||
def select(cls, selector, item):
|
def select(cls, selector, item):
|
||||||
|
|
@ -181,10 +183,11 @@ class Dict(Filter):
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
||||||
class CleanHTML(Filter):
|
class CleanHTML(Filter):
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
if isinstance(txt, (tuple,list)):
|
if isinstance(txt, (tuple, list)):
|
||||||
return ' '.join([self.clean(item) for item in txt])
|
return u' '.join([self.clean(item) for item in txt])
|
||||||
return self.clean(txt)
|
return self.clean(txt)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -194,7 +197,7 @@ class CleanHTML(Filter):
|
||||||
|
|
||||||
class RawText(Filter):
|
class RawText(Filter):
|
||||||
def filter(self, el):
|
def filter(self, el):
|
||||||
if isinstance(el, (tuple,list)):
|
if isinstance(el, (tuple, list)):
|
||||||
return u' '.join([self.filter(e) for e in el])
|
return u' '.join([self.filter(e) for e in el])
|
||||||
|
|
||||||
if el.text is None:
|
if el.text is None:
|
||||||
|
|
@ -211,6 +214,7 @@ class CleanText(Filter):
|
||||||
string.
|
string.
|
||||||
Second, it replaces all symbols given in second argument.
|
Second, it replaces all symbols given in second argument.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs):
|
def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs):
|
||||||
super(CleanText, self).__init__(selector, **kwargs)
|
super(CleanText, self).__init__(selector, **kwargs)
|
||||||
self.symbols = symbols
|
self.symbols = symbols
|
||||||
|
|
@ -218,8 +222,8 @@ class CleanText(Filter):
|
||||||
self.childs = childs
|
self.childs = childs
|
||||||
|
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
if isinstance(txt, (tuple,list)):
|
if isinstance(txt, (tuple, list)):
|
||||||
txt = ' '.join([self.clean(item, childs=self.childs) for item in txt])
|
txt = u' '.join([self.clean(item, childs=self.childs) for item in txt])
|
||||||
|
|
||||||
txt = self.clean(txt, childs=self.childs)
|
txt = self.clean(txt, childs=self.childs)
|
||||||
txt = self.remove(txt, self.symbols)
|
txt = self.remove(txt, self.symbols)
|
||||||
|
|
@ -259,6 +263,7 @@ class CleanDecimal(CleanText):
|
||||||
"""
|
"""
|
||||||
Get a cleaned Decimal value from an element.
|
Get a cleaned Decimal value from an element.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT):
|
def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT):
|
||||||
super(CleanDecimal, self).__init__(selector, default=default)
|
super(CleanDecimal, self).__init__(selector, default=default)
|
||||||
self.replace_dots = replace_dots
|
self.replace_dots = replace_dots
|
||||||
|
|
@ -266,7 +271,7 @@ class CleanDecimal(CleanText):
|
||||||
def filter(self, text):
|
def filter(self, text):
|
||||||
text = super(CleanDecimal, self).filter(text)
|
text = super(CleanDecimal, self).filter(text)
|
||||||
if self.replace_dots:
|
if self.replace_dots:
|
||||||
text = text.replace('.','').replace(',','.')
|
text = text.replace('.', '').replace(',', '.')
|
||||||
try:
|
try:
|
||||||
return Decimal(re.sub(r'[^\d\-\.]', '', text))
|
return Decimal(re.sub(r'[^\d\-\.]', '', text))
|
||||||
except InvalidOperation as e:
|
except InvalidOperation as e:
|
||||||
|
|
@ -293,6 +298,7 @@ class Link(Attr):
|
||||||
|
|
||||||
If the <a> tag is not found, an exception IndexError is raised.
|
If the <a> tag is not found, an exception IndexError is raised.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, selector, default=_NO_DEFAULT):
|
def __init__(self, selector, default=_NO_DEFAULT):
|
||||||
super(Link, self).__init__(selector, 'href', default=default)
|
super(Link, self).__init__(selector, 'href', default=default)
|
||||||
|
|
||||||
|
|
@ -301,6 +307,7 @@ class Field(_Filter):
|
||||||
"""
|
"""
|
||||||
Get the attribute of object.
|
Get the attribute of object.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
super(Field, self).__init__()
|
super(Field, self).__init__()
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
@ -318,6 +325,7 @@ class Regexp(Filter):
|
||||||
>>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>'))
|
>>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>'))
|
||||||
u'1988-08-13'
|
u'1988-08-13'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT):
|
def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT):
|
||||||
super(Regexp, self).__init__(selector, default=default)
|
super(Regexp, self).__init__(selector, default=default)
|
||||||
self.pattern = pattern
|
self.pattern = pattern
|
||||||
|
|
@ -325,8 +333,8 @@ class Regexp(Filter):
|
||||||
self.template = template
|
self.template = template
|
||||||
|
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
if isinstance(txt, (tuple,list)):
|
if isinstance(txt, (tuple, list)):
|
||||||
txt = ' '.join([t.strip() for t in txt.itertext()])
|
txt = u' '.join([t.strip() for t in txt.itertext()])
|
||||||
|
|
||||||
mobj = self.regex.search(txt)
|
mobj = self.regex.search(txt)
|
||||||
if not mobj:
|
if not mobj:
|
||||||
|
|
@ -339,6 +347,7 @@ class Regexp(Filter):
|
||||||
|
|
||||||
|
|
||||||
class Map(Filter):
|
class Map(Filter):
|
||||||
|
|
||||||
def __init__(self, selector, map_dict, default=_NO_DEFAULT):
|
def __init__(self, selector, map_dict, default=_NO_DEFAULT):
|
||||||
super(Map, self).__init__(selector, default=default)
|
super(Map, self).__init__(selector, default=default)
|
||||||
self.map_dict = map_dict
|
self.map_dict = map_dict
|
||||||
|
|
@ -361,8 +370,8 @@ class DateTime(Filter):
|
||||||
return self.default_or_raise(ParseError('Unable to parse %r' % txt))
|
return self.default_or_raise(ParseError('Unable to parse %r' % txt))
|
||||||
try:
|
try:
|
||||||
if self.translations:
|
if self.translations:
|
||||||
for search, repl in self.translations:
|
for search, repl in self.translations:
|
||||||
txt = search.sub(repl, txt)
|
txt = search.sub(repl, txt)
|
||||||
return parse_date(txt, dayfirst=self.dayfirst)
|
return parse_date(txt, dayfirst=self.dayfirst)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
|
return self.default_or_raise(ParseError('Unable to parse %r: %s' % (txt, e)))
|
||||||
|
|
@ -398,7 +407,7 @@ class DateGuesser(Filter):
|
||||||
if len(values) == 2:
|
if len(values) == 2:
|
||||||
day, month = map(int, values)
|
day, month = map(int, values)
|
||||||
else:
|
else:
|
||||||
raise ParseError('Unable to take (day,month) tuple from %r' % values)
|
raise ParseError('Unable to take (day, month) tuple from %r' % values)
|
||||||
return date_guesser.guess_date(day, month, **self.kwargs)
|
return date_guesser.guess_date(day, month, **self.kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue