From fbd8cf1a647908ce17231d5f6861e7a7dcef6480 Mon Sep 17 00:00:00 2001 From: Florent Date: Sat, 27 Sep 2014 22:06:07 +0200 Subject: [PATCH] Move CleanHTML to html filters --- weboob/tools/browser2/filters/html.py | 17 ++++++++++++++++- weboob/tools/browser2/filters/standard.py | 17 +---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/weboob/tools/browser2/filters/html.py b/weboob/tools/browser2/filters/html.py index f7a4b1dd..b3e28fbf 100644 --- a/weboob/tools/browser2/filters/html.py +++ b/weboob/tools/browser2/filters/html.py @@ -18,10 +18,13 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. +import lxml.html as html from .standard import _Selector, _NO_DEFAULT, Filter, FilterError +from weboob.tools.html import html2text -__all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound', 'Attr', 'Link'] +__all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound', + 'Attr', 'Link', 'CleanHTML'] class XPathNotFound(FilterError): @@ -65,3 +68,15 @@ class Link(Attr): def __init__(self, selector=None, default=_NO_DEFAULT): super(Link, self).__init__(selector, 'href', default=default) + +class CleanHTML(Filter): + def filter(self, txt): + if isinstance(txt, (tuple, list)): + return u' '.join([self.clean(item) for item in txt]) + return self.clean(txt) + + @classmethod + def clean(cls, txt): + if not isinstance(txt, basestring): + txt = html.tostring(txt, encoding=unicode) + return html2text(txt) diff --git a/weboob/tools/browser2/filters/standard.py b/weboob/tools/browser2/filters/standard.py index 29dd7478..217b279f 100644 --- a/weboob/tools/browser2/filters/standard.py +++ b/weboob/tools/browser2/filters/standard.py @@ -29,7 +29,6 @@ from dateutil.parser import parse as parse_date from weboob.capabilities.base import empty from weboob.tools.compat import basestring from weboob.tools.exceptions import ParseError -from weboob.tools.html import html2text from weboob.tools.browser2 import URL class NoDefault(object): @@ -40,7 +39,7 @@ _NO_DEFAULT = 
NoDefault() __all__ = ['FilterError', 'ColumnNotFound', 'RegexpError', 'ItemNotFound', - 'Filter', 'Base', 'Env', 'TableCell', 'CleanHTML', 'RawText', + 'Filter', 'Base', 'Env', 'TableCell', 'RawText', 'CleanText', 'Lower', 'CleanDecimal', 'Field', 'Regexp', 'Map', 'DateTime', 'Date', 'Time', 'DateGuesser', 'Duration', 'MultiFilter', 'CombineDate', 'Format', 'Join', 'Type', @@ -223,20 +222,6 @@ class TableCell(_Filter): return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names))) -class CleanHTML(Filter): - def filter(self, txt): - if isinstance(txt, (tuple, list)): - return u' '.join([self.clean(item) for item in txt]) - return self.clean(txt) - - @classmethod - def clean(cls, txt): - if not isinstance(txt, basestring): - import lxml.html as html - txt = html.tostring(txt, encoding=unicode) - return html2text(txt) - - class RawText(Filter): def filter(self, el): if isinstance(el, (tuple, list)):