Move CleanHTML to html filters

This commit is contained in:
Florent 2014-09-27 22:06:07 +02:00
commit fbd8cf1a64
2 changed files with 17 additions and 17 deletions

View file

@ -18,10 +18,13 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import lxml.html as html
from .standard import _Selector, _NO_DEFAULT, Filter, FilterError from .standard import _Selector, _NO_DEFAULT, Filter, FilterError
from weboob.tools.html import html2text
__all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound', 'Attr', 'Link'] __all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound',
'Attr', 'Link', 'CleanHTML']
class XPathNotFound(FilterError): class XPathNotFound(FilterError):
@ -65,3 +68,15 @@ class Link(Attr):
def __init__(self, selector=None, default=_NO_DEFAULT): def __init__(self, selector=None, default=_NO_DEFAULT):
super(Link, self).__init__(selector, 'href', default=default) super(Link, self).__init__(selector, 'href', default=default)
class CleanHTML(Filter):
    """
    Filter turning HTML content into text by running it through html2text.

    The value may be a string of markup, a non-string object that
    ``lxml.html.tostring`` can serialise, or a tuple/list of such values.
    """

    def filter(self, txt):
        """
        Clean ``txt`` and return the resulting text.

        A tuple or list is cleaned item by item and the pieces are joined
        with a single space; any other value is cleaned directly.
        """
        if not isinstance(txt, (tuple, list)):
            return self.clean(txt)
        cleaned = [self.clean(node) for node in txt]
        return u' '.join(cleaned)

    @classmethod
    def clean(cls, txt):
        """
        Convert a single value to text with html2text.

        Strings are used as they are; anything else is first serialised to
        a unicode string with ``lxml.html.tostring``.
        """
        if isinstance(txt, basestring):
            markup = txt
        else:
            markup = html.tostring(txt, encoding=unicode)
        return html2text(markup)

View file

@ -29,7 +29,6 @@ from dateutil.parser import parse as parse_date
from weboob.capabilities.base import empty from weboob.capabilities.base import empty
from weboob.tools.compat import basestring from weboob.tools.compat import basestring
from weboob.tools.exceptions import ParseError from weboob.tools.exceptions import ParseError
from weboob.tools.html import html2text
from weboob.tools.browser2 import URL from weboob.tools.browser2 import URL
class NoDefault(object): class NoDefault(object):
@ -40,7 +39,7 @@ _NO_DEFAULT = NoDefault()
__all__ = ['FilterError', 'ColumnNotFound', 'RegexpError', 'ItemNotFound', __all__ = ['FilterError', 'ColumnNotFound', 'RegexpError', 'ItemNotFound',
'Filter', 'Base', 'Env', 'TableCell', 'CleanHTML', 'RawText', 'Filter', 'Base', 'Env', 'TableCell', 'RawText',
'CleanText', 'Lower', 'CleanDecimal', 'Field', 'Regexp', 'Map', 'CleanText', 'Lower', 'CleanDecimal', 'Field', 'Regexp', 'Map',
'DateTime', 'Date', 'Time', 'DateGuesser', 'Duration', 'DateTime', 'Date', 'Time', 'DateGuesser', 'Duration',
'MultiFilter', 'CombineDate', 'Format', 'Join', 'Type', 'MultiFilter', 'CombineDate', 'Format', 'Join', 'Type',
@ -223,20 +222,6 @@ class TableCell(_Filter):
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names))) return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
class CleanHTML(Filter):
    """
    Filter that renders HTML content as text using html2text.

    Accepts a markup string, a non-string object serialisable by
    ``lxml.html.tostring``, or a tuple/list mixing both.
    """

    def filter(self, txt):
        """
        Return the cleaned text for ``txt``.

        Sequences (tuple or list) are cleaned element by element and the
        results are joined with one space; other values are cleaned as-is.
        """
        is_sequence = isinstance(txt, (tuple, list))
        if is_sequence:
            parts = [self.clean(element) for element in txt]
            return u' '.join(parts)
        return self.clean(txt)

    @classmethod
    def clean(cls, txt):
        """
        Run one value through html2text.

        A string is passed straight to html2text; any other object is
        serialised to unicode markup with ``lxml.html.tostring`` first.
        """
        if isinstance(txt, basestring):
            return html2text(txt)
        # lxml is imported here, so it is only loaded on the non-string path.
        import lxml.html as html
        serialized = html.tostring(txt, encoding=unicode)
        return html2text(serialized)
class RawText(Filter): class RawText(Filter):
def filter(self, el): def filter(self, el):
if isinstance(el, (tuple, list)): if isinstance(el, (tuple, list)):