From 66f3560b523f0de243a36e4045d5a61d54ad59e1 Mon Sep 17 00:00:00 2001
From: Bezleputh
Date: Thu, 27 Mar 2014 00:16:11 +0100
Subject: [PATCH] [browser2] add a CleanHTML filter

---
 weboob/tools/browser2/filters.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/weboob/tools/browser2/filters.py b/weboob/tools/browser2/filters.py
index 9086dec1..12d6a243 100644
--- a/weboob/tools/browser2/filters.py
+++ b/weboob/tools/browser2/filters.py
@@ -23,7 +23,9 @@ from dateutil.parser import parse as parse_date
 import datetime
 from decimal import Decimal, InvalidOperation
 import re
+import lxml.html as html
 
+from weboob.tools.misc import html2text
 from weboob.capabilities.base import empty
 
 
@@ -156,6 +158,9 @@ class TableCell(_Filter):
 
         return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
 
+class CleanHTML(Filter):
+    def filter(self, txt):
+        return html2text(html.tostring(txt[0], encoding=unicode))
 
 class CleanText(Filter):
     """