diff --git a/weboob/backends/bp/pages/accounthistory.py b/weboob/backends/bp/pages/accounthistory.py index fece0797..e44cdbdb 100644 --- a/weboob/backends/bp/pages/accounthistory.py +++ b/weboob/backends/bp/pages/accounthistory.py @@ -22,7 +22,6 @@ import re from weboob.capabilities.bank import Operation from weboob.tools.browser import BasePage -from weboob.tools.misc import remove_html_tags __all__ = ['AccountHistory'] @@ -40,7 +39,7 @@ class AccountHistory(BasePage): operation = Operation(len(operations)) operation.date = mvt.xpath("./td/span")[0].text tmp = mvt.xpath("./td/span")[1] - operation.label = remove_html_tags(self.parser.tostring(tmp)).strip() + operation.label = self.parser.tocleanstring(tmp) r = re.compile(r'\d+') diff --git a/weboob/tools/misc.py b/weboob/tools/misc.py index 18cb92ec..8dcc7e11 100644 --- a/weboob/tools/misc.py +++ b/weboob/tools/misc.py @@ -24,7 +24,6 @@ from dateutil import tz from logging import warning from time import time, sleep from tempfile import gettempdir -import re import os import sys import traceback @@ -62,10 +61,6 @@ def get_bytes_size(size, unit_name): } return float(size * unit_data.get(unit_name, 1)) -def remove_html_tags(data): - p = re.compile(r'<.*?>') - return p.sub(' ', data) - try: import html2text as h2t h2t.UNICODE_SNOB = 1 diff --git a/weboob/tools/parsers/iparser.py b/weboob/tools/parsers/iparser.py index f7d6b106..52fed24d 100644 --- a/weboob/tools/parsers/iparser.py +++ b/weboob/tools/parsers/iparser.py @@ -18,6 +18,12 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. +import re + + +__all__ = ['IParser'] + + class IParser(object): def parse(self, data, encoding=None): """ @@ -34,3 +40,16 @@ class IParser(object): Get HTML string from an element. """ raise NotImplementedError() + + def tocleanstring(self, elem): + """ + Get a clean string from an element. + """ + return self.strip(self.tostring(elem)) + + def strip(self, data): + """ + Strip a HTML string. 
+ """ + p = re.compile(r'<.*?>') + return p.sub(' ', data).strip() diff --git a/weboob/tools/parsers/lxmlparser.py b/weboob/tools/parsers/lxmlparser.py index 1430f798..9d166345 100644 --- a/weboob/tools/parsers/lxmlparser.py +++ b/weboob/tools/parsers/lxmlparser.py @@ -18,6 +18,7 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. +import re import lxml.html from .iparser import IParser @@ -44,6 +45,16 @@ class LxmlHtmlParser(IParser): def tostring(self, element): return lxml.html.tostring(element, encoding=unicode) + def tocleanstring(self, element): + txt = element.xpath('text()') # ['foo ', ' bar'] + txt = ' '.join(txt) # 'foo bar' + txt = re.sub('\s+', ' ', txt) # 'foo bar' + return txt.strip() + + def strip(self, s): + doc = lxml.html.fromstring(s) # parse html string + return self.tocleanstring(doc) + @classmethod def select(cls, element, selector, nb=None, method='cssselect'): """