diff --git a/weboob/backends/bp/pages/accounthistory.py b/weboob/backends/bp/pages/accounthistory.py
index fece0797..e44cdbdb 100644
--- a/weboob/backends/bp/pages/accounthistory.py
+++ b/weboob/backends/bp/pages/accounthistory.py
@@ -22,7 +22,6 @@ import re
from weboob.capabilities.bank import Operation
from weboob.tools.browser import BasePage
-from weboob.tools.misc import remove_html_tags
__all__ = ['AccountHistory']
@@ -40,7 +39,7 @@ class AccountHistory(BasePage):
operation = Operation(len(operations))
operation.date = mvt.xpath("./td/span")[0].text
tmp = mvt.xpath("./td/span")[1]
- operation.label = remove_html_tags(self.parser.tostring(tmp)).strip()
+ operation.label = self.parser.tocleanstring(tmp)
r = re.compile(r'\d+')
diff --git a/weboob/tools/misc.py b/weboob/tools/misc.py
index 18cb92ec..8dcc7e11 100644
--- a/weboob/tools/misc.py
+++ b/weboob/tools/misc.py
@@ -24,7 +24,6 @@ from dateutil import tz
from logging import warning
from time import time, sleep
from tempfile import gettempdir
-import re
import os
import sys
import traceback
@@ -62,10 +61,6 @@ def get_bytes_size(size, unit_name):
}
return float(size * unit_data.get(unit_name, 1))
-def remove_html_tags(data):
- p = re.compile(r'<.*?>')
- return p.sub(' ', data)
-
try:
import html2text as h2t
h2t.UNICODE_SNOB = 1
diff --git a/weboob/tools/parsers/iparser.py b/weboob/tools/parsers/iparser.py
index f7d6b106..52fed24d 100644
--- a/weboob/tools/parsers/iparser.py
+++ b/weboob/tools/parsers/iparser.py
@@ -18,6 +18,12 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+import re
+
+
+__all__ = ['IParser']
+
+
class IParser(object):
def parse(self, data, encoding=None):
"""
@@ -34,3 +40,16 @@ class IParser(object):
Get HTML string from an element.
"""
raise NotImplementedError()
+
+    def tocleanstring(self, elem):
+        """
+        Return the text of an element with its markup removed.
+
+        The element is serialized with tostring() and the resulting
+        HTML string is handed to strip().
+        """
+        html = self.tostring(elem)
+        return self.strip(html)
+
+    def strip(self, data):
+        """
+        Strip a HTML string.
+
+        Every tag is replaced by a single space, then leading and
+        trailing whitespace is removed from the result.
+        """
+        # DOTALL lets '.' cross line breaks, so a tag whose attributes are
+        # wrapped over several lines is removed as well.
+        p = re.compile(r'<.*?>', re.DOTALL)
+        return p.sub(' ', data).strip()
diff --git a/weboob/tools/parsers/lxmlparser.py b/weboob/tools/parsers/lxmlparser.py
index 1430f798..9d166345 100644
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@@ -18,6 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+import re
import lxml.html
from .iparser import IParser
@@ -44,6 +45,16 @@ class LxmlHtmlParser(IParser):
def tostring(self, element):
return lxml.html.tostring(element, encoding=unicode)
+    def tocleanstring(self, element):
+        """
+        Get the text of an element as a single clean line.
+
+        The text nodes of the element and of its descendants are joined
+        with spaces, runs of whitespace are collapsed into one space and
+        the result is trimmed.
+        """
+        # './/text()' also reaches text nested in child tags; a bare
+        # 'text()' only returns the element's direct text nodes and would
+        # drop e.g. the 'bar' of <span>foo <b>bar</b></span>.
+        txt = element.xpath('.//text()')  # ['foo ', 'bar']
+        txt = ' '.join(txt)               # 'foo  bar'
+        txt = re.sub(r'\s+', ' ', txt)    # 'foo bar'
+        return txt.strip()
+
+    def strip(self, s):
+        """
+        Strip a HTML string by parsing it and keeping only its text.
+
+        Returns an empty string when there is nothing to parse.
+        """
+        # lxml.html.fromstring() rejects an empty document, so answer
+        # directly for input that holds nothing but whitespace.
+        if not s or not s.strip():
+            return ''
+        doc = lxml.html.fromstring(s)  # parse html string
+        return self.tocleanstring(doc)
+
@classmethod
def select(cls, element, selector, nb=None, method='cssselect'):
"""