Delete the global 'remove_html_tags' function, and add the IParser.tocleanstring and IParser.strip methods (with default implementations that parser subclasses can override).

2011-10-25 13:28:43 +02:00 · 2011-10-25 13:28:43 +02:00 · 59dfe3083a
commit 59dfe3083a
parent 5a96b425da
4 changed files with 31 additions and 7 deletions
--- a/weboob/backends/bp/pages/accounthistory.py
+++ b/weboob/backends/bp/pages/accounthistory.py
@@ -22,7 +22,6 @@ import re

 from weboob.capabilities.bank import Operation
 from weboob.tools.browser import BasePage
-from weboob.tools.misc import remove_html_tags


 __all__ = ['AccountHistory']
@@ -40,7 +39,7 @@ class AccountHistory(BasePage):
            operation = Operation(len(operations))
            operation.date = mvt.xpath("./td/span")[0].text
            tmp = mvt.xpath("./td/span")[1]
-            operation.label = remove_html_tags(self.parser.tostring(tmp)).strip()
+            operation.label = self.parser.tocleanstring(tmp)

            r = re.compile(r'\d+')

--- a/weboob/tools/misc.py
+++ b/weboob/tools/misc.py
@@ -24,7 +24,6 @@ from dateutil import tz
 from logging import warning
 from time import time, sleep
 from tempfile import gettempdir
-import re
 import os
 import sys
 import traceback
@@ -62,10 +61,6 @@ def get_bytes_size(size, unit_name):
        }
    return float(size * unit_data.get(unit_name, 1))

-def remove_html_tags(data):
-    p = re.compile(r'<.*?>')
-    return p.sub(' ', data)
-
 try:
    import html2text as h2t
    h2t.UNICODE_SNOB = 1
--- a/weboob/tools/parsers/iparser.py
+++ b/weboob/tools/parsers/iparser.py
@@ -18,6 +18,12 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


+import re
+
+
+__all__ = ['IParser']
+
+
 class IParser(object):
    def parse(self, data, encoding=None):
        """
@@ -34,3 +40,16 @@ class IParser(object):
        Get HTML string from an element.
        """
        raise NotImplementedError()
+
+    def tocleanstring(self, elem):
+        """
+        Get a clean string from an element.
+        """
+        return self.strip(self.tostring(elem))
+
+    def strip(self, data):
+        """
+        Strip a HTML string.
+        """
+        p = re.compile(r'<.*?>')
+        return p.sub(' ', data).strip()
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@@ -18,6 +18,7 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


+import re
 import lxml.html

 from .iparser import IParser
@@ -44,6 +45,16 @@ class LxmlHtmlParser(IParser):
    def tostring(self, element):
        return lxml.html.tostring(element, encoding=unicode)

+    def tocleanstring(self, element):
+        txt = element.xpath('text()')   # ['foo ', ' bar']
+        txt = ' '.join(txt)             # 'foo   bar'
+        txt = re.sub('\s+', ' ', txt)   # 'foo bar'
+        return txt.strip()
+
+    def strip(self, s):
+        doc = lxml.html.fromstring(s)   # parse html string
+        return self.tocleanstring(doc)
+
    @classmethod
    def select(cls, element, selector, nb=None, method='cssselect'):
        """