delete 'remove_html_tags' global function, and create IParser.tocleanstring and IParser.strip abstract methods.

This commit is contained in:
Romain Bignon 2011-10-25 13:28:43 +02:00
commit 59dfe3083a
4 changed files with 31 additions and 7 deletions

View file

@ -22,7 +22,6 @@ import re
from weboob.capabilities.bank import Operation from weboob.capabilities.bank import Operation
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.misc import remove_html_tags
__all__ = ['AccountHistory'] __all__ = ['AccountHistory']
@ -40,7 +39,7 @@ class AccountHistory(BasePage):
operation = Operation(len(operations)) operation = Operation(len(operations))
operation.date = mvt.xpath("./td/span")[0].text operation.date = mvt.xpath("./td/span")[0].text
tmp = mvt.xpath("./td/span")[1] tmp = mvt.xpath("./td/span")[1]
operation.label = remove_html_tags(self.parser.tostring(tmp)).strip() operation.label = self.parser.tocleanstring(tmp)
r = re.compile(r'\d+') r = re.compile(r'\d+')

View file

@ -24,7 +24,6 @@ from dateutil import tz
from logging import warning from logging import warning
from time import time, sleep from time import time, sleep
from tempfile import gettempdir from tempfile import gettempdir
import re
import os import os
import sys import sys
import traceback import traceback
@ -62,10 +61,6 @@ def get_bytes_size(size, unit_name):
} }
return float(size * unit_data.get(unit_name, 1)) return float(size * unit_data.get(unit_name, 1))
def remove_html_tags(data):
    """
    Replace every HTML tag found in a string by a single space.

    :param data: string which may contain HTML markup
    :return: the same string where each ``<...>`` sequence has been
             replaced by one space (surrounding whitespace is kept)
    """
    # DOTALL lets '.' match newlines too, so a tag whose attributes are
    # wrapped over several lines is removed like any other tag.
    tag = re.compile(r'<.*?>', re.DOTALL)
    return tag.sub(' ', data)
try: try:
import html2text as h2t import html2text as h2t
h2t.UNICODE_SNOB = 1 h2t.UNICODE_SNOB = 1

View file

@ -18,6 +18,12 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
__all__ = ['IParser']
class IParser(object): class IParser(object):
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
""" """
@ -34,3 +40,16 @@ class IParser(object):
Get HTML string from an element. Get HTML string from an element.
""" """
raise NotImplementedError() raise NotImplementedError()
def tocleanstring(self, elem):
    """
    Get a clean string from an element.

    The element is serialized with tostring(), and the resulting
    markup is then handed to strip().
    """
    markup = self.tostring(elem)
    return self.strip(markup)
def strip(self, data):
    """
    Strip a HTML string.

    Each tag is replaced by a single space, then leading and trailing
    whitespace is removed from the result.

    :param data: string which may contain HTML markup
    :return: the text without tags
    """
    # DOTALL lets '.' match newlines too, so a tag whose attributes are
    # wrapped over several lines is removed like any other tag.
    tag = re.compile(r'<.*?>', re.DOTALL)
    return tag.sub(' ', data).strip()

View file

@ -18,6 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import lxml.html import lxml.html
from .iparser import IParser from .iparser import IParser
@ -44,6 +45,16 @@ class LxmlHtmlParser(IParser):
def tostring(self, element): def tostring(self, element):
return lxml.html.tostring(element, encoding=unicode) return lxml.html.tostring(element, encoding=unicode)
def tocleanstring(self, element):
    """
    Get the text of an element as one whitespace-normalized string.

    Only the text nodes which are direct children of the element are
    used (XPath ``text()``); they are joined with spaces, and every
    run of whitespace is collapsed into a single space.
    """
    chunks = element.xpath('text()')  # e.g. ['foo ', ' bar']
    joined = ' '.join(chunks)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', joined).strip()
def strip(self, s):
    """
    Strip a HTML string.

    The markup is parsed with lxml.html.fromstring(), and the element
    it returns is passed to tocleanstring().

    :param s: string which may contain HTML markup
    :return: the clean text, or an empty string for blank input
    """
    # lxml refuses to parse a blank document ("Document is empty");
    # there is nothing to strip in that case, so answer directly.
    if not s.strip():
        return ''
    doc = lxml.html.fromstring(s)
    return self.tocleanstring(doc)
@classmethod @classmethod
def select(cls, element, selector, nb=None, method='cssselect'): def select(cls, element, selector, nb=None, method='cssselect'):
""" """