Delete the 'remove_html_tags' global function and create the IParser.tocleanstring and IParser.strip methods (with default implementations that parsers may override).
This commit is contained in:
parent
5a96b425da
commit
59dfe3083a
4 changed files with 31 additions and 7 deletions
|
|
@ -22,7 +22,6 @@ import re
|
||||||
|
|
||||||
from weboob.capabilities.bank import Operation
|
from weboob.capabilities.bank import Operation
|
||||||
from weboob.tools.browser import BasePage
|
from weboob.tools.browser import BasePage
|
||||||
from weboob.tools.misc import remove_html_tags
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['AccountHistory']
|
__all__ = ['AccountHistory']
|
||||||
|
|
@ -40,7 +39,7 @@ class AccountHistory(BasePage):
|
||||||
operation = Operation(len(operations))
|
operation = Operation(len(operations))
|
||||||
operation.date = mvt.xpath("./td/span")[0].text
|
operation.date = mvt.xpath("./td/span")[0].text
|
||||||
tmp = mvt.xpath("./td/span")[1]
|
tmp = mvt.xpath("./td/span")[1]
|
||||||
operation.label = remove_html_tags(self.parser.tostring(tmp)).strip()
|
operation.label = self.parser.tocleanstring(tmp)
|
||||||
|
|
||||||
r = re.compile(r'\d+')
|
r = re.compile(r'\d+')
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,6 @@ from dateutil import tz
|
||||||
from logging import warning
|
from logging import warning
|
||||||
from time import time, sleep
|
from time import time, sleep
|
||||||
from tempfile import gettempdir
|
from tempfile import gettempdir
|
||||||
import re
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
|
|
@ -62,10 +61,6 @@ def get_bytes_size(size, unit_name):
|
||||||
}
|
}
|
||||||
return float(size * unit_data.get(unit_name, 1))
|
return float(size * unit_data.get(unit_name, 1))
|
||||||
|
|
||||||
def remove_html_tags(data):
    """
    Replace every HTML tag found in a string with a single space.

    The result is neither stripped nor whitespace-normalized.

    :param data: HTML markup
    :returns: the markup with each ``<...>`` span replaced by ``' '``
    """
    # DOTALL lets '.' match newlines, so a tag whose attributes are wrapped
    # over several lines is removed as one tag instead of being left as is.
    tag = re.compile(r'<.*?>', re.DOTALL)
    return tag.sub(' ', data)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import html2text as h2t
|
import html2text as h2t
|
||||||
h2t.UNICODE_SNOB = 1
|
h2t.UNICODE_SNOB = 1
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,12 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['IParser']
|
||||||
|
|
||||||
|
|
||||||
class IParser(object):
|
class IParser(object):
|
||||||
def parse(self, data, encoding=None):
|
def parse(self, data, encoding=None):
|
||||||
"""
|
"""
|
||||||
|
|
@ -34,3 +40,16 @@ class IParser(object):
|
||||||
Get HTML string from an element.
|
Get HTML string from an element.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def tocleanstring(self, elem):
    """
    Get a clean string from an element.

    The element is first serialized with tostring(), then the resulting
    markup is handed to strip().
    """
    markup = self.tostring(elem)
    return self.strip(markup)
|
||||||
|
|
||||||
|
def strip(self, data):
    """
    Strip a HTML string.

    Every tag is replaced by a space, then leading and trailing
    whitespace is removed from the result.

    :param data: HTML markup
    :returns: the text of the markup, without tags
    """
    # DOTALL lets '.' match newlines, so a tag whose attributes are wrapped
    # over several lines is removed as one tag instead of being left as is.
    tag = re.compile(r'<.*?>', re.DOTALL)
    return tag.sub(' ', data).strip()
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,7 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
import lxml.html
|
import lxml.html
|
||||||
|
|
||||||
from .iparser import IParser
|
from .iparser import IParser
|
||||||
|
|
@ -44,6 +45,16 @@ class LxmlHtmlParser(IParser):
|
||||||
def tostring(self, element):
|
def tostring(self, element):
|
||||||
return lxml.html.tostring(element, encoding=unicode)
|
return lxml.html.tostring(element, encoding=unicode)
|
||||||
|
|
||||||
|
def tocleanstring(self, element):
    """
    Get the text of an element, without any markup.

    The text of the element and of all its descendants is joined with
    spaces, every run of whitespace is collapsed into a single space and
    the result is stripped.

    :param element: element providing itertext(), e.g. a lxml element
    :returns: the cleaned text
    """
    # itertext() walks the whole subtree, so text nested in child tags
    # (e.g. 'foo <b>bar</b>') is kept, as with the tag-stripping default
    # implementation in IParser.
    txt = ' '.join(element.itertext())  # 'foo   bar'
    txt = re.sub(r'\s+', ' ', txt)      # 'foo bar'
    return txt.strip()
|
||||||
|
|
||||||
|
def strip(self, s):
    """
    Strip a HTML string.

    The markup is parsed with lxml and its text is extracted with
    tocleanstring().

    :param s: HTML markup
    :returns: the cleaned text of the markup
    """
    # lxml refuses to parse an empty document, so an empty or
    # whitespace-only string is answered directly with an empty string.
    stripped = s.strip()
    if not stripped:
        return stripped
    doc = lxml.html.fromstring(s)  # parse html string
    return self.tocleanstring(doc)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def select(cls, element, selector, nb=None, method='cssselect'):
|
def select(cls, element, selector, nb=None, method='cssselect'):
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue