Delete the global 'remove_html_tags' function and add the IParser.tocleanstring and IParser.strip abstract methods.
This commit is contained in:
parent
5a96b425da
commit
59dfe3083a
4 changed files with 31 additions and 7 deletions
|
|
@ -18,6 +18,7 @@
|
|||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import re
|
||||
import lxml.html
|
||||
|
||||
from .iparser import IParser
|
||||
|
|
@ -44,6 +45,16 @@ class LxmlHtmlParser(IParser):
|
|||
def tostring(self, element):
    """
    Serialize an lxml element to markup.

    The element is handed to ``lxml.html.tostring`` with
    ``encoding=unicode``, and whatever that call produces is returned
    unchanged.
    """
    serialized = lxml.html.tostring(element, encoding=unicode)
    return serialized
|
||||
|
||||
def tocleanstring(self, element):
    """
    Return the text of *element* as a single whitespace-normalized string.

    The text fragments are collected with the XPath query ``text()``,
    joined with single spaces, every run of whitespace is collapsed to
    one space, and leading/trailing whitespace is removed.

    :param element: object exposing an ``xpath(query)`` method that
                    returns a list of strings
    :returns: the cleaned text ('' when there are no text fragments)
    """
    fragments = element.xpath('text()')     # ['foo ', ' bar']
    txt = ' '.join(fragments)               # 'foo   bar'
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    txt = re.sub(r'\s+', ' ', txt)          # 'foo bar'
    return txt.strip()
|
||||
|
||||
def strip(self, s):
    """
    Strip the markup from an HTML string and return its cleaned text.

    The string is parsed with ``lxml.html.fromstring`` and the resulting
    element is passed to ``self.tocleanstring``.
    """
    return self.tocleanstring(lxml.html.fromstring(s))
|
||||
|
||||
@classmethod
|
||||
def select(cls, element, selector, nb=None, method='cssselect'):
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue