get all strings under this element

This commit is contained in:
Romain Bignon 2013-01-07 16:34:03 +01:00
commit 501c3c8e11

View file

@ -46,8 +46,8 @@ class LxmlHtmlParser(IParser):
return lxml.html.tostring(element, encoding=unicode)
def tocleanstring(self, element):
    """
    Return all the text found under *element* as one clean string.

    Every string yielded by ``element.itertext()`` is stripped, the
    pieces are joined with single spaces, and any remaining run of
    whitespace is collapsed to one space.

    :param element: object exposing an ``itertext()`` method that yields
                    strings (presumably an lxml element -- confirm
                    against callers)
    :returns: the normalized text, without leading or trailing whitespace;
              an empty string when no text is yielded
    """
    # Only the itertext() result is used; the former direct-children
    # xpath('text()') lookup was overwritten immediately and is dropped.
    pieces = [piece.strip() for piece in element.itertext()]  # ['foo', 'bar']
    joined = u' '.join(pieces)                                # 'foo bar'
    # Stripped-to-empty pieces leave doubled spaces behind; collapse them.
    collapsed = re.sub(r'\s+', ' ', joined)
    return collapsed.strip()