get all strings under this element
This commit is contained in:
parent
3ac1824307
commit
501c3c8e11
1 changed file with 2 additions and 2 deletions
|
|
@@ -46,8 +46,8 @@ class LxmlHtmlParser(IParser):
|
||||||
return lxml.html.tostring(element, encoding=unicode)
|
return lxml.html.tostring(element, encoding=unicode)
|
||||||
|
|
||||||
def tocleanstring(self, element):
|
def tocleanstring(self, element):
|
||||||
txt = element.xpath('text()') # ['foo ', ' bar']
|
txt = [txt.strip() for txt in element.itertext()]
|
||||||
txt = ' '.join(txt) # 'foo bar'
|
txt = u' '.join(txt) # 'foo bar'
|
||||||
txt = re.sub('\s+', ' ', txt) # 'foo bar'
|
txt = re.sub('\s+', ' ', txt) # 'foo bar'
|
||||||
return txt.strip()
|
return txt.strip()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue