Add childs option to CleanText
This commit is contained in:
parent
45351d57c9
commit
d85675918a
1 changed file with 10 additions and 6 deletions
|
|
@ -170,23 +170,27 @@ class CleanText(Filter):
|
||||||
string.
|
string.
|
||||||
Second, it replaces all symbols given in second argument.
|
Second, it replaces all symbols given in second argument.
|
||||||
"""
|
"""
|
||||||
def __init__(self, selector, symbols='', replace=None, childs=True, **kwargs):
    """
    Configure the text-cleaning filter.

    :param selector: forwarded unchanged to the parent constructor
    :param symbols: iterable of substrings that ``filter()`` passes to
                    ``remove()``
    :param replace: replacement rules that ``filter()`` passes to
                    ``replace()``; when omitted, a fresh empty list is used
    :param childs: passed to ``clean()`` by ``filter()``: if true, the text
                   of the whole element subtree is collected; otherwise
                   only the element's own leading text
    :param kwargs: forwarded unchanged to the parent constructor
    """
    super(CleanText, self).__init__(selector, **kwargs)
    self.symbols = symbols
    # A literal ``[]`` default is created once at definition time and would
    # be shared by every instance built without ``replace``; build a new
    # list per instance instead.
    self.toreplace = [] if replace is None else replace
    self.childs = childs
|
||||||
|
|
||||||
def filter(self, txt):
    """
    Run the full cleaning pipeline on ``txt``.

    A tuple or list is first cleaned item by item and joined with single
    spaces. The resulting value is then cleaned, stripped of the configured
    symbols, and finally passed through the configured replacements.
    """
    with_children = self.childs
    if isinstance(txt, (tuple, list)):
        pieces = [self.clean(item, childs=with_children) for item in txt]
        txt = ' '.join(pieces)

    cleaned = self.clean(txt, childs=with_children)
    without_symbols = self.remove(cleaned, self.symbols)
    return self.replace(without_symbols, self.toreplace)
|
||||||
|
|
||||||
@classmethod
def clean(cls, txt, childs=True):
    """
    Normalize the whitespace of a string or of an element's text.

    :param txt: a string, or an element-like object exposing ``itertext()``
                and ``text``
    :param childs: only used for elements: if true, every text fragment
                   yielded by ``itertext()`` is collected; otherwise only
                   the element's own ``text``
    :returns: the text with each run of whitespace (non-breaking spaces
              included) collapsed to one space and both ends stripped
    """
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3 has no ``basestring``

    if not isinstance(txt, string_types):
        if childs:
            fragments = [t.strip() for t in txt.itertext()]
        else:
            # ``text`` is None when the element has no leading text (for
            # example when it starts with a child element); treat that as
            # an empty string instead of failing on ``None.strip()``.
            fragments = [(txt.text or u'').strip()]
        txt = u' '.join(fragments)

    # Squeeze every whitespace run, including non-breaking spaces and tabs,
    # into a single regular space.
    txt = re.sub(u'[\\s\xa0\t]+', u' ', txt)
    return txt.strip()
|
||||||
|
|
@ -195,7 +199,7 @@ class CleanText(Filter):
|
||||||
def remove(cls, txt, symbols):
    """
    Delete every occurrence of each entry of ``symbols`` from ``txt``.

    ``symbols`` is iterated, so a plain string removes each of its
    characters while a list removes each of its items. The result is
    stripped of leading and trailing whitespace before being returned.
    """
    remaining = txt
    for unwanted in symbols:
        remaining = remaining.replace(unwanted, '')
    return remaining.strip()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def replace(cls, txt, replace):
|
def replace(cls, txt, replace):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue