add Regexp filter
This commit is contained in:
parent
434931a3ca
commit
a4167194f0
1 changed files with 32 additions and 0 deletions
|
|
@@ -164,3 +164,35 @@ class Attr(_Filter):
|
||||||
|
|
||||||
def __call__(self, item):
|
def __call__(self, item):
|
||||||
return item.use_selector(getattr(item, 'obj_%s' % self.name))
|
return item.use_selector(getattr(item, 'obj_%s' % self.name))
|
||||||
|
|
||||||
|
|
||||||
|
class Regexp(Filter):
    r"""
    Apply a regex to the selected text and return the matched part.

    Without ``template``, the first capturing group that took part in the
    match is returned; if the pattern has no such group, the whole match is
    returned. With ``template``, the match is expanded through it, so
    backreferences such as ``\1`` or ``\g<name>`` can be used.

    :param selector: selector forwarded unchanged to the parent filter
    :param pattern: regular expression searched for in the text
    :param template: optional expansion template applied to the match
    :param flags: flags given to :func:`re.compile`
    :param default: value returned when the pattern is not found; ``None``
                    (the default) means "raise instead", so ``None`` itself
                    cannot be used as a fallback value

    >>> from lxml.html import etree
    >>> f = Regexp(CleanText('//p'), r'Date: (\d+)/(\d+)/(\d+)', r'\3-\2-\1')
    >>> f(etree.fromstring('<html><body><p>Date: <span>13/08/1988</span></p></body></html>'))

    NOTE(review): the call above presumably yields ``1988-08-13``, assuming
    ``CleanText('//p')`` produces ``Date: 13/08/1988`` -- confirm.
    """

    def __init__(self, selector, pattern, template=None, flags=0, default=None):
        super(Regexp, self).__init__(selector)
        # The raw pattern is kept only to build a readable error message.
        self.pattern = pattern
        self.regex = re.compile(pattern, flags)
        self.template = template
        self.default = default

    @staticmethod
    def _flatten(item):
        """Return the stripped text of one item of a selected sequence."""
        # Objects exposing itertext() contribute all of their text
        # fragments; anything else is treated as a plain string.
        if hasattr(item, 'itertext'):
            return ' '.join(part.strip() for part in item.itertext())
        return item.strip()

    def filter(self, txt):
        """
        Search ``txt`` for the pattern and build the result.

        :param txt: a string, or a tuple/list of strings or of objects
                    exposing ``itertext()``, which are joined with spaces
        :returns: ``default`` when nothing matches and a default was given,
                  the expanded template when one was given, otherwise the
                  first matched group (or the whole match if there is none)
        :raises KeyError: if the pattern is not found and no default is set
        """
        if isinstance(txt, (tuple, list)):
            # A sequence has no itertext() of its own: flatten each item.
            txt = ' '.join(self._flatten(item) for item in txt)

        mobj = self.regex.search(txt)
        if not mobj:
            if self.default is not None:
                return self.default
            raise KeyError('Unable to match %s' % self.pattern)

        if self.template is not None:
            return mobj.expand(self.template)

        # Fall back to the whole match so that a pattern without any
        # participating group does not leak a StopIteration to the caller.
        matched_groups = (g for g in mobj.groups() if g is not None)
        return next(matched_groups, mobj.group(0))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue