class Regexp(Filter):
    r"""
    Apply a regular expression to a text and return the extracted value.

    The text is searched with ``pattern`` (``re.search`` semantics, so the
    match may start anywhere in the text).

    * Without ``template``, the first capture group that took part in the
      match is returned; if the pattern has no such group, the whole match
      is returned.
    * With ``template``, the result of ``match.expand(template)`` is
      returned, so back-references such as ``\1`` can be used.

    :param selector: forwarded unchanged to the ``Filter`` constructor
    :param pattern: regular expression source, compiled once at construction
    :param template: optional expansion template for the match
    :param flags: flags given to ``re.compile``
    :param default: value returned when the pattern does not match; ``None``
                    means "no fallback", in which case ``KeyError`` is raised

    >>> f = Regexp(CleanText('//p'), r'Date: (\d+)/(\d+)/(\d+)', r'\3-\2-\1')
    >>> print(f.filter('Date: 13/08/1988'))
    1988-08-13
    >>> f = Regexp(CleanText('//p'), r'Date: (\d+)')
    >>> print(f.filter('Date: 13/08/1988'))
    13
    """
    def __init__(self, selector, pattern, template=None, flags=0, default=None):
        super(Regexp, self).__init__(selector)
        # The source pattern is kept only to build a readable error message.
        self.pattern = pattern
        self.regex = re.compile(pattern, flags)
        self.template = template
        self.default = default

    @staticmethod
    def _item_text(item):
        """Return the text of one sequence item (plain string or element)."""
        # Objects exposing itertext() (e.g. lxml elements) are flattened to
        # their text content; anything else is assumed to already be text.
        if hasattr(item, 'itertext'):
            return ''.join(item.itertext())
        return item

    def filter(self, txt):
        """
        Search ``txt`` and return the extracted value.

        ``txt`` may be a string, or a list/tuple of strings or elements, in
        which case the stripped text of every item is joined with spaces
        before searching.

        :raises KeyError: if the pattern does not match and no default is set
        """
        if isinstance(txt, (tuple, list)):
            # A list has no itertext(); the text has to be read item by item.
            txt = ' '.join(self._item_text(item).strip() for item in txt)

        mobj = self.regex.search(txt)
        if mobj is None:
            if self.default is not None:
                return self.default
            raise KeyError('Unable to match %s' % self.pattern)

        if self.template is not None:
            return mobj.expand(self.template)

        for group in mobj.groups():
            if group is not None:
                return group
        # No capture group took part in the match: fall back to the whole
        # match instead of leaking StopIteration to the caller.
        return mobj.group(0)