From 8efd37e71de81150c4b61903519f23a4bfcd5042 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Sat, 5 Jul 2014 19:26:29 +0200 Subject: [PATCH] overload & and | operators to chain filters (refs #1426) --- modules/pastebin/browser.py | 10 ++-- modules/youjizz/pages/index.py | 8 +-- modules/youjizz/pages/video.py | 2 +- weboob/tools/browser2/filters.py | 90 +++++++++++++++++++------------- weboob/tools/browser2/page.py | 3 ++ 5 files changed, 66 insertions(+), 47 deletions(-) diff --git a/modules/pastebin/browser.py b/modules/pastebin/browser.py index 40dd7d86..835c4747 100644 --- a/modules/pastebin/browser.py +++ b/modules/pastebin/browser.py @@ -72,12 +72,12 @@ class PastePage(BasePastebinPage): self.env['header'] = el.find('//div[@id="content_left"]//div[@class="paste_box_info"]') obj_id = Env('id') - obj_title = Base(Env('header'), CleanText('.//div[@class="paste_box_line1"]//h1')) + obj_title = Base(Env('header')) & CleanText('.//div[@class="paste_box_line1"]//h1') obj_contents = RawText('//textarea[@id="paste_code"]') - obj_public = Base( - Env('header'), - CleanVisibility(Attr('.//div[@class="paste_box_line1"]//img', 'title'))) - obj__date = Base(Env('header'), DateTime(Attr('.//div[@class="paste_box_line2"]/span[1]', 'title'))) + obj_public = Base(Env('header')) \ + & Attr('.//div[@class="paste_box_line1"]//img', 'title') \ + & CleanVisibility() + obj__date = Base(Env('header')) & Attr('.//div[@class="paste_box_line2"]/span[1]', 'title') & DateTime() class PostPage(BasePastebinPage): diff --git a/modules/youjizz/pages/index.py b/modules/youjizz/pages/index.py index 46c76f79..565eabcd 100644 --- a/modules/youjizz/pages/index.py +++ b/modules/youjizz/pages/index.py @@ -20,7 +20,7 @@ from weboob.tools.browser2 import HTMLPage from weboob.tools.browser2.page import ListElement, method, ItemElement, pagination -from weboob.tools.browser2.filters import Link, CleanText, Duration, Regexp +from weboob.tools.browser2.filters import Link, CleanText, Duration, Regexp, CSS from weboob.capabilities.base import NotAvailable from weboob.capabilities.image import BaseImage from weboob.capabilities.video import BaseVideo @@ -40,9 +40,9 @@ class IndexPage(HTMLPage): class item(ItemElement): klass = BaseVideo - obj_id = Regexp(Link('.//a'), r'/videos/(.+)\.html') - obj_title = CleanText('.//span[@id="title1"]') - obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'), default=NotAvailable) + obj_id = CSS('a') & Link() & Regexp(pattern=r'/videos/(.+)\.html') + obj_title = CSS('span#title1') & CleanText() + obj_duration = CSS('span.thumbtime span') & CleanText() & Duration() | NotAvailable obj_nsfw = True def obj_thumbnail(self): diff --git a/modules/youjizz/pages/video.py b/modules/youjizz/pages/video.py index c74febcc..6f9de6bc 100644 --- a/modules/youjizz/pages/video.py +++ b/modules/youjizz/pages/video.py @@ -39,7 +39,7 @@ class VideoPage(HTMLPage): obj_title = CleanText('//title') obj_nsfw = True obj_ext = u'flv' - obj_duration = Duration(CleanText('//div[@id="video_text"]')) + obj_duration = CleanText('//div[@id="video_text"]') & Duration() def obj_url(self): real_id = int(self.env['id'].split('-')[-1]) diff --git a/weboob/tools/browser2/filters.py b/weboob/tools/browser2/filters.py index 17c714aa..849b2a36 100644 --- a/weboob/tools/browser2/filters.py +++ b/weboob/tools/browser2/filters.py @@ -66,6 +66,14 @@ class _Filter(object): self._creation_counter = _Filter._creation_counter _Filter._creation_counter += 1 + def __or__(self, o): + self.default = o + return self + + def __and__(self, o): + o.selector = self + return o + def default_or_raise(self, exception): if self.default is not _NO_DEFAULT: return self.default @@ -110,6 +118,41 @@ class Filter(_Filter): raise NotImplementedError() +class _Selector(Filter): + def filter(self, txt): + if txt is not None: + return txt + else: + return self.default_or_raise(ParseError('Element %r not found' % self.selector)) + + +class Dict(_Selector): + @classmethod + def select(cls, selector, item): + if isinstance(item, dict): + content = item + else: + content = item.el + + for el in selector.split('/'): + if el not in content: + return None + + content = content.get(el) + + return content + + +class CSS(_Selector): + @classmethod + def select(cls, selector, item): + return item.cssselect(selector) + + +class XPath(_Selector): + pass + + class Base(Filter): """ Change the base element used in filters. @@ -119,7 +162,7 @@ class Base(Filter): base = self.select(self.base, item) return self.selector(base) - def __init__(self, base, selector, default=_NO_DEFAULT): + def __init__(self, base, selector=None, default=_NO_DEFAULT): super(Base, self).__init__(selector, default) self.base = base @@ -173,34 +216,6 @@ class TableCell(_Filter): return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names))) -class Dict(Filter): - @classmethod - def select(cls, selector, item): - if isinstance(selector, basestring): - if isinstance(item, dict): - content = item - else: - content = item.el - - for el in selector.split('/'): - if el not in content: - return None - - content = content.get(el) - - return content - elif callable(selector): - return selector(item) - else: - return selector - - def filter(self, txt): - if txt is not None: - return txt - else: - return self.default_or_raise(ParseError()) - - class CleanHTML(Filter): def filter(self, txt): if isinstance(txt, (tuple, list)): @@ -234,7 +249,7 @@ class CleanText(Filter): Second, it replaces all symbols given in second argument. """ - def __init__(self, selector, symbols='', replace=[], childs=True, **kwargs): + def __init__(self, selector=None, symbols='', replace=[], childs=True, **kwargs): super(CleanText, self).__init__(selector, **kwargs) self.symbols = symbols self.toreplace = replace @@ -283,7 +298,7 @@ class CleanDecimal(CleanText): Get a cleaned Decimal value from an element. """ - def __init__(self, selector, replace_dots=True, default=_NO_DEFAULT): + def __init__(self, selector=None, replace_dots=True, default=_NO_DEFAULT): super(CleanDecimal, self).__init__(selector, default=default) self.replace_dots = replace_dots @@ -318,7 +333,7 @@ class Link(Attr): If the tag is not found, an exception IndexError is raised. """ - def __init__(self, selector, default=_NO_DEFAULT): + def __init__(self, selector=None, default=_NO_DEFAULT): super(Link, self).__init__(selector, 'href', default=default) @@ -345,8 +360,9 @@ class Regexp(Filter): u'1988-08-13' """ - def __init__(self, selector, pattern, template=None, flags=0, default=_NO_DEFAULT): + def __init__(self, selector=None, pattern=None, template=None, flags=0, default=_NO_DEFAULT): super(Regexp, self).__init__(selector, default=default) + assert pattern is not None self.pattern = pattern self.regex = re.compile(pattern, flags) self.template = template @@ -379,7 +395,7 @@ class Map(Filter): class DateTime(Filter): - def __init__(self, selector, default=_NO_DEFAULT, dayfirst=False, translations=None): + def __init__(self, selector=None, default=_NO_DEFAULT, dayfirst=False, translations=None): super(DateTime, self).__init__(selector, default=default) self.dayfirst = dayfirst self.translations = translations @@ -397,7 +413,7 @@ class DateTime(Filter): class Date(DateTime): - def __init__(self, selector, default=_NO_DEFAULT, dayfirst=False, translations=None): + def __init__(self, selector=None, default=_NO_DEFAULT, dayfirst=False, translations=None): super(Date, self).__init__(selector, default=default, dayfirst=dayfirst, translations=translations) def filter(self, txt): @@ -435,7 +451,7 @@ class Time(Filter): regexp = re.compile(r'(?P\d+):?(?P\d+)(:(?P\d+))?') kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'} - def __init__(self, selector, default=_NO_DEFAULT): + def __init__(self, selector=None, default=_NO_DEFAULT): super(Time, self).__init__(selector, default=default) def filter(self, txt): @@ -486,7 +502,7 @@ class Format(MultiFilter): class Join(Filter): - def __init__(self, pattern, selector, textCleaner=CleanText): + def __init__(self, pattern, selector=None, textCleaner=CleanText): super(Join, self).__init__(selector) self.pattern = pattern self.textCleaner = textCleaner diff --git a/weboob/tools/browser2/page.py b/weboob/tools/browser2/page.py index 1d3ffa31..703e2069 100644 --- a/weboob/tools/browser2/page.py +++ b/weboob/tools/browser2/page.py @@ -642,6 +642,9 @@ class AbstractElement(object): def parse(self, obj): pass + def cssselect(self, *args, **kwargs): + return self.el.cssselect(*args, **kwargs) + def xpath(self, *args, **kwargs): return self.el.xpath(*args, **kwargs)