From 06d1907d3d4530690b9c53f66887a0c04ae8488c Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Wed, 19 Mar 2014 20:27:22 +0100 Subject: [PATCH] add filter Duration --- modules/youjizz/pages/index.py | 20 +------------- modules/youjizz/pages/video.py | 18 ++----------- weboob/tools/browser2/filters.py | 45 ++++++++++++++++++++------------ 3 files changed, 31 insertions(+), 52 deletions(-) diff --git a/modules/youjizz/pages/index.py b/modules/youjizz/pages/index.py index bb7c7cba..fa2adb3e 100644 --- a/modules/youjizz/pages/index.py +++ b/modules/youjizz/pages/index.py @@ -18,12 +18,11 @@ # along with weboob. If not, see . -import datetime import re from weboob.tools.browser2 import HTMLPage from weboob.tools.browser2.page import ListElement, method, ItemElement -from weboob.tools.browser2.filters import Filter, Link, CleanText +from weboob.tools.browser2.filters import Filter, Link, CleanText, Duration from weboob.capabilities.image import BaseImage from weboob.capabilities.video import BaseVideo @@ -45,23 +44,6 @@ class IndexPage(HTMLPage): def filter(self, link): return re.sub(r'/videos/(.+)\.html', r'\1', link) - class Duration(Filter): - def filter(self, txt): - time_txt = txt.replace(';', ':') - hours, minutes, seconds = 0, 0, 0 - if ':' in time_txt: - t = time_txt.split(':') - t.reverse() - seconds = int(t[0]) - minutes = int(t[1]) - if len(t) == 3: - hours = int(t[2]) - elif time_txt != 'N/A': - raise ValueError('Unable to parse the video duration: %s' % time_txt) - - return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) - - obj_id = Id(Link('.//a')) obj_title = CleanText('.//span[@id="title1"]') obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span')) diff --git a/modules/youjizz/pages/video.py b/modules/youjizz/pages/video.py index 2e905a19..c74febcc 100644 --- a/modules/youjizz/pages/video.py +++ b/modules/youjizz/pages/video.py @@ -18,13 +18,11 @@ # along with weboob. If not, see . -import datetime import re from weboob.tools.browser2 import HTMLPage from weboob.tools.browser2.page import method, ItemElement -from weboob.tools.browser2.filters import CleanText, Env -from weboob.capabilities.base import NotAvailable +from weboob.tools.browser2.filters import CleanText, Env, Duration from weboob.capabilities.video import BaseVideo from weboob.tools.misc import to_unicode @@ -41,19 +39,7 @@ class VideoPage(HTMLPage): obj_title = CleanText('//title') obj_nsfw = True obj_ext = u'flv' - - def obj_duration(self): - # youjizz HTML is crap, we must parse it with regexps - m = re.search(r'.*?Runtime.*? (.+?)', self.page.response.text) - if m: - txt = m.group(1).strip() - if txt == 'Unknown': - return NotAvailable - else: - minutes, seconds = (int(v) for v in to_unicode(txt).split(':')) - return datetime.timedelta(minutes=minutes, seconds=seconds) - else: - raise ValueError('Unable to retrieve video duration') + obj_duration = Duration(CleanText('//div[@id="video_text"]')) def obj_url(self): real_id = int(self.env['id'].split('-')[-1]) diff --git a/weboob/tools/browser2/filters.py b/weboob/tools/browser2/filters.py index 68e9c576..474b1445 100644 --- a/weboob/tools/browser2/filters.py +++ b/weboob/tools/browser2/filters.py @@ -23,7 +23,7 @@ from dateutil.parser import parse as parse_date import datetime from decimal import Decimal import re -from weboob.capabilities.base import NotAvailable +from weboob.capabilities.base import empty _NO_DEFAULT = object() @@ -68,7 +68,7 @@ class Filter(_Filter): """ This method have to be overrided by children classes. """ - return value + raise NotImplementedError() class Env(_Filter): @@ -139,15 +139,15 @@ class CleanText(Filter): return self.remove(txt, self.symbols) @classmethod - def clean(self, txt): + def clean(cls, txt): if not isinstance(txt, basestring): txt = [t.strip() for t in txt.itertext()] txt = u' '.join(txt) # 'foo bar' - txt = re.sub(u'[\s\xa0\t]+', u' ', txt) # 'foo bar' + txt = re.sub(u'[\\s\xa0\t]+', u' ', txt) # 'foo bar' return txt.strip() @classmethod - def remove(self, txt, symbols): + def remove(cls, txt, symbols): for symbol in symbols: txt = txt.replace(symbol, '') return txt @@ -159,7 +159,7 @@ class CleanDecimal(CleanText): def filter(self, text): text = super(CleanDecimal, self).filter(text) text = text.replace('.','').replace(',','.') - return Decimal(re.sub(u'[^\d\-\.]', '', text)) + return Decimal(re.sub(ur'[^\d\-\.]', '', text)) class Link(Filter): """ @@ -184,7 +184,7 @@ class Field(_Filter): class Regexp(Filter): - """ + r""" Apply a regex. >>> from lxml.html import etree @@ -216,14 +216,14 @@ class Regexp(Filter): return mobj.expand(self.template) class Map(Filter): - def __init__(self, selector, map, default=_NO_DEFAULT): + def __init__(self, selector, map_dict, default=_NO_DEFAULT): super(Map, self).__init__(selector) - self.map = map + self.map_dict = map_dict self.default = default def filter(self, txt): try: - return self.map[txt] + return self.map_dict[txt] except KeyError: if self.default is not _NO_DEFAULT: return self.default @@ -232,18 +232,29 @@ class Map(Filter): class Date(Filter): def filter(self, txt): - if txt is NotAvailable: - return NotAvailable + if empty(txt): + return txt return parse_date(txt) class Time(Filter): + klass = datetime.time + regexp = re.compile(ur'(?P\d+):?(?P\d+)(:(?P\d+))?') + kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'} + def filter(self, txt): - m = re.search('((?P\d+):)?(?P\d+):(?P\d+)', txt) + m = self.regexp.search(txt) if m: - hh = int(m.groupdict()['hh'] or 0) - mm = int(m.groupdict()['mm'] or 0) - ss = int(m.groupdict()['ss'] or 0) - return datetime.time(hh, mm, ss) + kwargs = {} + for key, index in self.kwargs.iteritems(): + kwargs[key] = int(m.groupdict()[index] or 0) + return self.klass(**kwargs) + + +class Duration(Time): + klass = datetime.timedelta + regexp = re.compile(ur'((?P\d+)[:;])?(?P\d+)[;:](?P\d+)') + kwargs = {'hours': 'hh', 'minutes': 'mm', 'seconds': 'ss'} + class Attr(_Filter): def __init__(self, xpath, attr):