diff --git a/modules/youjizz/pages/index.py b/modules/youjizz/pages/index.py
index bb7c7cba..fa2adb3e 100644
--- a/modules/youjizz/pages/index.py
+++ b/modules/youjizz/pages/index.py
@@ -18,12 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
-import datetime
import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import ListElement, method, ItemElement
-from weboob.tools.browser2.filters import Filter, Link, CleanText
+from weboob.tools.browser2.filters import Filter, Link, CleanText, Duration
from weboob.capabilities.image import BaseImage
from weboob.capabilities.video import BaseVideo
@@ -45,23 +44,6 @@ class IndexPage(HTMLPage):
def filter(self, link):
return re.sub(r'/videos/(.+)\.html', r'\1', link)
- class Duration(Filter):
- def filter(self, txt):
- time_txt = txt.replace(';', ':')
- hours, minutes, seconds = 0, 0, 0
- if ':' in time_txt:
- t = time_txt.split(':')
- t.reverse()
- seconds = int(t[0])
- minutes = int(t[1])
- if len(t) == 3:
- hours = int(t[2])
- elif time_txt != 'N/A':
- raise ValueError('Unable to parse the video duration: %s' % time_txt)
-
- return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
-
-
obj_id = Id(Link('.//a'))
obj_title = CleanText('.//span[@id="title1"]')
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))
diff --git a/modules/youjizz/pages/video.py b/modules/youjizz/pages/video.py
index 2e905a19..c74febcc 100644
--- a/modules/youjizz/pages/video.py
+++ b/modules/youjizz/pages/video.py
@@ -18,13 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
-import datetime
import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import method, ItemElement
-from weboob.tools.browser2.filters import CleanText, Env
-from weboob.capabilities.base import NotAvailable
+from weboob.tools.browser2.filters import CleanText, Env, Duration
from weboob.capabilities.video import BaseVideo
from weboob.tools.misc import to_unicode
@@ -41,19 +39,7 @@ class VideoPage(HTMLPage):
obj_title = CleanText('//title')
obj_nsfw = True
obj_ext = u'flv'
-
- def obj_duration(self):
- # youjizz HTML is crap, we must parse it with regexps
- m = re.search(r'.*?Runtime.*? (.+?)', self.page.response.text)
- if m:
- txt = m.group(1).strip()
- if txt == 'Unknown':
- return NotAvailable
- else:
- minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
- return datetime.timedelta(minutes=minutes, seconds=seconds)
- else:
- raise ValueError('Unable to retrieve video duration')
+ obj_duration = Duration(CleanText('//div[@id="video_text"]'))
def obj_url(self):
real_id = int(self.env['id'].split('-')[-1])
diff --git a/weboob/tools/browser2/filters.py b/weboob/tools/browser2/filters.py
index 68e9c576..474b1445 100644
--- a/weboob/tools/browser2/filters.py
+++ b/weboob/tools/browser2/filters.py
@@ -23,7 +23,7 @@ from dateutil.parser import parse as parse_date
import datetime
from decimal import Decimal
import re
-from weboob.capabilities.base import NotAvailable
+from weboob.capabilities.base import empty
_NO_DEFAULT = object()
@@ -68,7 +68,7 @@ class Filter(_Filter):
"""
This method have to be overrided by children classes.
"""
- return value
+ raise NotImplementedError()
class Env(_Filter):
@@ -139,15 +139,15 @@ class CleanText(Filter):
return self.remove(txt, self.symbols)
@classmethod
- def clean(self, txt):
+ def clean(cls, txt):
if not isinstance(txt, basestring):
txt = [t.strip() for t in txt.itertext()]
txt = u' '.join(txt) # 'foo bar'
- txt = re.sub(u'[\s\xa0\t]+', u' ', txt) # 'foo bar'
+ txt = re.sub(u'[\\s\xa0\t]+', u' ', txt) # 'foo bar'
return txt.strip()
@classmethod
- def remove(self, txt, symbols):
+ def remove(cls, txt, symbols):
for symbol in symbols:
txt = txt.replace(symbol, '')
return txt
@@ -159,7 +159,7 @@ class CleanDecimal(CleanText):
def filter(self, text):
text = super(CleanDecimal, self).filter(text)
text = text.replace('.','').replace(',','.')
- return Decimal(re.sub(u'[^\d\-\.]', '', text))
+ return Decimal(re.sub(ur'[^\d\-\.]', '', text))
class Link(Filter):
"""
@@ -184,7 +184,7 @@ class Field(_Filter):
class Regexp(Filter):
- """
+ r"""
Apply a regex.
>>> from lxml.html import etree
@@ -216,14 +216,14 @@ class Regexp(Filter):
return mobj.expand(self.template)
class Map(Filter):
- def __init__(self, selector, map, default=_NO_DEFAULT):
+ def __init__(self, selector, map_dict, default=_NO_DEFAULT):
super(Map, self).__init__(selector)
- self.map = map
+ self.map_dict = map_dict
self.default = default
def filter(self, txt):
try:
- return self.map[txt]
+ return self.map_dict[txt]
except KeyError:
if self.default is not _NO_DEFAULT:
return self.default
@@ -232,18 +232,29 @@ class Map(Filter):
class Date(Filter):
def filter(self, txt):
- if txt is NotAvailable:
- return NotAvailable
+ if empty(txt):
+ return txt
return parse_date(txt)
class Time(Filter):
+ klass = datetime.time
+ regexp = re.compile(ur'(?P<hh>\d+):?(?P<mm>\d+)(:(?P<ss>\d+))?')
+ kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'}
+
def filter(self, txt):
- m = re.search('((?P<hh>\d+):)?(?P<mm>\d+):(?P<ss>\d+)', txt)
+ m = self.regexp.search(txt)
if m:
- hh = int(m.groupdict()['hh'] or 0)
- mm = int(m.groupdict()['mm'] or 0)
- ss = int(m.groupdict()['ss'] or 0)
- return datetime.time(hh, mm, ss)
+ kwargs = {}
+ for key, index in self.kwargs.iteritems():
+ kwargs[key] = int(m.groupdict()[index] or 0)
+ return self.klass(**kwargs)
+
+
+class Duration(Time):
+ klass = datetime.timedelta
+ regexp = re.compile(ur'((?P<hh>\d+)[:;])?(?P<mm>\d+)[;:](?P<ss>\d+)')
+ kwargs = {'hours': 'hh', 'minutes': 'mm', 'seconds': 'ss'}
+
class Attr(_Filter):
def __init__(self, xpath, attr):