add filter Duration

This commit is contained in:
Romain Bignon 2014-03-19 20:27:22 +01:00
commit 06d1907d3d
3 changed files with 31 additions and 52 deletions

View file

@ -18,12 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import ListElement, method, ItemElement
from weboob.tools.browser2.filters import Filter, Link, CleanText
from weboob.tools.browser2.filters import Filter, Link, CleanText, Duration
from weboob.capabilities.image import BaseImage
from weboob.capabilities.video import BaseVideo
@ -45,23 +44,6 @@ class IndexPage(HTMLPage):
def filter(self, link):
    """Rewrite a ``/videos/<id>.html`` fragment of ``link`` to ``<id>``.

    Text around the matched fragment is kept as is, and a link that
    contains no such fragment is returned unchanged.
    """
    video_page = r'/videos/(.+)\.html'
    keep_id_only = r'\1'
    return re.sub(video_page, keep_id_only, link)
class Duration(Filter):
    """Turn a ``MM:SS`` or ``H:MM:SS`` string into a ``datetime.timedelta``."""

    def filter(self, txt):
        """Parse ``txt`` as a duration.

        ``;`` is accepted in place of ``:`` as the field separator. The
        literal ``'N/A'`` yields a zero-length timedelta.

        Raises ``ValueError`` when ``txt`` has no separator and is not
        ``'N/A'``, when it has more than three fields, or when a field
        is not an integer.
        """
        time_txt = txt.replace(';', ':')
        if ':' not in time_txt:
            if time_txt != 'N/A':
                raise ValueError('Unable to parse the video duration: %s' % time_txt)
            return datetime.timedelta()

        fields = time_txt.split(':')
        if len(fields) > 3:
            # Reject rather than silently ignore the extra leading fields,
            # which would produce a wrong duration.
            raise ValueError('Unable to parse the video duration: %s' % time_txt)

        # Left-pad with zeros so that MM:SS is read as 0:MM:SS.
        hours, minutes, seconds = [0] * (3 - len(fields)) + [int(f) for f in fields]
        return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
obj_id = Id(Link('.//a'))
obj_title = CleanText('.//span[@id="title1"]')
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))

View file

@ -18,13 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import method, ItemElement
from weboob.tools.browser2.filters import CleanText, Env
from weboob.capabilities.base import NotAvailable
from weboob.tools.browser2.filters import CleanText, Env, Duration
from weboob.capabilities.video import BaseVideo
from weboob.tools.misc import to_unicode
@ -41,19 +39,7 @@ class VideoPage(HTMLPage):
obj_title = CleanText('//title')
obj_nsfw = True
obj_ext = u'flv'
def obj_duration(self):
    """Extract the video runtime from the raw page text.

    Returns ``NotAvailable`` when the page reports the runtime as
    ``Unknown``; otherwise a ``datetime.timedelta`` built from a
    ``MM:SS`` or ``H:MM:SS`` value.

    Raises ``ValueError`` if no runtime marker is found in the page, or
    if the value does not consist of two or three integer fields.
    """
    # youjizz HTML is crap, we must parse it with regexps
    m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', self.page.response.text)
    if m is None:
        raise ValueError('Unable to retrieve video duration')

    txt = m.group(1).strip()
    if txt == 'Unknown':
        return NotAvailable

    fields = [int(v) for v in to_unicode(txt).split(':')]
    if len(fields) not in (2, 3):
        raise ValueError('Unable to parse video duration: %s' % txt)

    # Left-pad with zeros so that MM:SS is read as 0:MM:SS.
    hours, minutes, seconds = [0] * (3 - len(fields)) + fields
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
obj_duration = Duration(CleanText('//div[@id="video_text"]'))
def obj_url(self):
real_id = int(self.env['id'].split('-')[-1])

View file

@ -23,7 +23,7 @@ from dateutil.parser import parse as parse_date
import datetime
from decimal import Decimal
import re
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.base import empty
_NO_DEFAULT = object()
@ -68,7 +68,7 @@ class Filter(_Filter):
"""
This method has to be overridden by child classes.
"""
return value
raise NotImplementedError()
class Env(_Filter):
@ -139,15 +139,15 @@ class CleanText(Filter):
return self.remove(txt, self.symbols)
@classmethod
def clean(cls, txt):
    """Collapse runs of whitespace in ``txt`` and strip both ends.

    ``txt`` may be a string, or any object exposing ``itertext()``
    (presumably an lxml element -- confirm against callers). In the
    latter case its text fragments are stripped individually and
    joined with single spaces before the whitespace is collapsed.
    """
    if not isinstance(txt, basestring):
        fragments = [t.strip() for t in txt.itertext()]
        txt = u' '.join(fragments)
    # \xa0 (non-breaking space) is listed explicitly alongside \s.
    txt = re.sub(u'[\\s\xa0\t]+', u' ', txt)
    return txt.strip()
@classmethod
def remove(cls, txt, symbols):
    """Return ``txt`` with every occurrence of each item of ``symbols`` removed.

    Symbols are removed one after another, in iteration order, so a
    later symbol is matched against the text left by the earlier ones.
    """
    for symbol in symbols:
        txt = txt.replace(symbol, '')
    return txt
@ -159,7 +159,7 @@ class CleanDecimal(CleanText):
def filter(self, text):
    """Convert cleaned text to a ``Decimal``.

    The text is first run through the parent class's ``filter``. Every
    ``.`` is then removed and every ``,`` becomes ``.``; any remaining
    character other than a digit, ``-`` or ``.`` is discarded before
    the conversion.
    """
    text = super(CleanDecimal, self).filter(text)
    # '.' is dropped and ',' becomes the decimal point,
    # e.g. '1.234,56' -> '1234.56'.
    text = text.replace('.', '').replace(',', '.')
    # Escaped u'' literal: same pattern as the raw form, valid on
    # interpreters that reject the ur'' prefix.
    return Decimal(re.sub(u'[^\\d\\-\\.]', '', text))
class Link(Filter):
"""
@ -184,7 +184,7 @@ class Field(_Filter):
class Regexp(Filter):
"""
r"""
Apply a regex.
>>> from lxml.html import etree
@ -216,14 +216,14 @@ class Regexp(Filter):
return mobj.expand(self.template)
class Map(Filter):
def __init__(self, selector, map_dict, default=_NO_DEFAULT):
    """Store the lookup table and the optional fallback value.

    ``selector`` is forwarded to the parent constructor. ``map_dict``
    is the mapping that ``filter`` looks values up in. ``default``,
    when given, is what ``filter`` returns for a key missing from
    ``map_dict``.
    """
    super(Map, self).__init__(selector)
    self.map_dict = map_dict
    self.default = default
def filter(self, txt):
try:
return self.map[txt]
return self.map_dict[txt]
except KeyError:
if self.default is not _NO_DEFAULT:
return self.default
@ -232,18 +232,29 @@ class Map(Filter):
class Date(Filter):
    """Parse text into a date with ``dateutil``'s ``parse``."""

    def filter(self, txt):
        """Return ``parse_date(txt)``.

        A value for which ``empty()`` is true is returned untouched
        instead of being handed to the parser.
        """
        if empty(txt):
            return txt
        return parse_date(txt)
class Time(Filter):
    """Parse an ``HH:MM[:SS]`` value out of text and build a ``klass`` from it.

    The colon between hours and minutes is optional in ``regexp``.
    Subclasses may override the three class attributes below to target
    another type (``Duration`` does so for ``datetime.timedelta``).
    """

    # Type instantiated with the parsed fields.
    klass = datetime.time
    # Escaped u'' literal rather than ur'', which Python 3 rejects.
    regexp = re.compile(u'(?P<hh>\\d+):?(?P<mm>\\d+)(:(?P<ss>\\d+))?')
    # Maps each keyword argument of ``klass`` to a named group of ``regexp``.
    kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'}

    def filter(self, txt):
        """Return ``klass`` built from the first match of ``regexp`` in ``txt``.

        Returns ``None`` when ``regexp`` does not match.
        """
        m = self.regexp.search(txt)
        if m is None:
            return None
        # A group that did not take part in the match is None; count it as 0.
        values = dict((arg, int(m.group(name) or 0))
                      for arg, name in self.kwargs.items())
        return self.klass(**values)
class Duration(Time):
    """``Time`` variant that yields a ``datetime.timedelta``.

    Accepts ``MM:SS`` or ``H:MM:SS``, with ``:`` or ``;`` as the field
    separator.
    """

    # Type instantiated with the parsed fields.
    klass = datetime.timedelta
    # The hours group and its separator are optional as a unit.
    regexp = re.compile(u'((?P<hh>\\d+)[:;])?(?P<mm>\\d+)[;:](?P<ss>\\d+)')
    # Maps each timedelta keyword argument to a named group of ``regexp``.
    kwargs = {'hours': 'hh', 'minutes': 'mm', 'seconds': 'ss'}
class Attr(_Filter):
def __init__(self, xpath, attr):