add filter Duration

This commit is contained in:
Romain Bignon 2014-03-19 20:27:22 +01:00
commit 06d1907d3d
3 changed files with 31 additions and 52 deletions

View file

@ -18,12 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re import re
from weboob.tools.browser2 import HTMLPage from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import ListElement, method, ItemElement from weboob.tools.browser2.page import ListElement, method, ItemElement
from weboob.tools.browser2.filters import Filter, Link, CleanText from weboob.tools.browser2.filters import Filter, Link, CleanText, Duration
from weboob.capabilities.image import BaseImage from weboob.capabilities.image import BaseImage
from weboob.capabilities.video import BaseVideo from weboob.capabilities.video import BaseVideo
@ -45,23 +44,6 @@ class IndexPage(HTMLPage):
def filter(self, link): def filter(self, link):
return re.sub(r'/videos/(.+)\.html', r'\1', link) return re.sub(r'/videos/(.+)\.html', r'\1', link)
class Duration(Filter):
    """Convert a textual video duration into a ``datetime.timedelta``.

    ``;`` is treated as ``:`` before parsing.  When the text contains a
    separator, the last field is read as seconds and the one before it as
    minutes; hours are only read when there are exactly three fields.
    The literal ``N/A`` yields a zero duration.

    Raises ValueError for any other text that has no separator (and,
    through ``int()``, for fields that are not integers).
    """

    def filter(self, txt):
        normalized = txt.replace(';', ':')

        if ':' not in normalized:
            # Without a separator only the 'N/A' placeholder is accepted.
            if normalized != 'N/A':
                raise ValueError('Unable to parse the video duration: %s' % normalized)
            return datetime.timedelta(hours=0, minutes=0, seconds=0)

        # Fields are read from the right so that both 'MM:SS' and
        # 'HH:MM:SS' put seconds and minutes in the same positions.
        parts = normalized.split(':')
        seconds = int(parts[-1])
        minutes = int(parts[-2])
        hours = int(parts[-3]) if len(parts) == 3 else 0
        return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
obj_id = Id(Link('.//a')) obj_id = Id(Link('.//a'))
obj_title = CleanText('.//span[@id="title1"]') obj_title = CleanText('.//span[@id="title1"]')
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span')) obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))

View file

@ -18,13 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re import re
from weboob.tools.browser2 import HTMLPage from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import method, ItemElement from weboob.tools.browser2.page import method, ItemElement
from weboob.tools.browser2.filters import CleanText, Env from weboob.tools.browser2.filters import CleanText, Env, Duration
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.video import BaseVideo from weboob.capabilities.video import BaseVideo
from weboob.tools.misc import to_unicode from weboob.tools.misc import to_unicode
@ -41,19 +39,7 @@ class VideoPage(HTMLPage):
obj_title = CleanText('//title') obj_title = CleanText('//title')
obj_nsfw = True obj_nsfw = True
obj_ext = u'flv' obj_ext = u'flv'
obj_duration = Duration(CleanText('//div[@id="video_text"]'))
def obj_duration(self):
    """Extract the video runtime from the raw response text.

    Returns NotAvailable when the runtime is given as 'Unknown', otherwise
    a ``datetime.timedelta`` built from the two ':'-separated integers
    (minutes, then seconds) that follow the Runtime marker.

    Raises ValueError when the Runtime marker cannot be found in the page.
    """
    # youjizz HTML is crap, we must parse it with regexps
    found = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', self.page.response.text)
    if not found:
        raise ValueError('Unable to retrieve video duration')

    runtime = found.group(1).strip()
    if runtime == 'Unknown':
        return NotAvailable

    minutes, seconds = (int(v) for v in to_unicode(runtime).split(':'))
    return datetime.timedelta(minutes=minutes, seconds=seconds)
def obj_url(self): def obj_url(self):
real_id = int(self.env['id'].split('-')[-1]) real_id = int(self.env['id'].split('-')[-1])

View file

@ -23,7 +23,7 @@ from dateutil.parser import parse as parse_date
import datetime import datetime
from decimal import Decimal from decimal import Decimal
import re import re
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import empty
_NO_DEFAULT = object() _NO_DEFAULT = object()
@ -68,7 +68,7 @@ class Filter(_Filter):
""" """
This method have to be overrided by children classes. This method have to be overrided by children classes.
""" """
return value raise NotImplementedError()
class Env(_Filter): class Env(_Filter):
@ -139,15 +139,15 @@ class CleanText(Filter):
return self.remove(txt, self.symbols) return self.remove(txt, self.symbols)
@classmethod @classmethod
def clean(self, txt): def clean(cls, txt):
if not isinstance(txt, basestring): if not isinstance(txt, basestring):
txt = [t.strip() for t in txt.itertext()] txt = [t.strip() for t in txt.itertext()]
txt = u' '.join(txt) # 'foo bar' txt = u' '.join(txt) # 'foo bar'
txt = re.sub(u'[\s\xa0\t]+', u' ', txt) # 'foo bar' txt = re.sub(u'[\\s\xa0\t]+', u' ', txt) # 'foo bar'
return txt.strip() return txt.strip()
@classmethod @classmethod
def remove(self, txt, symbols): def remove(cls, txt, symbols):
for symbol in symbols: for symbol in symbols:
txt = txt.replace(symbol, '') txt = txt.replace(symbol, '')
return txt return txt
@ -159,7 +159,7 @@ class CleanDecimal(CleanText):
def filter(self, text): def filter(self, text):
text = super(CleanDecimal, self).filter(text) text = super(CleanDecimal, self).filter(text)
text = text.replace('.','').replace(',','.') text = text.replace('.','').replace(',','.')
return Decimal(re.sub(u'[^\d\-\.]', '', text)) return Decimal(re.sub(ur'[^\d\-\.]', '', text))
class Link(Filter): class Link(Filter):
""" """
@ -184,7 +184,7 @@ class Field(_Filter):
class Regexp(Filter): class Regexp(Filter):
""" r"""
Apply a regex. Apply a regex.
>>> from lxml.html import etree >>> from lxml.html import etree
@ -216,14 +216,14 @@ class Regexp(Filter):
return mobj.expand(self.template) return mobj.expand(self.template)
class Map(Filter): class Map(Filter):
def __init__(self, selector, map, default=_NO_DEFAULT): def __init__(self, selector, map_dict, default=_NO_DEFAULT):
super(Map, self).__init__(selector) super(Map, self).__init__(selector)
self.map = map self.map_dict = map_dict
self.default = default self.default = default
def filter(self, txt): def filter(self, txt):
try: try:
return self.map[txt] return self.map_dict[txt]
except KeyError: except KeyError:
if self.default is not _NO_DEFAULT: if self.default is not _NO_DEFAULT:
return self.default return self.default
@ -232,18 +232,29 @@ class Map(Filter):
class Date(Filter): class Date(Filter):
def filter(self, txt): def filter(self, txt):
if txt is NotAvailable: if empty(txt):
return NotAvailable return txt
return parse_date(txt) return parse_date(txt)
class Time(Filter): class Time(Filter):
klass = datetime.time
regexp = re.compile(ur'(?P<hh>\d+):?(?P<mm>\d+)(:(?P<ss>\d+))?')
kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'}
def filter(self, txt): def filter(self, txt):
m = re.search('((?P<hh>\d+):)?(?P<mm>\d+):(?P<ss>\d+)', txt) m = self.regexp.search(txt)
if m: if m:
hh = int(m.groupdict()['hh'] or 0) kwargs = {}
mm = int(m.groupdict()['mm'] or 0) for key, index in self.kwargs.iteritems():
ss = int(m.groupdict()['ss'] or 0) kwargs[key] = int(m.groupdict()[index] or 0)
return datetime.time(hh, mm, ss) return self.klass(**kwargs)
class Duration(Time):
klass = datetime.timedelta
regexp = re.compile(ur'((?P<hh>\d+)[:;])?(?P<mm>\d+)[;:](?P<ss>\d+)')
kwargs = {'hours': 'hh', 'minutes': 'mm', 'seconds': 'ss'}
class Attr(_Filter): class Attr(_Filter):
def __init__(self, xpath, attr): def __init__(self, xpath, attr):