add filter Duration
This commit is contained in:
parent
16615dd337
commit
06d1907d3d
3 changed files with 31 additions and 52 deletions
|
|
@ -18,12 +18,11 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from weboob.tools.browser2 import HTMLPage
|
from weboob.tools.browser2 import HTMLPage
|
||||||
from weboob.tools.browser2.page import ListElement, method, ItemElement
|
from weboob.tools.browser2.page import ListElement, method, ItemElement
|
||||||
from weboob.tools.browser2.filters import Filter, Link, CleanText
|
from weboob.tools.browser2.filters import Filter, Link, CleanText, Duration
|
||||||
from weboob.capabilities.image import BaseImage
|
from weboob.capabilities.image import BaseImage
|
||||||
from weboob.capabilities.video import BaseVideo
|
from weboob.capabilities.video import BaseVideo
|
||||||
|
|
||||||
|
|
@ -45,23 +44,6 @@ class IndexPage(HTMLPage):
|
||||||
def filter(self, link):
|
def filter(self, link):
|
||||||
return re.sub(r'/videos/(.+)\.html', r'\1', link)
|
return re.sub(r'/videos/(.+)\.html', r'\1', link)
|
||||||
|
|
||||||
class Duration(Filter):
|
|
||||||
def filter(self, txt):
|
|
||||||
time_txt = txt.replace(';', ':')
|
|
||||||
hours, minutes, seconds = 0, 0, 0
|
|
||||||
if ':' in time_txt:
|
|
||||||
t = time_txt.split(':')
|
|
||||||
t.reverse()
|
|
||||||
seconds = int(t[0])
|
|
||||||
minutes = int(t[1])
|
|
||||||
if len(t) == 3:
|
|
||||||
hours = int(t[2])
|
|
||||||
elif time_txt != 'N/A':
|
|
||||||
raise ValueError('Unable to parse the video duration: %s' % time_txt)
|
|
||||||
|
|
||||||
return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
|
|
||||||
|
|
||||||
|
|
||||||
obj_id = Id(Link('.//a'))
|
obj_id = Id(Link('.//a'))
|
||||||
obj_title = CleanText('.//span[@id="title1"]')
|
obj_title = CleanText('.//span[@id="title1"]')
|
||||||
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))
|
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))
|
||||||
|
|
|
||||||
|
|
@ -18,13 +18,11 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
import datetime
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from weboob.tools.browser2 import HTMLPage
|
from weboob.tools.browser2 import HTMLPage
|
||||||
from weboob.tools.browser2.page import method, ItemElement
|
from weboob.tools.browser2.page import method, ItemElement
|
||||||
from weboob.tools.browser2.filters import CleanText, Env
|
from weboob.tools.browser2.filters import CleanText, Env, Duration
|
||||||
from weboob.capabilities.base import NotAvailable
|
|
||||||
from weboob.capabilities.video import BaseVideo
|
from weboob.capabilities.video import BaseVideo
|
||||||
from weboob.tools.misc import to_unicode
|
from weboob.tools.misc import to_unicode
|
||||||
|
|
||||||
|
|
@ -41,19 +39,7 @@ class VideoPage(HTMLPage):
|
||||||
obj_title = CleanText('//title')
|
obj_title = CleanText('//title')
|
||||||
obj_nsfw = True
|
obj_nsfw = True
|
||||||
obj_ext = u'flv'
|
obj_ext = u'flv'
|
||||||
|
obj_duration = Duration(CleanText('//div[@id="video_text"]'))
|
||||||
def obj_duration(self):
|
|
||||||
# youjizz HTML is crap, we must parse it with regexps
|
|
||||||
m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', self.page.response.text)
|
|
||||||
if m:
|
|
||||||
txt = m.group(1).strip()
|
|
||||||
if txt == 'Unknown':
|
|
||||||
return NotAvailable
|
|
||||||
else:
|
|
||||||
minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
|
|
||||||
return datetime.timedelta(minutes=minutes, seconds=seconds)
|
|
||||||
else:
|
|
||||||
raise ValueError('Unable to retrieve video duration')
|
|
||||||
|
|
||||||
def obj_url(self):
|
def obj_url(self):
|
||||||
real_id = int(self.env['id'].split('-')[-1])
|
real_id = int(self.env['id'].split('-')[-1])
|
||||||
|
|
|
||||||
|
|
@ -23,7 +23,7 @@ from dateutil.parser import parse as parse_date
|
||||||
import datetime
|
import datetime
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
import re
|
import re
|
||||||
from weboob.capabilities.base import NotAvailable
|
from weboob.capabilities.base import empty
|
||||||
|
|
||||||
_NO_DEFAULT = object()
|
_NO_DEFAULT = object()
|
||||||
|
|
||||||
|
|
@ -68,7 +68,7 @@ class Filter(_Filter):
|
||||||
"""
|
"""
|
||||||
This method have to be overrided by children classes.
|
This method have to be overrided by children classes.
|
||||||
"""
|
"""
|
||||||
return value
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
class Env(_Filter):
|
class Env(_Filter):
|
||||||
|
|
@ -139,15 +139,15 @@ class CleanText(Filter):
|
||||||
return self.remove(txt, self.symbols)
|
return self.remove(txt, self.symbols)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def clean(self, txt):
|
def clean(cls, txt):
|
||||||
if not isinstance(txt, basestring):
|
if not isinstance(txt, basestring):
|
||||||
txt = [t.strip() for t in txt.itertext()]
|
txt = [t.strip() for t in txt.itertext()]
|
||||||
txt = u' '.join(txt) # 'foo bar'
|
txt = u' '.join(txt) # 'foo bar'
|
||||||
txt = re.sub(u'[\s\xa0\t]+', u' ', txt) # 'foo bar'
|
txt = re.sub(u'[\\s\xa0\t]+', u' ', txt) # 'foo bar'
|
||||||
return txt.strip()
|
return txt.strip()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def remove(self, txt, symbols):
|
def remove(cls, txt, symbols):
|
||||||
for symbol in symbols:
|
for symbol in symbols:
|
||||||
txt = txt.replace(symbol, '')
|
txt = txt.replace(symbol, '')
|
||||||
return txt
|
return txt
|
||||||
|
|
@ -159,7 +159,7 @@ class CleanDecimal(CleanText):
|
||||||
def filter(self, text):
|
def filter(self, text):
|
||||||
text = super(CleanDecimal, self).filter(text)
|
text = super(CleanDecimal, self).filter(text)
|
||||||
text = text.replace('.','').replace(',','.')
|
text = text.replace('.','').replace(',','.')
|
||||||
return Decimal(re.sub(u'[^\d\-\.]', '', text))
|
return Decimal(re.sub(ur'[^\d\-\.]', '', text))
|
||||||
|
|
||||||
class Link(Filter):
|
class Link(Filter):
|
||||||
"""
|
"""
|
||||||
|
|
@ -184,7 +184,7 @@ class Field(_Filter):
|
||||||
|
|
||||||
|
|
||||||
class Regexp(Filter):
|
class Regexp(Filter):
|
||||||
"""
|
r"""
|
||||||
Apply a regex.
|
Apply a regex.
|
||||||
|
|
||||||
>>> from lxml.html import etree
|
>>> from lxml.html import etree
|
||||||
|
|
@ -216,14 +216,14 @@ class Regexp(Filter):
|
||||||
return mobj.expand(self.template)
|
return mobj.expand(self.template)
|
||||||
|
|
||||||
class Map(Filter):
|
class Map(Filter):
|
||||||
def __init__(self, selector, map, default=_NO_DEFAULT):
|
def __init__(self, selector, map_dict, default=_NO_DEFAULT):
|
||||||
super(Map, self).__init__(selector)
|
super(Map, self).__init__(selector)
|
||||||
self.map = map
|
self.map_dict = map_dict
|
||||||
self.default = default
|
self.default = default
|
||||||
|
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
try:
|
try:
|
||||||
return self.map[txt]
|
return self.map_dict[txt]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if self.default is not _NO_DEFAULT:
|
if self.default is not _NO_DEFAULT:
|
||||||
return self.default
|
return self.default
|
||||||
|
|
@ -232,18 +232,29 @@ class Map(Filter):
|
||||||
|
|
||||||
class Date(Filter):
|
class Date(Filter):
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
if txt is NotAvailable:
|
if empty(txt):
|
||||||
return NotAvailable
|
return txt
|
||||||
return parse_date(txt)
|
return parse_date(txt)
|
||||||
|
|
||||||
class Time(Filter):
|
class Time(Filter):
|
||||||
|
klass = datetime.time
|
||||||
|
regexp = re.compile(ur'(?P<hh>\d+):?(?P<mm>\d+)(:(?P<ss>\d+))?')
|
||||||
|
kwargs = {'hour': 'hh', 'minute': 'mm', 'second': 'ss'}
|
||||||
|
|
||||||
def filter(self, txt):
|
def filter(self, txt):
|
||||||
m = re.search('((?P<hh>\d+):)?(?P<mm>\d+):(?P<ss>\d+)', txt)
|
m = self.regexp.search(txt)
|
||||||
if m:
|
if m:
|
||||||
hh = int(m.groupdict()['hh'] or 0)
|
kwargs = {}
|
||||||
mm = int(m.groupdict()['mm'] or 0)
|
for key, index in self.kwargs.iteritems():
|
||||||
ss = int(m.groupdict()['ss'] or 0)
|
kwargs[key] = int(m.groupdict()[index] or 0)
|
||||||
return datetime.time(hh, mm, ss)
|
return self.klass(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class Duration(Time):
|
||||||
|
klass = datetime.timedelta
|
||||||
|
regexp = re.compile(ur'((?P<hh>\d+)[:;])?(?P<mm>\d+)[;:](?P<ss>\d+)')
|
||||||
|
kwargs = {'hours': 'hh', 'minutes': 'mm', 'seconds': 'ss'}
|
||||||
|
|
||||||
|
|
||||||
class Attr(_Filter):
|
class Attr(_Filter):
|
||||||
def __init__(self, xpath, attr):
|
def __init__(self, xpath, attr):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue