[francetelevisions] adapt to browser2

This commit is contained in:
Bezleputh 2014-05-01 03:04:22 +02:00 committed by Florent
commit 20bea658f3
5 changed files with 79 additions and 167 deletions

View file

@ -18,14 +18,11 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.video import ICapVideo, BaseVideo from weboob.capabilities.video import ICapVideo, BaseVideo
from weboob.capabilities.collection import ICapCollection, CollectionNotFound from weboob.capabilities.collection import ICapCollection, CollectionNotFound
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend
from .browser import PluzzBrowser from .browser import PluzzBrowser
from .video import PluzzVideo
__all__ = ['PluzzBackend'] __all__ = ['PluzzBackend']
@ -41,21 +38,17 @@ class PluzzBackend(BaseBackend, ICapVideo, ICapCollection):
BROWSER = PluzzBrowser BROWSER = PluzzBrowser
def get_video(self, _id): def get_video(self, _id):
with self.browser: return self.browser.get_video(_id)
return self.browser.get_video(_id)
def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
with self.browser: return self.browser.search_videos(pattern)
return self.browser.search_videos(pattern)
def fill_video(self, video, fields): def fill_video(self, video, fields):
if fields != ['thumbnail']: if fields != ['thumbnail']:
# if more than just the thumbnail is requested, we probably want every field as well # if more than just the thumbnail is requested, we probably want every field as well
with self.browser: video = self.browser.get_video(video.id, video)
video = self.browser.get_video(PluzzVideo.id2url(video.id), video)
if 'thumbnail' in fields and video.thumbnail: if 'thumbnail' in fields and video.thumbnail:
with self.browser: video.thumbnail.data = self.browser.readurl(video.thumbnail.url)
video.thumbnail.data = self.browser.readurl(video.thumbnail.url)
return video return video
@ -76,4 +69,4 @@ class PluzzBackend(BaseBackend, ICapVideo, ICapCollection):
return return
raise CollectionNotFound(collection.split_path) raise CollectionNotFound(collection.split_path)
OBJECTS = {PluzzVideo: fill_video} OBJECTS = {BaseVideo: fill_video}

View file

@ -17,76 +17,26 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime from weboob.tools.browser2 import PagesBrowser, URL
from lxml import etree
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url
from .pages import IndexPage, VideoPage from .pages import IndexPage, VideoPage
from .video import PluzzVideo
__all__ = ['PluzzBrowser'] __all__ = ['PluzzBrowser']
class PluzzBrowser(BaseBrowser): class PluzzBrowser(PagesBrowser):
DOMAIN = 'pluzz.francetv.fr'
ENCODING = 'utf-8' ENCODING = 'utf-8'
PAGES = {r'http://[w\.]*pluzz.francetv.fr/replay/1': IndexPage,
r'http://[w\.]*pluzz.francetv.fr/recherche.*': IndexPage,
r'http://[w\.]*pluzz.francetv.fr/videos/(.+).html': VideoPage,
}
@id2url(PluzzVideo.id2url) BASEURL = 'http://pluzz.francetv.fr'
def get_video(self, url, video=None):
self.location(url)
assert self.is_on_page(VideoPage)
_id = self.page.get_id() index_page = URL('recherche\?recherche=(?P<pattern>.*)', IndexPage)
if video is None: latest_page = URL('lesplusrecents', IndexPage)
video = PluzzVideo(_id) video_page = URL('http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<_id>.*)&catalogue=Pluzz', VideoPage)
infourl = self.page.get_info_url()
if infourl is not None:
self.parse_info(self.openurl(infourl).read(), video)
return video
def home(self):
self.search_videos('')
def search_videos(self, pattern): def search_videos(self, pattern):
self.location(self.buildurl('/recherche', recherche=pattern.encode('utf-8'))) return self.index_page.go(pattern=pattern).iter_videos()
assert self.is_on_page(IndexPage) def get_video(self, _id, video=None):
return self.page.iter_videos() return self.video_page.go(_id=_id).get_video(obj=video)
def latest_videos(self): def latest_videos(self):
self.home() return self.latest_page.go().iter_videos()
assert self.is_on_page(IndexPage)
return self.page.iter_videos()
def parse_info(self, data, video):
parser = etree.XMLParser(encoding='utf-8')
root = etree.XML(data, parser)
assert root.tag == 'oeuvre'
video.title = unicode(root.findtext('titre'))
hours, minutes, seconds = root.findtext('duree').split(':')
video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
for vid in root.find('videos'):
if vid.findtext('statut') == 'ONLINE' and vid.findtext('format') == 'wmv':
video.url = unicode(vid.findtext('url'))
date = root.findtext('diffusions/diffusion')
if date:
video.date = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M')
video.description = unicode(root.findtext('synopsis'))
return video

View file

@ -17,71 +17,74 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re
from dateutil.parser import parse as parse_dt
from weboob.capabilities import UserError
from weboob.capabilities.image import BaseImage from weboob.capabilities.image import BaseImage
from weboob.tools.browser import BasePage, BrokenPageError from weboob.capabilities.video import BaseVideo
from datetime import timedelta
from dateutil.parser import parse as parse_date
from .video import PluzzVideo from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, JsonPage
from weboob.tools.browser2.filters import Filter, Link, CleanText, Regexp, Attr, Format, DateTime, Env
__all__ = ['IndexPage', 'VideoPage'] __all__ = ['IndexPage', 'VideoPage']
class IndexPage(BasePage): class DurationPluzz(Filter):
def iter_videos(self): def filter(self, el):
for div in self.parser.select(self.document.getroot(), 'article.rs-cell'): duration = Regexp(CleanText('.'), '.+\|(.+)')(el[0])
title = self.parser.select(div, 'h3 a', 1) if duration[-1:] == "'":
url = title.attrib['href'] t = [0, int(duration[:-1])]
m = re.match('^http://pluzz.francetv.fr/videos/(.+).html$', url)
if not m:
self.logger.debug('url %s does not match' % url)
continue
_id = m.group(1)
video = PluzzVideo(_id)
video.title = unicode(title.text.strip())
for p in div.xpath('.//p[@class="bientot"]'):
video.title += ' - %s' % p.text.split('|')[0].strip()
date = div.xpath('.//p[@class="diffusion"]')[0].text.split('|')[0].strip()
pattern = re.compile(r'(\d{2}-\d{2}-\d{2})(.*?)(\d{2}:\d{2})')
match = pattern.search(date)
if match:
video.date = parse_dt("%s %s" % (match.group(1), match.group(3)))
duration = div.xpath('.//span[@class="type-duree"]')[0].text.split('|')[1].strip()
if duration[-1:] == "'":
t = [0, int(duration[:-1])]
else:
t = map(int, duration.split(':'))
video.duration = datetime.timedelta(hours=t[0], minutes=t[1])
url = self.parser.select(div, 'a.vignette img', 1).attrib['src']
video.thumbnail = BaseImage(url)
video.thumbnail.url = video.thumbnail.id
yield video
class VideoPage(BasePage):
def on_loaded(self):
p = self.parser.select(self.document.getroot(), 'p.alert')
if len(p) > 0:
raise UserError(p[0].text)
def get_info_url(self):
try:
div = self.parser.select(self.document.getroot(), 'a#current_video', 1)
except BrokenPageError:
return None
else: else:
m = re.match( t = map(int, duration.split(':'))
'^%s(\d+)$' % re.escape('http://info.francetelevisions.fr/?id-video='), return timedelta(hours=t[0], minutes=t[1])
div.attrib['href'])
if m:
return r'http://pluzz.francetv.fr/appftv/webservices/video/getInfosOeuvre.php?mode=zeri&id-diffusion=%s' % m.group(1)
def get_id(self):
return self.groups[0] class IndexPage(HTMLPage):
@method
class iter_videos(ListElement):
item_xpath = '//div[@id="section-list_results"]/article'
class item(ItemElement):
klass = BaseVideo
obj_title = Format('%s - %s', CleanText('h3/a'), CleanText('div[@class="rs-cell-details"]/a'))
obj_id = Regexp(Link('h3/a'), '^http://pluzz.francetv.fr/videos/.+,(.+).html$')
obj_date = DateTime(Regexp(CleanText('div/p[@class="diffusion"]', replace=[(u'à', u''), (u' ', u' ')]), '.+(\d{2}-\d{2}-\d{2}.+\d{2}).+'))
obj_duration = DurationPluzz('div/span[@class="type-duree"]')
def obj_thumbnail(self):
url = Attr('a[@class="vignette"]/img', 'data-src')(self)
thumbnail = BaseImage(url)
thumbnail.url = thumbnail.id
return thumbnail
class VideoPage(JsonPage):
@method
class get_video(ItemElement):
klass = BaseVideo
def parse(self, el):
for video in el['videos']:
if video['format'] != 'm3u8-download':
continue
self.env['url'] = video['url']
self.env['date'] = parse_date(el['diffusion']['date_debut'], dayfirst=True)
self.env['title'] = u'%s - %s' % (el['titre'], el['sous_titre'])
hours, minutes, seconds = el['duree'].split(':')
self.env['duration'] = timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
url = 'http://pluzz.francetv.fr%s' % (el['image'])
thumbnail = BaseImage(url)
thumbnail.url = thumbnail.id
self.env['thumbnail'] = thumbnail
self.env['description'] = el['synopsis']
obj_id = Env('_id')
obj_title = Env('title')
obj_url = Env('url')
obj_date = Env('date')
obj_duration = Env('duration')
obj_thumbnail = Env('thumbnail')
obj_description = Env('description')

View file

@ -27,11 +27,11 @@ class PluzzTest(BackendTest):
def test_search(self): def test_search(self):
# If the test fails, it might be good news! # If the test fails, it might be good news!
l = list(self.backend.search_videos('Plus belle la vie')) l = list(self.backend.search_videos('d art'))
self.assertTrue(len(l) > 0) self.assertTrue(len(l) > 0)
v = l[0] v = l[0]
self.backend.fillobj(v, ('url',)) self.backend.fillobj(v, ('url',))
self.assertTrue(v.url and v.url.startswith('mms://'), 'URL for video "%s" not found: %s' % (v.id, v.url)) self.assertTrue(v.url, 'URL for video "%s" not found: %s' % (v.id, v.url))
def test_latest(self): def test_latest(self):
l = list(self.backend.iter_resources([BaseVideo], [u'latest'])) l = list(self.backend.iter_resources([BaseVideo], [u'latest']))

View file

@ -1,34 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.video import BaseVideo
__all__ = ['PluzzVideo']
class PluzzVideo(BaseVideo):
    """BaseVideo subclass whose instances carry a fixed ``wmv`` extension."""

    def __init__(self, *args, **kwargs):
        """Forward every argument to ``BaseVideo.__init__`` and set ``ext``."""
        BaseVideo.__init__(self, *args, **kwargs)
        self.ext = u'wmv'

    @classmethod
    def id2url(cls, _id):
        """Return the pluzz.francetv.fr video page URL built from ``_id``."""
        page_url = 'http://pluzz.francetv.fr/videos/%s.html' % _id
        return page_url