From 20bea658f3f68028a5999faf074b4daea3122794 Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Thu, 1 May 2014 03:04:22 +0200 Subject: [PATCH] [francetelevisions] adapt to browser2 --- modules/francetelevisions/backend.py | 17 ++-- modules/francetelevisions/browser.py | 70 +++------------- modules/francetelevisions/pages.py | 119 ++++++++++++++------------- modules/francetelevisions/test.py | 4 +- modules/francetelevisions/video.py | 34 -------- 5 files changed, 78 insertions(+), 166 deletions(-) delete mode 100644 modules/francetelevisions/video.py diff --git a/modules/francetelevisions/backend.py b/modules/francetelevisions/backend.py index 07e6b455..c2d1c70b 100644 --- a/modules/francetelevisions/backend.py +++ b/modules/francetelevisions/backend.py @@ -18,14 +18,11 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. - - from weboob.capabilities.video import ICapVideo, BaseVideo from weboob.capabilities.collection import ICapCollection, CollectionNotFound from weboob.tools.backend import BaseBackend from .browser import PluzzBrowser -from .video import PluzzVideo __all__ = ['PluzzBackend'] @@ -41,21 +38,17 @@ class PluzzBackend(BaseBackend, ICapVideo, ICapCollection): BROWSER = PluzzBrowser def get_video(self, _id): - with self.browser: - return self.browser.get_video(_id) + return self.browser.get_video(_id) def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): - with self.browser: - return self.browser.search_videos(pattern) + return self.browser.search_videos(pattern) def fill_video(self, video, fields): if fields != ['thumbnail']: # if we don't want only the thumbnail, we probably want also every fields - with self.browser: - video = self.browser.get_video(PluzzVideo.id2url(video.id), video) + video = self.browser.get_video(video.id, video) if 'thumbnail' in fields and video.thumbnail: - with self.browser: - video.thumbnail.data = self.browser.readurl(video.thumbnail.url) + video.thumbnail.data = self.browser.readurl(video.thumbnail.url) return
video @@ -76,4 +69,4 @@ class PluzzBackend(BaseBackend, ICapVideo, ICapCollection): return raise CollectionNotFound(collection.split_path) - OBJECTS = {PluzzVideo: fill_video} + OBJECTS = {BaseVideo: fill_video} diff --git a/modules/francetelevisions/browser.py b/modules/francetelevisions/browser.py index 5607bcb7..bb1d5fd5 100644 --- a/modules/francetelevisions/browser.py +++ b/modules/francetelevisions/browser.py @@ -17,76 +17,26 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>. -import datetime - -from lxml import etree - -from weboob.tools.browser import BaseBrowser -from weboob.tools.browser.decorators import id2url - +from weboob.tools.browser2 import PagesBrowser, URL from .pages import IndexPage, VideoPage -from .video import PluzzVideo - __all__ = ['PluzzBrowser'] -class PluzzBrowser(BaseBrowser): - DOMAIN = 'pluzz.francetv.fr' +class PluzzBrowser(PagesBrowser): ENCODING = 'utf-8' - PAGES = {r'http://[w\.]*pluzz.francetv.fr/replay/1': IndexPage, - r'http://[w\.]*pluzz.francetv.fr/recherche.*': IndexPage, - r'http://[w\.]*pluzz.francetv.fr/videos/(.+).html': VideoPage, - } - @id2url(PluzzVideo.id2url) - def get_video(self, url, video=None): - self.location(url) - assert self.is_on_page(VideoPage) + BASEURL = 'http://pluzz.francetv.fr' - _id = self.page.get_id() - if video is None: - video = PluzzVideo(_id) - - infourl = self.page.get_info_url() - if infourl is not None: - self.parse_info(self.openurl(infourl).read(), video) - - return video - - def home(self): - self.search_videos('') + index_page = URL('recherche\?recherche=(?P<pattern>.*)', IndexPage) + latest_page = URL('lesplusrecents', IndexPage) + video_page = URL('http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<_id>.*)&catalogue=Pluzz', VideoPage) def search_videos(self, pattern): - self.location(self.buildurl('/recherche', recherche=pattern.encode('utf-8'))) + return
self.index_page.go(pattern=pattern).iter_videos() - assert self.is_on_page(IndexPage) - return self.page.iter_videos() + def get_video(self, _id, video=None): + return self.video_page.go(_id=_id).get_video(obj=video) def latest_videos(self): - self.home() - - assert self.is_on_page(IndexPage) - return self.page.iter_videos() - - def parse_info(self, data, video): - parser = etree.XMLParser(encoding='utf-8') - root = etree.XML(data, parser) - assert root.tag == 'oeuvre' - - video.title = unicode(root.findtext('titre')) - - hours, minutes, seconds = root.findtext('duree').split(':') - video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds)) - - for vid in root.find('videos'): - if vid.findtext('statut') == 'ONLINE' and vid.findtext('format') == 'wmv': - video.url = unicode(vid.findtext('url')) - - date = root.findtext('diffusions/diffusion') - if date: - video.date = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M') - - video.description = unicode(root.findtext('synopsis')) - - return video + return self.latest_page.go().iter_videos() diff --git a/modules/francetelevisions/pages.py b/modules/francetelevisions/pages.py index 02b9e4b5..558a4b1f 100644 --- a/modules/francetelevisions/pages.py +++ b/modules/francetelevisions/pages.py @@ -17,71 +17,74 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>.
-import datetime -import re -from dateutil.parser import parse as parse_dt - -from weboob.capabilities import UserError from weboob.capabilities.image import BaseImage -from weboob.tools.browser import BasePage, BrokenPageError +from weboob.capabilities.video import BaseVideo +from datetime import timedelta +from dateutil.parser import parse as parse_date -from .video import PluzzVideo +from weboob.tools.browser2.page import HTMLPage, method, ItemElement, ListElement, JsonPage +from weboob.tools.browser2.filters import Filter, Link, CleanText, Regexp, Attr, Format, DateTime, Env __all__ = ['IndexPage', 'VideoPage'] -class IndexPage(BasePage): - def iter_videos(self): - for div in self.parser.select(self.document.getroot(), 'article.rs-cell'): - title = self.parser.select(div, 'h3 a', 1) - url = title.attrib['href'] - m = re.match('^http://pluzz.francetv.fr/videos/(.+).html$', url) - if not m: - self.logger.debug('url %s does not match' % url) - continue - _id = m.group(1) - video = PluzzVideo(_id) - video.title = unicode(title.text.strip()) - for p in div.xpath('.//p[@class="bientot"]'): - video.title += ' - %s' % p.text.split('|')[0].strip() - date = div.xpath('.//p[@class="diffusion"]')[0].text.split('|')[0].strip() - pattern = re.compile(r'(\d{2}-\d{2}-\d{2})(.*?)(\d{2}:\d{2})') - match = pattern.search(date) - if match: - video.date = parse_dt("%s %s" % (match.group(1), match.group(3))) - duration = div.xpath('.//span[@class="type-duree"]')[0].text.split('|')[1].strip() - if duration[-1:] == "'": - t = [0, int(duration[:-1])] - else: - t = map(int, duration.split(':')) - video.duration = datetime.timedelta(hours=t[0], minutes=t[1]) - - url = self.parser.select(div, 'a.vignette img', 1).attrib['src'] - video.thumbnail = BaseImage(url) - video.thumbnail.url = video.thumbnail.id - - yield video - - -class VideoPage(BasePage): - def on_loaded(self): - p = self.parser.select(self.document.getroot(), 'p.alert') - if len(p) > 0: - raise UserError(p[0].text) - - def 
get_info_url(self): - try: - div = self.parser.select(self.document.getroot(), 'a#current_video', 1) - except BrokenPageError: - return None +class DurationPluzz(Filter): + def filter(self, el): + duration = Regexp(CleanText('.'), '.+\|(.+)')(el[0]) + if duration[-1:] == "'": + t = [0, int(duration[:-1])] else: - m = re.match( - '^%s(\d+)$' % re.escape('http://info.francetelevisions.fr/?id-video='), - div.attrib['href']) - if m: - return r'http://pluzz.francetv.fr/appftv/webservices/video/getInfosOeuvre.php?mode=zeri&id-diffusion=%s' % m.group(1) + t = map(int, duration.split(':')) + return timedelta(hours=t[0], minutes=t[1]) - def get_id(self): - return self.groups[0] + +class IndexPage(HTMLPage): + + @method + class iter_videos(ListElement): + item_xpath = '//div[@id="section-list_results"]/article' + + class item(ItemElement): + klass = BaseVideo + + obj_title = Format('%s - %s', CleanText('h3/a'), CleanText('div[@class="rs-cell-details"]/a')) + obj_id = Regexp(Link('h3/a'), '^http://pluzz.francetv.fr/videos/.+,(.+).html$') + obj_date = DateTime(Regexp(CleanText('div/p[@class="diffusion"]', replace=[(u'à', u''), (u' ', u' ')]), '.+(\d{2}-\d{2}-\d{2}.+\d{2}).+')) + obj_duration = DurationPluzz('div/span[@class="type-duree"]') + + def obj_thumbnail(self): + url = Attr('a[@class="vignette"]/img', 'data-src')(self) + thumbnail = BaseImage(url) + thumbnail.url = thumbnail.id + return thumbnail + + +class VideoPage(JsonPage): + @method + class get_video(ItemElement): + klass = BaseVideo + + def parse(self, el): + for video in el['videos']: + if video['format'] != 'm3u8-download': + continue + self.env['url'] = video['url'] + self.env['date'] = parse_date(el['diffusion']['date_debut'], dayfirst=True) + self.env['title'] = u'%s - %s' % (el['titre'], el['sous_titre']) + hours, minutes, seconds = el['duree'].split(':') + self.env['duration'] = timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds)) + url = 'http://pluzz.francetv.fr%s' % (el['image']) + 
thumbnail = BaseImage(url) + thumbnail.url = thumbnail.id + self.env['thumbnail'] = thumbnail + self.env['description'] = el['synopsis'] + + obj_id = Env('_id') + obj_title = Env('title') + obj_url = Env('url') + obj_date = Env('date') + obj_duration = Env('duration') + obj_thumbnail = Env('thumbnail') + obj_description = Env('description') diff --git a/modules/francetelevisions/test.py b/modules/francetelevisions/test.py index 5f3cb75c..9be94836 100644 --- a/modules/francetelevisions/test.py +++ b/modules/francetelevisions/test.py @@ -27,11 +27,11 @@ class PluzzTest(BackendTest): def test_search(self): # If the test fails, it might be good news! - l = list(self.backend.search_videos('Plus belle la vie')) + l = list(self.backend.search_videos('d art')) self.assertTrue(len(l) > 0) v = l[0] self.backend.fillobj(v, ('url',)) - self.assertTrue(v.url and v.url.startswith('mms://'), 'URL for video "%s" not found: %s' % (v.id, v.url)) + self.assertTrue(v.url, 'URL for video "%s" not found: %s' % (v.id, v.url)) def test_latest(self): l = list(self.backend.iter_resources([BaseVideo], [u'latest'])) diff --git a/modules/francetelevisions/video.py b/modules/francetelevisions/video.py deleted file mode 100644 index 4acbd859..00000000 --- a/modules/francetelevisions/video.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2011 Romain Bignon -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. 
-# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see <http://www.gnu.org/licenses/>. - - -from weboob.capabilities.video import BaseVideo - - -__all__ = ['PluzzVideo'] - - -class PluzzVideo(BaseVideo): - def __init__(self, *args, **kwargs): - BaseVideo.__init__(self, *args, **kwargs) - self.ext = u'wmv' - - @classmethod - def id2url(cls, _id): - return 'http://pluzz.francetv.fr/videos/%s.html' % _id