From a7b9b6e9cc119413c3c153b79eb061f62ee58e84 Mon Sep 17 00:00:00 2001
From: Bezleputh
Date: Tue, 6 Jan 2015 16:43:53 +0100
Subject: [PATCH] [francetelevisions] fix #1700 pluzz's search is crap

---
Reviewer note (ignored by git am, not part of the commit message):
  * This patch was re-flowed from a copy whose line breaks and indentation
    had been collapsed. Added and removed lines are reproduced as found and
    match the hunk counts and the diffstat. The position of blank context
    lines and all leading indentation were inferred from the hunk line
    counts; confirm against the tree before applying
    (git apply --ignore-whitespace may help).
  * The URL named groups had been stripped, leaving "(?P.*)", which is not
    a valid regular expression. <pattern> and <program> are restored from
    the keyword arguments passed to .go() in search_videos(); <id> is
    assumed from the @video_page.id2url decorator. TODO confirm.
  * In search_videos(), "videos += list(self.page.iter_videos())" is placed
    inside "if video:". The collapsed copy did not show its nesting level.
    TODO confirm.
  * The replace pair (u' ', u' ') in pages.py is a no-op as written and may
    originally have held a different whitespace sequence. TODO confirm.

 modules/francetelevisions/browser.py | 21 +++++++++-
 modules/francetelevisions/pages.py   | 58 +++++++++++++++++++++++++++-
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/modules/francetelevisions/browser.py b/modules/francetelevisions/browser.py
index e0c73e20..706b7696 100644
--- a/modules/francetelevisions/browser.py
+++ b/modules/francetelevisions/browser.py
@@ -19,7 +19,7 @@
 
 from weboob.browser import PagesBrowser, URL
 
-from .pages import IndexPage, VideoPage
+from .pages import IndexPage, VideoPage, Programs, VideoListPage
 
 __all__ = ['PluzzBrowser']
 
@@ -28,12 +28,29 @@ class PluzzBrowser(PagesBrowser):
     ENCODING = 'utf-8'
 
     BASEURL = 'http://pluzz.francetv.fr'
+    PROGRAMS = None
 
+    programs_page = URL('http://pluzz.webservices.francetelevisions.fr/pluzz/programme', Programs)
     index_page = URL(r'recherche\?recherche=(?P<pattern>.*)', IndexPage)
     video_page = URL(r'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<id>.*)&catalogue=Pluzz', VideoPage)
+    videos_list_page = URL('(?P<program>videos/.*)', VideoListPage)
 
     def search_videos(self, pattern):
-        return self.index_page.go(pattern=pattern).iter_videos()
+        if not self.PROGRAMS:
+            self.PROGRAMS = self.get_program_list()
+
+        videos = []
+        for program in self.PROGRAMS:
+            if pattern.upper() in program._title.upper():
+                video = self.videos_list_page.go(program=program.id).get_last_video()
+                if video:
+                    videos.append(video)
+                    videos += list(self.page.iter_videos())
+
+        return videos if len(videos) > 0 else self.index_page.go(pattern=pattern).iter_videos()
+
+    def get_program_list(self):
+        return list(self.programs_page.go().iter_programs())
 
     @video_page.id2url
     def get_video(self, url, video=None):
diff --git a/modules/francetelevisions/pages.py b/modules/francetelevisions/pages.py
index 0bb871ce..673b4c96 100644
--- a/modules/francetelevisions/pages.py
+++ b/modules/francetelevisions/pages.py
@@ -19,7 +19,7 @@
 
 from weboob.capabilities.image import BaseImage
 from weboob.capabilities.video import BaseVideo
-
+from weboob.capabilities.base import BaseObject
 from datetime import timedelta
 
 from weboob.browser.pages import HTMLPage, JsonPage
@@ -29,6 +29,15 @@ from weboob.browser.filters.html import Link, Attr
 from weboob.browser.filters.json import Dict
 
 
+class DictElement(ListElement):
+    def find_elements(self):
+        if self.item_xpath is not None:
+            for el in self.el.get('reponse').get(self.item_xpath):
+                yield el
+        else:
+            yield self.el
+
+
 class DurationPluzz(Filter):
     def filter(self, el):
         duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0])
@@ -39,6 +48,36 @@ class DurationPluzz(Filter):
         return timedelta(hours=t[0], minutes=t[1])
 
 
+class VideoListPage(HTMLPage):
+    @method
+    class get_last_video(ItemElement):
+        klass = BaseVideo
+
+        obj_id = CleanText('//div[@id="diffusion-info"]/@data-diffusion')
+        obj_title = CleanText('//div[@id="diffusion-info"]/h1/div[@id="diffusion-titre"]')
+        obj_date = DateTime(Regexp(CleanText('//div[@id="diffusion-info"]/div/div/span/span[1]',
+                                             replace=[(u'à', u''), (u' ', u' ')]),
+                                   '.+(\d{2}-\d{2}-\d{2}.+\d{1,2}h\d{1,2}).+'),
+                            dayfirst=True)
+
+    @method
+    class iter_videos(ListElement):
+        item_xpath = '//div[@id="player-memeProgramme"]/a'
+
+        class item(ItemElement):
+            klass = BaseVideo
+
+            def condition(self):
+                return CleanText('div[@class="autre-emission-c3"]')(self) == "En replay"
+
+            obj_id = Regexp(Link('.'), '^/videos/.+,(.+).html$')
+            obj_title = CleanText('//meta[@name="programme_titre"]/@content')
+            obj_date = DateTime(Regexp(CleanText('./div[@class="autre-emission-c2"]',
+                                                 replace=[(u'à', u''), (u' ', u' ')]),
+                                       '(\d{2}-\d{2}.+\d{1,2}:\d{1,2})'),
+                                dayfirst=True)
+
+
 class IndexPage(HTMLPage):
 
     @method
@@ -48,7 +87,10 @@ class IndexPage(HTMLPage):
         class item(ItemElement):
             klass = BaseVideo
 
-            obj_title = Format('%s', CleanText('div/div[@class="resultat-titre-diff"]/a'))
+            obj_title = Format('%s du %s',
+                               CleanText('div/div[@class="resultat-titre-diff"]/a'),
+                               Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]/span'),
+                                      '.+(\d{2}-\d{2}-\d{2}).+'))
             obj_id = Regexp(Link('div/div[@class="resultat-titre-diff"]/a'),
                             '^/videos/.+,(.+).html$')
             obj_date = DateTime(Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]/span',
@@ -88,3 +130,15 @@ class VideoPage(JsonPage):
             thumbnail = BaseImage(url)
             thumbnail.url = thumbnail.id
             return thumbnail
+
+
+class Programs(JsonPage):
+    @method
+    class iter_programs(DictElement):
+        item_xpath = 'programme'
+
+        class item(ItemElement):
+            klass = BaseObject
+
+            obj_id = CleanText(Dict('url'))
+            obj__title = CleanText(Dict('titre_programme'))