[francetelevisions] fix #1700 pluzz's search is crap

This commit is contained in:
Bezleputh 2015-01-06 16:43:53 +01:00 committed by Florent
commit a7b9b6e9cc
2 changed files with 75 additions and 4 deletions

View file

@ -19,7 +19,7 @@
from weboob.browser import PagesBrowser, URL from weboob.browser import PagesBrowser, URL
from .pages import IndexPage, VideoPage from .pages import IndexPage, VideoPage, Programs, VideoListPage
__all__ = ['PluzzBrowser'] __all__ = ['PluzzBrowser']
@ -28,12 +28,29 @@ class PluzzBrowser(PagesBrowser):
ENCODING = 'utf-8' ENCODING = 'utf-8'
BASEURL = 'http://pluzz.francetv.fr' BASEURL = 'http://pluzz.francetv.fr'
PROGRAMS = None
programs_page = URL('http://pluzz.webservices.francetelevisions.fr/pluzz/programme', Programs)
index_page = URL(r'recherche\?recherche=(?P<pattern>.*)', IndexPage) index_page = URL(r'recherche\?recherche=(?P<pattern>.*)', IndexPage)
video_page = URL(r'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<id>.*)&catalogue=Pluzz', VideoPage) video_page = URL(r'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/\?idDiffusion=(?P<id>.*)&catalogue=Pluzz', VideoPage)
videos_list_page = URL('(?P<program>videos/.*)', VideoListPage)
def search_videos(self, pattern):
    """Search videos by matching *pattern* against program titles.

    The program catalogue is fetched through ``get_program_list()`` the
    first time it is needed and kept in ``self.PROGRAMS``. For each program
    whose ``_title`` contains *pattern* (both compared upper-cased), the
    program's video page is opened and its last video is collected,
    followed by the other videos listed on that page.

    If no program produced a video, the site's own search page is queried
    instead.

    :param pattern: text to look for in program titles
    :returns: a list of videos when at least one program matched,
              otherwise the result of ``iter_videos()`` on the search
              page (not necessarily a list)
    """
    if not self.PROGRAMS:
        self.PROGRAMS = self.get_program_list()

    # Upper-case the pattern once rather than once per program.
    needle = pattern.upper()

    videos = []
    for program in self.PROGRAMS:
        if needle not in program._title.upper():
            continue

        video = self.videos_list_page.go(program=program.id).get_last_video()
        if video:
            videos.append(video)
            # NOTE(review): presumably self.page is the program page that
            # go() loaded just above; nesting under ``if video`` was
            # reconstructed from an unindented view - confirm.
            videos.extend(self.page.iter_videos())

    if videos:
        return videos

    # Nothing matched in the catalogue: fall back to the site search.
    return self.index_page.go(pattern=pattern).iter_videos()
def get_program_list(self):
    """Open the programs page and return everything it iterates as a list."""
    page = self.programs_page.go()
    return list(page.iter_programs())
@video_page.id2url @video_page.id2url
def get_video(self, url, video=None): def get_video(self, url, video=None):

View file

@ -19,7 +19,7 @@
from weboob.capabilities.image import BaseImage from weboob.capabilities.image import BaseImage
from weboob.capabilities.video import BaseVideo from weboob.capabilities.video import BaseVideo
from weboob.capabilities.base import BaseObject
from datetime import timedelta from datetime import timedelta
from weboob.browser.pages import HTMLPage, JsonPage from weboob.browser.pages import HTMLPage, JsonPage
@ -29,6 +29,15 @@ from weboob.browser.filters.html import Link, Attr
from weboob.browser.filters.json import Dict from weboob.browser.filters.json import Dict
class DictElement(ListElement):
    """List element that walks a mapping instead of an XPath result.

    ``item_xpath`` is used here as a key name: the items are read from
    ``self.el['reponse'][item_xpath]``.
    """

    def find_elements(self):
        """Yield each entry stored under ``item_xpath``, or ``self.el`` itself.

        When ``item_xpath`` is ``None`` the whole element is yielded once.
        """
        if self.item_xpath is None:
            yield self.el
            return

        # assumes self.el is a dict-like object exposing .get() - TODO confirm
        entries = self.el.get('reponse').get(self.item_xpath)
        for entry in entries:
            yield entry
class DurationPluzz(Filter): class DurationPluzz(Filter):
def filter(self, el): def filter(self, el):
duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0]) duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0])
@ -39,6 +48,36 @@ class DurationPluzz(Filter):
return timedelta(hours=t[0], minutes=t[1]) return timedelta(hours=t[0], minutes=t[1])
class VideoListPage(HTMLPage):
    """HTML page of a program, exposing its current video and related ones.

    NOTE(review): this block was reconstructed from a rendering that may
    collapse whitespace; confirm the exact characters inside the ``replace``
    pairs against the repository before applying.
    """

    @method
    class get_last_video(ItemElement):
        # Builds a single BaseVideo from the "diffusion-info" block.
        klass = BaseVideo

        obj_id = CleanText('//div[@id="diffusion-info"]/@data-diffusion')
        obj_title = CleanText('//div[@id="diffusion-info"]/h1/div[@id="diffusion-titre"]')
        # Raw string: "\d" is not a valid escape in a normal string literal.
        # The captured text has the shape "dd-dd-dd ... HhM".
        obj_date = DateTime(Regexp(CleanText('//div[@id="diffusion-info"]/div/div/span/span[1]',
                                             replace=[(u'à', u''), (u' ', u' ')]),
                                   r'.+(\d{2}-\d{2}-\d{2}.+\d{1,2}h\d{1,2}).+'),
                            dayfirst=True)

    @method
    class iter_videos(ListElement):
        # One candidate per link inside the "player-memeProgramme" block.
        item_xpath = '//div[@id="player-memeProgramme"]/a'

        class item(ItemElement):
            klass = BaseVideo

            def condition(self):
                # Only keep entries whose "autre-emission-c3" cell reads "En replay".
                return CleanText('div[@class="autre-emission-c3"]')(self) == "En replay"

            # The id is the part of the link between the last comma and the
            # ".html" suffix.
            obj_id = Regexp(Link('.'), r'^/videos/.+,(.+).html$')
            # The title comes from the page-wide "programme_titre" meta tag,
            # so every item of the page shares it.
            obj_title = CleanText('//meta[@name="programme_titre"]/@content')
            # Raw string for the same reason as above; captures "dd-dd ... H:M".
            obj_date = DateTime(Regexp(CleanText('./div[@class="autre-emission-c2"]',
                                                 replace=[(u'à', u''), (u' ', u' ')]),
                                       r'(\d{2}-\d{2}.+\d{1,2}:\d{1,2})'),
                                dayfirst=True)
class IndexPage(HTMLPage): class IndexPage(HTMLPage):
@method @method
@ -48,7 +87,10 @@ class IndexPage(HTMLPage):
class item(ItemElement): class item(ItemElement):
klass = BaseVideo klass = BaseVideo
obj_title = Format('%s', CleanText('div/div[@class="resultat-titre-diff"]/a')) obj_title = Format('%s du %s',
CleanText('div/div[@class="resultat-titre-diff"]/a'),
Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]/span'),
'.+(\d{2}-\d{2}-\d{2}).+'))
obj_id = Regexp(Link('div/div[@class="resultat-titre-diff"]/a'), obj_id = Regexp(Link('div/div[@class="resultat-titre-diff"]/a'),
'^/videos/.+,(.+).html$') '^/videos/.+,(.+).html$')
obj_date = DateTime(Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]/span', obj_date = DateTime(Regexp(CleanText('div/div[@class="resultat-soustitre-diff"]/span',
@ -88,3 +130,15 @@ class VideoPage(JsonPage):
thumbnail = BaseImage(url) thumbnail = BaseImage(url)
thumbnail.url = thumbnail.id thumbnail.url = thumbnail.id
return thumbnail return thumbnail
class Programs(JsonPage):
    """JSON page listing the available programs.

    PluzzBrowser.search_videos() reads ``id`` and ``_title`` from the
    objects produced by ``iter_programs``.
    """

    @method
    class iter_programs(DictElement):
        # DictElement.find_elements() looks this key up under the top-level
        # 'reponse' entry of the JSON document.
        item_xpath = 'programme'

        class item(ItemElement):
            klass = BaseObject

            # Taken from the entry's 'url' value; search_videos() passes it
            # as the ``program`` argument of videos_list_page.
            obj_id = CleanText(Dict('url'))
            # presumably stored as ``_title`` on the object (search_videos()
            # reads program._title) - confirm the obj_ prefix convention
            obj__title = CleanText(Dict('titre_programme'))