diff --git a/modules/francetelevisions/browser.py b/modules/francetelevisions/browser.py index 0830c2a7..4d4efb2b 100644 --- a/modules/francetelevisions/browser.py +++ b/modules/francetelevisions/browser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2011 Romain Bignon +# Copyright(C) 2011-2012 Romain Bignon, Laurent Bachelier # # This file is part of weboob. # @@ -17,10 +17,14 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . +import datetime + +from lxml import etree + from weboob.tools.browser import BaseBrowser from weboob.tools.browser.decorators import id2url -from .pages import IndexPage, VideoPage, MetaVideoPage +from .pages import IndexPage, VideoPage from .video import PluzzVideo @@ -29,12 +33,11 @@ __all__ = ['PluzzBrowser'] class PluzzBrowser(BaseBrowser): DOMAIN = 'pluzz.fr' - ENCODING = None + ENCODING = 'ISO-8859-1' PAGES = {r'http://[w\.]*pluzz.fr/?': IndexPage, r'http://[w\.]*pluzz.fr/recherche.html.*': IndexPage, r'http://[w\.]*pluzz.fr/[-\w]+/.*': IndexPage, r'http://[w\.]*pluzz.fr/((?!recherche).+)\.html': VideoPage, - r'http://info\.francetelevisions\.fr/\?id-video=.*': MetaVideoPage, } @id2url(PluzzVideo.id2url) @@ -42,15 +45,15 @@ class PluzzBrowser(BaseBrowser): self.location(url) assert self.is_on_page(VideoPage) - id = self.page.get_id() - metaurl = self.page.get_meta_url() - if metaurl is None: - return None + _id = self.page.get_id() + if video is None: + video = PluzzVideo(_id) - self.location(metaurl) - assert self.is_on_page(MetaVideoPage) + infourl = self.page.get_info_url() + if infourl is not None: + self.parse_info(self.openurl(infourl).read(), video) - return self.page.get_video(id, video) + return video def iter_search_results(self, pattern): if not pattern: @@ -60,3 +63,25 @@ class PluzzBrowser(BaseBrowser): assert self.is_on_page(IndexPage) return self.page.iter_videos() + + def parse_info(self, data, video): + parser = etree.XMLParser(encoding='utf-8') + root = etree.XML(data, parser) + assert root.tag == 'oeuvre' + + video.title = root.findtext('titre') + + hours, minutes, seconds = root.findtext('duree').split(':') + video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds)) + + for vid in root.find('videos'): + if vid.findtext('statut') == 'ONLINE' and vid.findtext('format') == 'wmv': + video.url = vid.findtext('url') + + date = root.findtext('diffusions/diffusion') + if date: + video.date = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M') + + video.description = root.findtext('synopsis') + + return video diff --git a/modules/francetelevisions/pages.py b/modules/francetelevisions/pages.py index 4fbcc30b..52c1387a 100644 --- a/modules/francetelevisions/pages.py +++ b/modules/francetelevisions/pages.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2011 Romain Bignon +# Copyright(C) 2011-2012 Romain Bignon, Laurent Bachelier # # This file is part of weboob. # @@ -22,7 +22,6 @@ import re from weboob.tools.capabilities.thumbnail import Thumbnail from weboob.tools.browser import BasePage, BrokenPageError -from weboob.capabilities.base import NotAvailable from .video import PluzzVideo @@ -34,57 +33,55 @@ __all__ = ['IndexPage', 'VideoPage'] class IndexPage(BasePage): def iter_videos(self): for div in self.parser.select(self.document.getroot(), 'li.vignette'): - url = self.parser.select(div, 'h4 a', 1).attrib['href'] - m = re.match('http://www.pluzz.fr/([^/]+).html', url) + title = self.parser.select(div, 'h4 a', 1) + url = title.attrib['href'] + m = re.match('^http://www.pluzz.fr/([^/]+)\.html$', url) if not m: - print ':(' + print ':( %s' % url continue _id = m.group(1) video = PluzzVideo(_id) - video.title = self.parser.select(div, 'h4 a', 1).text + m = re.match('^(.+) - ([0-2][0-9])h([0-5][0-9])$', title.text) + if m: + video.title = m.group(1) + hour = int(m.group(2)) + minute = int(m.group(3)) + else: + video.title = title.text + hour = 0 + minute = 0 - # Date is not available anymore on search results. - video.date = NotAvailable - #m = re.match('(\d+)/(\d+)/(\d+)', self.parser.select(div, 'p.date', 1).text) - #if m: - # video.date = datetime.datetime(int(m.group(3)), - # int(m.group(2)), - # int(m.group(1))) + m = re.match('(\d+)/(\d+)/(\d+)', self.parser.select(div, 'p.date', 1).text) + if m: + video.date = datetime.datetime(int(m.group(3)), + int(m.group(2)), + int(m.group(1)), + hour, + minute) url = self.parser.select(div, 'img.illustration', 1).attrib['src'] video.thumbnail = Thumbnail('http://www.pluzz.fr/%s' % url) yield video + class VideoPage(BasePage): def on_loaded(self): p = self.parser.select(self.document.getroot(), 'p.alert') if len(p) > 0: raise Exception(p[0].text) - def get_meta_url(self): + def get_info_url(self): try: div = self.parser.select(self.document.getroot(), 'a#current_video', 1) except BrokenPageError: return None else: - return div.attrib['href'] + m = re.match( + '^%s(\d+)$' % re.escape('http://info.francetelevisions.fr/?id-video='), + div.attrib['href']) + if m: + return r'http://www.pluzz.fr/appftv/webservices/video/getInfosOeuvre.php?mode=zeri&id-diffusion=%s' % m.group(1) def get_id(self): return self.groups[0] - -class MetaVideoPage(BasePage): - def get_meta(self, name): - return self.parser.select(self.document.getroot(), 'meta[name=%s]' % name, 1).attrib['content'] - - def get_video(self, id, video=None): - if video is None: - video = PluzzVideo(id) - - video.title = self.get_meta('vignette-titre-court') - video.url = 'mms://a988.v101995.c10199.e.vm.akamaistream.net/7/988/10199/3f97c7e6/ftvigrp.download.akamai.com/10199/cappuccino/production/publication/%s' % self.get_meta('urls-url-video') - video.description = self.get_meta('description') - hours, minutes, seconds = self.get_meta('vignette-duree').split(':') - video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds)) - - return video