Fix francetelevisions for website updates

Also add support for existing data that was previously unhandled.
This commit is contained in:
Laurent Bachelier 2012-02-01 00:22:12 +01:00 committed by Romain Bignon
commit f3470a0128
2 changed files with 65 additions and 43 deletions

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon # Copyright(C) 2011-2012 Romain Bignon, Laurent Bachelier
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -17,10 +17,14 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from lxml import etree
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
from .pages import IndexPage, VideoPage, MetaVideoPage from .pages import IndexPage, VideoPage
from .video import PluzzVideo from .video import PluzzVideo
@ -29,12 +33,11 @@ __all__ = ['PluzzBrowser']
class PluzzBrowser(BaseBrowser): class PluzzBrowser(BaseBrowser):
DOMAIN = 'pluzz.fr' DOMAIN = 'pluzz.fr'
ENCODING = None ENCODING = 'ISO-8859-1'
PAGES = {r'http://[w\.]*pluzz.fr/?': IndexPage, PAGES = {r'http://[w\.]*pluzz.fr/?': IndexPage,
r'http://[w\.]*pluzz.fr/recherche.html.*': IndexPage, r'http://[w\.]*pluzz.fr/recherche.html.*': IndexPage,
r'http://[w\.]*pluzz.fr/[-\w]+/.*': IndexPage, r'http://[w\.]*pluzz.fr/[-\w]+/.*': IndexPage,
r'http://[w\.]*pluzz.fr/((?!recherche).+)\.html': VideoPage, r'http://[w\.]*pluzz.fr/((?!recherche).+)\.html': VideoPage,
r'http://info\.francetelevisions\.fr/\?id-video=.*': MetaVideoPage,
} }
@id2url(PluzzVideo.id2url) @id2url(PluzzVideo.id2url)
@ -42,15 +45,15 @@ class PluzzBrowser(BaseBrowser):
self.location(url) self.location(url)
assert self.is_on_page(VideoPage) assert self.is_on_page(VideoPage)
id = self.page.get_id() _id = self.page.get_id()
metaurl = self.page.get_meta_url() if video is None:
if metaurl is None: video = PluzzVideo(_id)
return None
self.location(metaurl) infourl = self.page.get_info_url()
assert self.is_on_page(MetaVideoPage) if infourl is not None:
self.parse_info(self.openurl(infourl).read(), video)
return self.page.get_video(id, video) return video
def iter_search_results(self, pattern): def iter_search_results(self, pattern):
if not pattern: if not pattern:
@ -60,3 +63,25 @@ class PluzzBrowser(BaseBrowser):
assert self.is_on_page(IndexPage) assert self.is_on_page(IndexPage)
return self.page.iter_videos() return self.page.iter_videos()
def parse_info(self, data, video):
parser = etree.XMLParser(encoding='utf-8')
root = etree.XML(data, parser)
assert root.tag == 'oeuvre'
video.title = root.findtext('titre')
hours, minutes, seconds = root.findtext('duree').split(':')
video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
for vid in root.find('videos'):
if vid.findtext('statut') == 'ONLINE' and vid.findtext('format') == 'wmv':
video.url = vid.findtext('url')
date = root.findtext('diffusions/diffusion')
if date:
video.date = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M')
video.description = root.findtext('synopsis')
return video

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon # Copyright(C) 2011-2012 Romain Bignon, Laurent Bachelier
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -22,7 +22,6 @@ import re
from weboob.tools.capabilities.thumbnail import Thumbnail from weboob.tools.capabilities.thumbnail import Thumbnail
from weboob.tools.browser import BasePage, BrokenPageError from weboob.tools.browser import BasePage, BrokenPageError
from weboob.capabilities.base import NotAvailable
from .video import PluzzVideo from .video import PluzzVideo
@ -34,57 +33,55 @@ __all__ = ['IndexPage', 'VideoPage']
class IndexPage(BasePage): class IndexPage(BasePage):
def iter_videos(self): def iter_videos(self):
for div in self.parser.select(self.document.getroot(), 'li.vignette'): for div in self.parser.select(self.document.getroot(), 'li.vignette'):
url = self.parser.select(div, 'h4 a', 1).attrib['href'] title = self.parser.select(div, 'h4 a', 1)
m = re.match('http://www.pluzz.fr/([^/]+).html', url) url = title.attrib['href']
m = re.match('^http://www.pluzz.fr/([^/]+)\.html$', url)
if not m: if not m:
print ':(' print ':( %s' % url
continue continue
_id = m.group(1) _id = m.group(1)
video = PluzzVideo(_id) video = PluzzVideo(_id)
video.title = self.parser.select(div, 'h4 a', 1).text m = re.match('^(.+) - ([0-2][0-9])h([0-5][0-9])$', title.text)
if m:
video.title = m.group(1)
hour = int(m.group(2))
minute = int(m.group(3))
else:
video.title = title.text
hour = 0
minute = 0
# Date is not available anymore on search results. m = re.match('(\d+)/(\d+)/(\d+)', self.parser.select(div, 'p.date', 1).text)
video.date = NotAvailable if m:
#m = re.match('(\d+)/(\d+)/(\d+)', self.parser.select(div, 'p.date', 1).text) video.date = datetime.datetime(int(m.group(3)),
#if m: int(m.group(2)),
# video.date = datetime.datetime(int(m.group(3)), int(m.group(1)),
# int(m.group(2)), hour,
# int(m.group(1))) minute)
url = self.parser.select(div, 'img.illustration', 1).attrib['src'] url = self.parser.select(div, 'img.illustration', 1).attrib['src']
video.thumbnail = Thumbnail('http://www.pluzz.fr/%s' % url) video.thumbnail = Thumbnail('http://www.pluzz.fr/%s' % url)
yield video yield video
class VideoPage(BasePage): class VideoPage(BasePage):
def on_loaded(self): def on_loaded(self):
p = self.parser.select(self.document.getroot(), 'p.alert') p = self.parser.select(self.document.getroot(), 'p.alert')
if len(p) > 0: if len(p) > 0:
raise Exception(p[0].text) raise Exception(p[0].text)
def get_meta_url(self): def get_info_url(self):
try: try:
div = self.parser.select(self.document.getroot(), 'a#current_video', 1) div = self.parser.select(self.document.getroot(), 'a#current_video', 1)
except BrokenPageError: except BrokenPageError:
return None return None
else: else:
return div.attrib['href'] m = re.match(
'^%s(\d+)$' % re.escape('http://info.francetelevisions.fr/?id-video='),
div.attrib['href'])
if m:
return r'http://www.pluzz.fr/appftv/webservices/video/getInfosOeuvre.php?mode=zeri&id-diffusion=%s' % m.group(1)
def get_id(self): def get_id(self):
return self.groups[0] return self.groups[0]
class MetaVideoPage(BasePage):
def get_meta(self, name):
return self.parser.select(self.document.getroot(), 'meta[name=%s]' % name, 1).attrib['content']
def get_video(self, id, video=None):
if video is None:
video = PluzzVideo(id)
video.title = self.get_meta('vignette-titre-court')
video.url = 'mms://a988.v101995.c10199.e.vm.akamaistream.net/7/988/10199/3f97c7e6/ftvigrp.download.akamai.com/10199/cappuccino/production/publication/%s' % self.get_meta('urls-url-video')
video.description = self.get_meta('description')
hours, minutes, seconds = self.get_meta('vignette-duree').split(':')
video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
return video