support videos on www.ina.fr (in addition to boutique.ina.fr)

This commit is contained in:
Romain Bignon 2011-08-12 15:39:33 +02:00
commit 30026290c8
4 changed files with 74 additions and 31 deletions

View file

@ -21,7 +21,7 @@
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
from .pages.video import VideoPage from .pages.video import VideoPage, BoutiqueVideoPage
from .pages.search import SearchPage from .pages.search import SearchPage
from .video import InaVideo from .video import InaVideo
@ -30,8 +30,9 @@ __all__ = ['InaBrowser']
class InaBrowser(BaseBrowser): class InaBrowser(BaseBrowser):
DOMAIN = 'boutique.ina.fr' DOMAIN = 'ina.fr'
PAGES = {'http://boutique\.ina\.fr/video/.+\.html': VideoPage, PAGES = {'http://boutique\.ina\.fr/video/.+\.html': BoutiqueVideoPage,
'http://www\.ina\.fr/.+\.html': VideoPage,
'http://boutique\.ina\.fr/recherche/.+': SearchPage, 'http://boutique\.ina\.fr/recherche/.+': SearchPage,
} }
@ -41,6 +42,6 @@ class InaBrowser(BaseBrowser):
return self.page.get_video(video) return self.page.get_video(video)
def iter_search_results(self, pattern): def iter_search_results(self, pattern):
self.location(self.buildurl('/recherche/recherche', search=pattern.encode('utf-8'))) self.location(self.buildurl('http://boutique.ina.fr/recherche/recherche', search=pattern.encode('utf-8')))
assert self.is_on_page(SearchPage) assert self.is_on_page(SearchPage)
return self.page.iter_videos() return self.page.iter_videos()

View file

@ -31,7 +31,7 @@ __all__ = ['SearchPage']
class SearchPage(BasePage): class SearchPage(BasePage):
URL_REGEXP = re.compile('/video/(.+).html') URL_REGEXP = re.compile(r'/video/(.+)\.html')
def iter_videos(self): def iter_videos(self):
try: try:
@ -40,7 +40,7 @@ class SearchPage(BasePage):
# It means there are no results. # It means there are no results.
return return
for li in ul.findall('li'): for li in ul.findall('li'):
id = re.sub(r'/video/(.+)\.html', r'\1', li.find('a').attrib['href']) id = re.sub(self.URL_REGEXP, r'\1', li.find('a').attrib['href'])
thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src'] thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src']
@ -57,7 +57,7 @@ class SearchPage(BasePage):
else: else:
raise BrokenPageError('Unable to match duration (%r)' % duration) raise BrokenPageError('Unable to match duration (%r)' % duration)
yield InaVideo(id, yield InaVideo('boutique.%s' % id,
title=title, title=title,
date=date, date=date,
duration=duration, duration=duration,

View file

@ -19,7 +19,6 @@
import datetime import datetime
from logging import warning
import re import re
try: try:
from urlparse import parse_qs from urlparse import parse_qs
@ -32,12 +31,10 @@ from weboob.tools.browser import BrokenPageError
from ..video import InaVideo from ..video import InaVideo
__all__ = ['VideoPage'] __all__ = ['VideoPage', 'BoutiqueVideoPage']
class VideoPage(BasePage): class BaseVideoPage(BasePage):
URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html')
def get_video(self, video): def get_video(self, video):
date, duration = self.get_date_and_duration() date, duration = self.get_date_and_duration()
if not video: if not video:
@ -53,29 +50,73 @@ class VideoPage(BasePage):
def get_id(self): def get_id(self):
m = self.URL_REGEXP.match(self.url) m = self.URL_REGEXP.match(self.url)
if m: if m:
return unicode(m.group(1)) return self.create_id(m.group(1))
warning('Unable to parse ID') self.logger.warning('Unable to parse ID')
return 0 return 0
def get_url(self):
    """
    Build the URL of the video file from the page's ``flashvars`` parameter.

    The ``value`` attribute of the ``param[name="flashvars"]`` element is
    parsed as a query string, and its ``id_notice`` and ``token_notice``
    values are inserted into the mp4.ina.fr URL.

    :returns: URL of the video on mp4.ina.fr
    :raises BrokenPageError: if the parameter or one of the two values
                             is missing from the page
    """
    params = self.document.getroot().cssselect('param[name="flashvars"]')
    if not params:
        raise BrokenPageError('Unable to find flashvars parameter')

    # A missing attribute or key means the page layout changed: report it the
    # same way as the other parsing failures instead of leaking a KeyError.
    try:
        qs = parse_qs(params[0].attrib['value'])
        id_notice = qs['id_notice'][0]
        token_notice = qs['token_notice'][0]
    except KeyError:
        raise BrokenPageError('Unable to find id_notice and token_notice in flashvars')

    return 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (id_notice, token_notice)
def parse_date_and_duration(self, text):
    """
    Parse a text containing the date and the duration of a video.

    The expected shape is ``[<prefix> - ]<day>/<month>/<year> - [<H>h][<M>min]<S>s``.
    The optional leading ``<prefix> - `` part is ignored.

    :param text: text to parse
    :returns: tuple ``(date, duration)``, respectively a ``datetime.datetime``
              and a ``datetime.timedelta``
    :raises BrokenPageError: if the text does not match the expected shape or
                             contains values which are not valid numbers/dates
    """
    duration_regexp = re.compile('(.* - )?(.+) - ((.+)h)?((.+)min)?(.+)s')
    m = duration_regexp.match(text)
    if not m:
        raise BrokenPageError('Unable to parse date and duration')

    # Groups are captured with '.+', so a successful match does not guarantee
    # numeric content (nor a valid three-part date): convert defensively and
    # report any failure as a broken page rather than a bare ValueError.
    try:
        day, month, year = [int(s) for s in m.group(2).split('/')]
        date = datetime.datetime(year, month, day)
        duration = datetime.timedelta(hours=int(m.group(4) if m.group(4) is not None else 0),
                                      minutes=int(m.group(6) if m.group(6) is not None else 0),
                                      seconds=int(m.group(7)))
    except ValueError:
        raise BrokenPageError('Unable to parse date and duration')

    return date, duration
def create_id(self, id):
    """
    Build the video ID from the part of the page URL captured by URL_REGEXP.

    Abstract hook: this base implementation always raises, subclasses
    have to override it.

    :param id: string captured from the URL
    :raises NotImplementedError: always
    """
    raise NotImplementedError
def get_date_and_duration(self):
    """
    Return the ``(date, duration)`` pair of the video shown on the page.

    Abstract hook: this base implementation always raises, subclasses
    have to override it.

    :raises NotImplementedError: always
    """
    raise NotImplementedError
def get_title(self):
    """
    Return the title of the video shown on the page.

    Abstract hook: this base implementation always raises, subclasses
    have to override it.

    :raises NotImplementedError: always
    """
    raise NotImplementedError
def get_description(self):
    """
    Return the description of the video shown on the page.

    Abstract hook: this base implementation always raises, subclasses
    have to override it.

    :raises NotImplementedError: always
    """
    raise NotImplementedError
class VideoPage(BaseVideoPage):
    """
    Page of a video hosted on www.ina.fr.
    """

    # Raw string with escaped dots: only the literal 'www.ina.fr' host matches,
    # and '\.' is no longer an invalid escape sequence in a plain string.
    URL_REGEXP = re.compile(r'http://www\.ina\.fr/(.+)\.html')

    def create_id(self, id):
        """
        Build the video ID by prefixing the URL part with ``www.``.

        :param id: string captured from the page URL by URL_REGEXP
        :returns: unicode ID of the form ``www.<id>``
        """
        return u'www.%s' % id

    def _get_heading_block(self):
        """
        Return the element holding the <h2> title and the date/duration text.

        It is the second <div> child of the first <div> found in the first
        ``div.container-global-qr`` element of the document.
        """
        containers = self.parser.select(self.document.getroot(), 'div.container-global-qr')
        return containers[0].find('div').findall('div')[1]

    def get_date_and_duration(self):
        """
        Parse the date and the duration from the text following the <h2> title.

        :returns: whatever ``parse_date_and_duration`` returns for that text
        """
        heading = self._get_heading_block().find('h2')
        return self.parse_date_and_duration(heading.tail.strip())

    def get_title(self):
        """
        Return the stripped text of the <h2> title.
        """
        return self._get_heading_block().find('h2').text.strip()

    def get_description(self):
        """
        Return the stripped text of the first <p> found in the first <div>
        of the second ``div.container-global-qr`` element.
        """
        containers = self.parser.select(self.document.getroot(), 'div.container-global-qr')
        return containers[1].find('div').find('p').text.strip()
class BoutiqueVideoPage(BaseVideoPage):
URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html')
def create_id(self, id):
return u'boutique.%s' % id
def get_description(self): def get_description(self):
el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0] el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0]
if el is not None: if el is not None:
return el.text.strip() return el.text.strip()
def get_date_and_duration(self): def get_date_and_duration(self):
duration_regexp = re.compile('(.+) - ((.+)h)?((.+)min)?(.+)s')
el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0] el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0]
if el is not None: if el is not None:
m = duration_regexp.match(el.text.strip()) return self.parse_date_and_duration(el.text.strip())
if m:
day, month, year = [int(s) for s in m.group(1).split('/')]
date = datetime.datetime(year, month, day)
duration = datetime.timedelta(hours=int(m.group(3) if m.group(3) is not None else 0),
minutes=int(m.group(5) if m.group(5) is not None else 0),
seconds=int(m.group(6)))
return date, duration
else:
raise BrokenPageError('Unable to parse date and duration')
else: else:
raise BrokenPageError('Unable to find date and duration element') raise BrokenPageError('Unable to find date and duration element')
@ -85,8 +126,3 @@ class VideoPage(BasePage):
return unicode(el.text.strip()) return unicode(el.text.strip())
else: else:
return None return None
def get_url(self):
qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value'])
url = 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (qs['id_notice'][0], qs['token_notice'][0])
return url

View file

@ -27,4 +27,10 @@ __all__ = ['InaVideo']
class InaVideo(BaseVideo): class InaVideo(BaseVideo):
@classmethod @classmethod
def id2url(cls, _id): def id2url(cls, _id):
return 'http://boutique.ina.fr/video/%s.html' % _id if not '.' in _id:
return None
site, _id = _id.split('.', 1)
if site == 'boutique':
return 'http://boutique.ina.fr/video/%s.html' % _id
if site == 'www':
return 'http://www.ina.fr/%s.html' % _id