From 30026290c87739e33e3730b4e3578c348f4edc28 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Fri, 12 Aug 2011 15:39:33 +0200 Subject: [PATCH] support videos on www.ina.fr (in addition to boutique.ina.fr) --- weboob/backends/ina/browser.py | 9 ++-- weboob/backends/ina/pages/search.py | 6 +-- weboob/backends/ina/pages/video.py | 82 +++++++++++++++++++++-------- weboob/backends/ina/video.py | 8 ++- 4 files changed, 74 insertions(+), 31 deletions(-) diff --git a/weboob/backends/ina/browser.py b/weboob/backends/ina/browser.py index 0b50c20f..a566cb41 100644 --- a/weboob/backends/ina/browser.py +++ b/weboob/backends/ina/browser.py @@ -21,7 +21,7 @@ from weboob.tools.browser import BaseBrowser from weboob.tools.browser.decorators import id2url -from .pages.video import VideoPage +from .pages.video import VideoPage, BoutiqueVideoPage from .pages.search import SearchPage from .video import InaVideo @@ -30,8 +30,9 @@ __all__ = ['InaBrowser'] class InaBrowser(BaseBrowser): - DOMAIN = 'boutique.ina.fr' - PAGES = {'http://boutique\.ina\.fr/video/.+\.html': VideoPage, + DOMAIN = 'ina.fr' + PAGES = {'http://boutique\.ina\.fr/video/.+\.html': BoutiqueVideoPage, + 'http://www\.ina\.fr/.+\.html': VideoPage, 'http://boutique\.ina\.fr/recherche/.+': SearchPage, } @@ -41,6 +42,6 @@ class InaBrowser(BaseBrowser): return self.page.get_video(video) def iter_search_results(self, pattern): - self.location(self.buildurl('/recherche/recherche', search=pattern.encode('utf-8'))) + self.location(self.buildurl('http://boutique.ina.fr/recherche/recherche', search=pattern.encode('utf-8'))) assert self.is_on_page(SearchPage) return self.page.iter_videos() diff --git a/weboob/backends/ina/pages/search.py b/weboob/backends/ina/pages/search.py index 940dc533..2479c316 100644 --- a/weboob/backends/ina/pages/search.py +++ b/weboob/backends/ina/pages/search.py @@ -31,7 +31,7 @@ __all__ = ['SearchPage'] class SearchPage(BasePage): - URL_REGEXP = re.compile('/video/(.+).html') + URL_REGEXP = re.compile(r'/video/(.+)\.html') def iter_videos(self): try: @@ -40,7 +40,7 @@ class SearchPage(BasePage): # It means there are no results. return for li in ul.findall('li'): - id = re.sub(r'/video/(.+)\.html', r'\1', li.find('a').attrib['href']) + id = re.sub(self.URL_REGEXP, r'\1', li.find('a').attrib['href']) thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src'] @@ -57,7 +57,7 @@ class SearchPage(BasePage): else: raise BrokenPageError('Unable to match duration (%r)' % duration) - yield InaVideo(id, + yield InaVideo('boutique.%s' % id, title=title, date=date, duration=duration, diff --git a/weboob/backends/ina/pages/video.py b/weboob/backends/ina/pages/video.py index 069894e4..c6e8f7cc 100644 --- a/weboob/backends/ina/pages/video.py +++ b/weboob/backends/ina/pages/video.py @@ -19,7 +19,6 @@ import datetime -from logging import warning import re try: from urlparse import parse_qs @@ -32,12 +31,10 @@ from weboob.tools.browser import BrokenPageError from ..video import InaVideo -__all__ = ['VideoPage'] +__all__ = ['VideoPage', 'BoutiqueVideoPage'] -class VideoPage(BasePage): - URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html') - +class BaseVideoPage(BasePage): def get_video(self, video): date, duration = self.get_date_and_duration() if not video: @@ -53,29 +50,73 @@ class VideoPage(BasePage): def get_id(self): m = self.URL_REGEXP.match(self.url) if m: - return unicode(m.group(1)) - warning('Unable to parse ID') + return self.create_id(m.group(1)) + self.logger.warning('Unable to parse ID') return 0 + def get_url(self): + qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value']) + url = 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (qs['id_notice'][0], qs['token_notice'][0]) + return url + + def parse_date_and_duration(self, text): + duration_regexp = re.compile('(.* - )?(.+) - ((.+)h)?((.+)min)?(.+)s') + m = duration_regexp.match(text) + if m: + day, month, year = [int(s) for s in m.group(2).split('/')] + date = datetime.datetime(year, month, day) + duration = datetime.timedelta(hours=int(m.group(4) if m.group(4) is not None else 0), + minutes=int(m.group(6) if m.group(6) is not None else 0), + seconds=int(m.group(7))) + return date, duration + else: + raise BrokenPageError('Unable to parse date and duration') + + def create_id(self, id): + raise NotImplementedError() + + def get_date_and_duration(self): + raise NotImplementedError() + + def get_title(self): + raise NotImplementedError() + + def get_description(self): + raise NotImplementedError() + +class VideoPage(BaseVideoPage): + URL_REGEXP = re.compile('http://www.ina.fr/(.+)\.html') + + def create_id(self, id): + return u'www.%s' % id + + def get_date_and_duration(self): + qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1] + return self.parse_date_and_duration(qr.find('h2').tail.strip()) + + def get_title(self): + qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1] + return qr.find('h2').text.strip() + + def get_description(self): + return self.parser.select(self.document.getroot(), 'div.container-global-qr')[1].find('div').find('p').text.strip() + + +class BoutiqueVideoPage(BaseVideoPage): + URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html') + + def create_id(self, id): + return u'boutique.%s' % id + def get_description(self): el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0] if el is not None: return el.text.strip() def get_date_and_duration(self): - duration_regexp = re.compile('(.+) - ((.+)h)?((.+)min)?(.+)s') el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0] if el is not None: - m = duration_regexp.match(el.text.strip()) - if m: - day, month, year = [int(s) for s in m.group(1).split('/')] - date = datetime.datetime(year, month, day) - duration = datetime.timedelta(hours=int(m.group(3) if m.group(3) is not None else 0), - minutes=int(m.group(5) if m.group(5) is not None else 0), - seconds=int(m.group(6))) - return date, duration - else: - raise BrokenPageError('Unable to parse date and duration') + return self.parse_date_and_duration(el.text.strip()) else: raise BrokenPageError('Unable to find date and duration element') @@ -85,8 +126,3 @@ class VideoPage(BasePage): return unicode(el.text.strip()) else: return None - - def get_url(self): - qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value']) - url = 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (qs['id_notice'][0], qs['token_notice'][0]) - return url diff --git a/weboob/backends/ina/video.py b/weboob/backends/ina/video.py index f8c3bce4..8e000837 100644 --- a/weboob/backends/ina/video.py +++ b/weboob/backends/ina/video.py @@ -27,4 +27,10 @@ __all__ = ['InaVideo'] class InaVideo(BaseVideo): @classmethod def id2url(cls, _id): - return 'http://boutique.ina.fr/video/%s.html' % _id + if not '.' in _id: + return None + site, _id = _id.split('.', 1) + if site == 'boutique': + return 'http://boutique.ina.fr/video/%s.html' % _id + if site == 'www': + return 'http://www.ina.fr/%s.html' % _id