From 91f11bc9489db9387fb9e421c064d8113c0c9bf0 Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Mon, 24 Feb 2014 20:11:45 +0100 Subject: [PATCH] [ina] fix : site changed --- modules/ina/backend.py | 2 - modules/ina/browser.py | 8 +-- modules/ina/pages/search.py | 10 +-- modules/ina/pages/video.py | 118 +++++++++++------------------------- modules/ina/video.py | 8 +-- 5 files changed, 47 insertions(+), 99 deletions(-) diff --git a/modules/ina/backend.py b/modules/ina/backend.py index 7c0ff08f..8179f1e3 100644 --- a/modules/ina/backend.py +++ b/modules/ina/backend.py @@ -18,8 +18,6 @@ # along with weboob. If not, see . - - from weboob.capabilities.video import ICapVideo from weboob.tools.backend import BaseBackend diff --git a/modules/ina/browser.py b/modules/ina/browser.py index 83a22b56..38c491c5 100644 --- a/modules/ina/browser.py +++ b/modules/ina/browser.py @@ -21,7 +21,7 @@ from weboob.tools.browser import BaseBrowser from weboob.tools.browser.decorators import id2url -from .pages.video import VideoPage, BoutiqueVideoPage +from .pages.video import VideoPage from .pages.search import SearchPage from .video import InaVideo @@ -31,14 +31,14 @@ __all__ = ['InaBrowser'] class InaBrowser(BaseBrowser): DOMAIN = 'ina.fr' - PAGES = {'http://boutique\.ina\.fr/(video|audio)/.+\.html': BoutiqueVideoPage, - 'http://www\.ina\.fr/.+\.html': VideoPage, + PAGES = {'http://player.ina.fr/notices/.+\.mrss': (VideoPage, 'xml'), 'http://boutique\.ina\.fr/recherche/.+': SearchPage, - } + } @id2url(InaVideo.id2url) def get_video(self, url, video=None): self.location(url) + #assert self.is_on_page(VideoPage) return self.page.get_video(video) def search_videos(self, pattern): diff --git a/modules/ina/pages/search.py b/modules/ina/pages/search.py index cec76bcb..06f46115 100644 --- a/modules/ina/pages/search.py +++ b/modules/ina/pages/search.py @@ -31,7 +31,7 @@ __all__ = ['SearchPage'] class SearchPage(BasePage): - URL_REGEXP = re.compile(r'/video/(.+)\.html') + URL_REGEXP = re.compile(r'/(.+)/(.+)\.jpeg') def iter_videos(self): try: @@ -40,12 +40,12 @@ class SearchPage(BasePage): # It means there are no results. return for li in ul.findall('li'): - id = re.sub(self.URL_REGEXP, r'\1', li.find('a').attrib['href']) + url = li.find('a').find('img').attrib['src'] - video = InaVideo('boutique.%s' % id) + id = re.sub(self.URL_REGEXP, r'\2', url) + video = InaVideo(id) - url = u'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src'] - video.thumbnail = BaseImage(url) + video.thumbnail = BaseImage(u'http://boutique.ina.fr%s' % url) video.thumbnail.url = video.thumbnail.id video.title = unicode(self.parser.select(li, 'p.titre', 1).text) diff --git a/modules/ina/pages/video.py b/modules/ina/pages/video.py index 0f40e496..5c73b59c 100644 --- a/modules/ina/pages/video.py +++ b/modules/ina/pages/video.py @@ -18,83 +18,65 @@ # along with weboob. If not, see . -import datetime +from datetime import datetime import re -from urlparse import parse_qs from weboob.capabilities import NotAvailable -from weboob.tools.browser import BasePage, BrokenPageError +from weboob.capabilities.image import BaseImage +from weboob.tools.browser import BasePage from ..video import InaVideo - -__all__ = ['VideoPage', 'BoutiqueVideoPage'] +__all__ = ['VideoPage'] -class BaseVideoPage(BasePage): - def get_video(self, video): - date, duration = self.get_date_and_duration() - if not video: - video = InaVideo(self.get_id()) - - video.title = self.get_title() - video.url = self.get_url() - video.date = date - video.duration = duration - video.description = self.get_description() - - video.set_empty_fields(NotAvailable) - return video +class VideoPage(BasePage): + URL_REGEXP = re.compile('http://player.ina.fr/notices/(.+)\.mrss') def get_id(self): m = self.URL_REGEXP.match(self.url) if m: - return self.create_id(m.group(1)) + return m.group(1) self.logger.warning('Unable to parse ID') return 0 - def get_url(self): - qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value']) - s = self.browser.readurl('http://www.ina.fr/player/infovideo/id_notice/%s/module_request/%s' % (qs['id_notice'][0], qs['module'][0])) - s = s[s.find('')+7:s.find('')] - return u'%s/id_chaine/%s/module_request/%s/pkey/%s' % \ - (s, qs['id_chaine'][0], qs['module'][0], qs['pkey'][0]) + def get_video(self, video): + if not video: + video = InaVideo(self.get_id()) - def parse_date_and_duration(self, text): - duration_regexp = re.compile('(.* - )?(.+) - ((.+)h)?((.+)min)?(.+)s') - m = duration_regexp.match(text) - if m: - day, month, year = [abs(int(s)) for s in m.group(2).split('/')] - date = datetime.datetime(year, month, day) - duration = datetime.timedelta(hours=int(m.group(4) if m.group(4) is not None else 0), - minutes=int(m.group(6) if m.group(6) is not None else 0), - seconds=int(m.group(7))) - return date, duration - else: - raise BrokenPageError('Unable to parse date and duration') + video.title = u'%s' % self.parser.select(self.document.getroot(), + '//rss/channel/item/title', + 1, + method='xpath').text - def create_id(self, id): - raise NotImplementedError() + _image = u'%s' % self.parser.select(self.document.getroot(), + '//rss/channel/item/media:content/media:thumbnail', + 1, + method='xpath', + namespaces={'media': 'http://search.yahoo.com/mrss/'}).attrib['url'] + video.thumbnail = BaseImage(_image) + video.thumbnail.url = video.thumbnail.id - def get_date_and_duration(self): - raise NotImplementedError() + video.url = u'%s' % self.parser.select(self.document.getroot(), + '//rss/channel/item/media:content', + 1, + method='xpath', + namespaces={'media': 'http://search.yahoo.com/mrss/'}).attrib['url'] - def get_title(self): - raise NotImplementedError() - - def get_description(self): - raise NotImplementedError() + _date = self.parser.select(self.document.getroot(), + '//rss/channel/item/pubDate', + 1, + method='xpath').text + video.date = datetime.strptime(_date[:-6], '%a, %d %b %Y %H:%M:%S') -class VideoPage(BaseVideoPage): - URL_REGEXP = re.compile('http://www.ina.fr/(.+)\.html') + video.description = u'%s' % self.parser.select(self.document.getroot(), + '//rss/channel/item/description', + 1, + method='xpath').text - def create_id(self, id): - return u'www.%s' % id - - def get_date_and_duration(self): - qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1] - return self.parse_date_and_duration(qr.find('h2').tail.strip()) + video.set_empty_fields(NotAvailable) + return video def get_title(self): qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0] @@ -104,29 +86,3 @@ class VideoPage(BaseVideoPage): desc = self.parser.select(self.document.getroot(), 'div.container-global-qr')[1].find('div').find('p') if desc: return unicode(desc.text.strip()) - - -class BoutiqueVideoPage(BaseVideoPage): - URL_REGEXP = re.compile('http://boutique.ina.fr/(audio|video)/(.+).html') - - def create_id(self, id): - return u'boutique.%s' % id - - def get_description(self): - el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0] - if el is not None: - return unicode(el.text.strip()) - - def get_date_and_duration(self): - el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0] - if el is not None: - return self.parse_date_and_duration(el.text.strip()) - else: - raise BrokenPageError('Unable to find date and duration element') - - def get_title(self): - el = self.document.getroot().cssselect('div.bloc-produit-haut h1')[0] - if el is not None: - return unicode(el.text.strip()) - else: - return None diff --git a/modules/ina/video.py b/modules/ina/video.py index 8e000837..7f2ece8d 100644 --- a/modules/ina/video.py +++ b/modules/ina/video.py @@ -27,10 +27,4 @@ __all__ = ['InaVideo'] class InaVideo(BaseVideo): @classmethod def id2url(cls, _id): - if not '.' in _id: - return None - site, _id = _id.split('.', 1) - if site == 'boutique': - return 'http://boutique.ina.fr/video/%s.html' % _id - if site == 'www': - return 'http://www.ina.fr/%s.html' % _id + return "http://player.ina.fr/notices/%s.mrss" % _id