support videos on www.ina.fr (in addition to boutique.ina.fr)

2011-08-12 15:39:33 +02:00 · 2011-08-12 15:39:33 +02:00 · 30026290c8
commit 30026290c8
parent f745ae2b8f
4 changed files with 74 additions and 31 deletions
--- a/weboob/backends/ina/browser.py
+++ b/weboob/backends/ina/browser.py
@@ -21,7 +21,7 @@
 from weboob.tools.browser import BaseBrowser
 from weboob.tools.browser.decorators import id2url

-from .pages.video import VideoPage
+from .pages.video import VideoPage, BoutiqueVideoPage
 from .pages.search import SearchPage
 from .video import InaVideo

@@ -30,8 +30,9 @@ __all__ = ['InaBrowser']


 class InaBrowser(BaseBrowser):
-    DOMAIN = 'boutique.ina.fr'
-    PAGES = {'http://boutique\.ina\.fr/video/.+\.html': VideoPage,
+    DOMAIN = 'ina.fr'
+    PAGES = {'http://boutique\.ina\.fr/video/.+\.html': BoutiqueVideoPage,
+             'http://www\.ina\.fr/.+\.html': VideoPage,
             'http://boutique\.ina\.fr/recherche/.+': SearchPage,
            }

@@ -41,6 +42,6 @@ class InaBrowser(BaseBrowser):
        return self.page.get_video(video)

    def iter_search_results(self, pattern):
-        self.location(self.buildurl('/recherche/recherche', search=pattern.encode('utf-8')))
+        self.location(self.buildurl('http://boutique.ina.fr/recherche/recherche', search=pattern.encode('utf-8')))
        assert self.is_on_page(SearchPage)
        return self.page.iter_videos()
--- a/weboob/backends/ina/pages/search.py
+++ b/weboob/backends/ina/pages/search.py
@@ -31,7 +31,7 @@ __all__ = ['SearchPage']


 class SearchPage(BasePage):
-    URL_REGEXP = re.compile('/video/(.+).html')
+    URL_REGEXP = re.compile(r'/video/(.+)\.html')

    def iter_videos(self):
        try:
@@ -40,7 +40,7 @@ class SearchPage(BasePage):
            # It means there are no results.
            return
        for li in ul.findall('li'):
-            id = re.sub(r'/video/(.+)\.html', r'\1', li.find('a').attrib['href'])
+            id = re.sub(self.URL_REGEXP, r'\1', li.find('a').attrib['href'])

            thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src']

@@ -57,7 +57,7 @@ class SearchPage(BasePage):
            else:
                raise BrokenPageError('Unable to match duration (%r)' % duration)

-            yield InaVideo(id,
+            yield InaVideo('boutique.%s' % id,
                           title=title,
                           date=date,
                           duration=duration,
--- a/weboob/backends/ina/pages/video.py
+++ b/weboob/backends/ina/pages/video.py
@@ -19,7 +19,6 @@


 import datetime
-from logging import warning
 import re
 try:
    from urlparse import parse_qs
@@ -32,12 +31,10 @@ from weboob.tools.browser import BrokenPageError
 from ..video import InaVideo


-__all__ = ['VideoPage']
+__all__ = ['VideoPage', 'BoutiqueVideoPage']


-class VideoPage(BasePage):
-    URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html')
-
+class BaseVideoPage(BasePage):
    def get_video(self, video):
        date, duration = self.get_date_and_duration()
        if not video:
@@ -53,29 +50,73 @@ class VideoPage(BasePage):
    def get_id(self):
        m = self.URL_REGEXP.match(self.url)
        if m:
-            return unicode(m.group(1))
-        warning('Unable to parse ID')
+            return self.create_id(m.group(1))
+        self.logger.warning('Unable to parse ID')
        return 0

+    def get_url(self):
+        qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value'])
+        url = 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (qs['id_notice'][0], qs['token_notice'][0])
+        return url
+
+    def parse_date_and_duration(self, text):
+        duration_regexp = re.compile('(.* - )?(.+) - ((.+)h)?((.+)min)?(.+)s')
+        m = duration_regexp.match(text)
+        if m:
+            day, month, year = [int(s) for s in m.group(2).split('/')]
+            date = datetime.datetime(year, month, day)
+            duration = datetime.timedelta(hours=int(m.group(4) if m.group(4) is not None else 0),
+                                          minutes=int(m.group(6) if m.group(6) is not None else 0),
+                                          seconds=int(m.group(7)))
+            return date, duration
+        else:
+            raise BrokenPageError('Unable to parse date and duration')
+
+    def create_id(self, id):
+        raise NotImplementedError()
+
+    def get_date_and_duration(self):
+        raise NotImplementedError()
+
+    def get_title(self):
+        raise NotImplementedError()
+
+    def get_description(self):
+        raise NotImplementedError()
+
+class VideoPage(BaseVideoPage):
+    URL_REGEXP = re.compile('http://www.ina.fr/(.+)\.html')
+
+    def create_id(self, id):
+        return u'www.%s' % id
+
+    def get_date_and_duration(self):
+        qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1]
+        return self.parse_date_and_duration(qr.find('h2').tail.strip())
+
+    def get_title(self):
+        qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1]
+        return qr.find('h2').text.strip()
+
+    def get_description(self):
+        return self.parser.select(self.document.getroot(), 'div.container-global-qr')[1].find('div').find('p').text.strip()
+
+
+class BoutiqueVideoPage(BaseVideoPage):
+    URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html')
+
+    def create_id(self, id):
+        return u'boutique.%s' % id
+
    def get_description(self):
        el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0]
        if el is not None:
            return el.text.strip()

    def get_date_and_duration(self):
-        duration_regexp = re.compile('(.+) - ((.+)h)?((.+)min)?(.+)s')
        el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0]
        if el is not None:
-            m = duration_regexp.match(el.text.strip())
-            if m:
-                day, month, year = [int(s) for s in m.group(1).split('/')]
-                date = datetime.datetime(year, month, day)
-                duration = datetime.timedelta(hours=int(m.group(3) if m.group(3) is not None else 0),
-                                              minutes=int(m.group(5) if m.group(5) is not None else 0),
-                                              seconds=int(m.group(6)))
-                return date, duration
-            else:
-                raise BrokenPageError('Unable to parse date and duration')
+            return self.parse_date_and_duration(el.text.strip())
        else:
            raise BrokenPageError('Unable to find date and duration element')

@@ -85,8 +126,3 @@ class VideoPage(BasePage):
            return unicode(el.text.strip())
        else:
            return None
-
-    def get_url(self):
-        qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value'])
-        url = 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (qs['id_notice'][0], qs['token_notice'][0])
-        return url
--- a/weboob/backends/ina/video.py
+++ b/weboob/backends/ina/video.py
@@ -27,4 +27,10 @@ __all__ = ['InaVideo']
 class InaVideo(BaseVideo):
    @classmethod
    def id2url(cls, _id):
-        return 'http://boutique.ina.fr/video/%s.html' % _id
+        if not '.' in _id:
+            return None
+        site, _id = _id.split('.', 1)
+        if site == 'boutique':
+            return 'http://boutique.ina.fr/video/%s.html' % _id
+        if site == 'www':
+            return 'http://www.ina.fr/%s.html' % _id