From 30026290c87739e33e3730b4e3578c348f4edc28 Mon Sep 17 00:00:00 2001
From: Romain Bignon <romain@peerfuse.org>
Date: Fri, 12 Aug 2011 15:39:33 +0200
Subject: [PATCH] support videos on www.ina.fr (in addition to boutique.ina.fr)

---
 weboob/backends/ina/browser.py      |  9 ++--
 weboob/backends/ina/pages/search.py |  6 +--
 weboob/backends/ina/pages/video.py  | 82 +++++++++++++++++++++--------
 weboob/backends/ina/video.py        |  8 ++-
 4 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/weboob/backends/ina/browser.py b/weboob/backends/ina/browser.py
index 0b50c20f..a566cb41 100644
--- a/weboob/backends/ina/browser.py
+++ b/weboob/backends/ina/browser.py
@@ -21,7 +21,7 @@
 from weboob.tools.browser import BaseBrowser
 from weboob.tools.browser.decorators import id2url
 
-from .pages.video import VideoPage
+from .pages.video import VideoPage, BoutiqueVideoPage
 from .pages.search import SearchPage
 from .video import InaVideo
 
@@ -30,8 +30,9 @@ __all__ = ['InaBrowser']
 
 
 class InaBrowser(BaseBrowser):
-    DOMAIN = 'boutique.ina.fr'
-    PAGES = {'http://boutique\.ina\.fr/video/.+\.html': VideoPage,
+    DOMAIN = 'ina.fr'
+    PAGES = {'http://boutique\.ina\.fr/video/.+\.html': BoutiqueVideoPage,
+             'http://www\.ina\.fr/.+\.html': VideoPage,
              'http://boutique\.ina\.fr/recherche/.+': SearchPage,
             }
 
@@ -41,6 +42,6 @@ class InaBrowser(BaseBrowser):
         return self.page.get_video(video)
 
     def iter_search_results(self, pattern):
-        self.location(self.buildurl('/recherche/recherche', search=pattern.encode('utf-8')))
+        self.location(self.buildurl('http://boutique.ina.fr/recherche/recherche', search=pattern.encode('utf-8')))
         assert self.is_on_page(SearchPage)
         return self.page.iter_videos()
diff --git a/weboob/backends/ina/pages/search.py b/weboob/backends/ina/pages/search.py
index 940dc533..2479c316 100644
--- a/weboob/backends/ina/pages/search.py
+++ b/weboob/backends/ina/pages/search.py
@@ -31,7 +31,7 @@ __all__ = ['SearchPage']
 
 
 class SearchPage(BasePage):
-    URL_REGEXP = re.compile('/video/(.+).html')
+    URL_REGEXP = re.compile(r'/video/(.+)\.html')
 
     def iter_videos(self):
         try:
@@ -40,7 +40,7 @@ class SearchPage(BasePage):
             # It means there are no results.
             return
         for li in ul.findall('li'):
-            id = re.sub(r'/video/(.+)\.html', r'\1', li.find('a').attrib['href'])
+            id = re.sub(self.URL_REGEXP, r'\1', li.find('a').attrib['href'])
 
             thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src']
 
@@ -57,7 +57,7 @@ class SearchPage(BasePage):
             else:
                 raise BrokenPageError('Unable to match duration (%r)' % duration)
 
-            yield InaVideo(id,
+            yield InaVideo('boutique.%s' % id,
                            title=title,
                            date=date,
                            duration=duration,
diff --git a/weboob/backends/ina/pages/video.py b/weboob/backends/ina/pages/video.py
index 069894e4..c6e8f7cc 100644
--- a/weboob/backends/ina/pages/video.py
+++ b/weboob/backends/ina/pages/video.py
@@ -19,7 +19,6 @@
 
 
 import datetime
-from logging import warning
 import re
 try:
     from urlparse import parse_qs
@@ -32,12 +31,10 @@ from weboob.tools.browser import BrokenPageError
 from ..video import InaVideo
 
 
-__all__ = ['VideoPage']
+__all__ = ['VideoPage', 'BoutiqueVideoPage']
 
 
-class VideoPage(BasePage):
-    URL_REGEXP = re.compile('http://boutique.ina.fr/video/(.+).html')
-
+class BaseVideoPage(BasePage):
     def get_video(self, video):
         date, duration = self.get_date_and_duration()
         if not video:
@@ -53,29 +50,73 @@ class VideoPage(BasePage):
     def get_id(self):
         m = self.URL_REGEXP.match(self.url)
         if m:
-            return unicode(m.group(1))
-        warning('Unable to parse ID')
+            return self.create_id(m.group(1))
+        self.logger.warning('Unable to parse ID')
         return 0
 
+    def get_url(self):
+        # The 'flashvars' <param> value is a query string carrying id_notice and token_notice.
+        flashvars = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value'])
+        return 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (flashvars['id_notice'][0], flashvars['token_notice'][0])
+
+    def parse_date_and_duration(self, text):
+        """Parse '[<prefix> - ]DD/MM/YYYY - [<H>h][<M>min]<S>s' into (datetime, timedelta).
+        Raises BrokenPageError when text does not match that shape."""
+        found = re.match('(.* - )?(.+) - ((.+)h)?((.+)min)?(.+)s', text)
+        if not found:
+            raise BrokenPageError('Unable to parse date and duration')
+        day, month, year = [int(part) for part in found.group(2).split('/')]
+        hours, minutes, seconds = found.group(4), found.group(6), found.group(7)
+        when = datetime.datetime(year, month, day)
+        length = datetime.timedelta(hours=int(hours or 0), minutes=int(minutes or 0),
+                                    seconds=int(seconds))
+        return when, length
+
+    def create_id(self, id):  # abstract: get_id() passes the URL_REGEXP capture; return the video ID
+        raise NotImplementedError()
+
+    def get_date_and_duration(self):  # abstract: get_video() unpacks the result as (date, duration)
+        raise NotImplementedError()
+
+    def get_title(self):  # abstract: subclasses must override
+        raise NotImplementedError()
+
+    def get_description(self):  # abstract: subclasses must override
+        raise NotImplementedError()
+
+class VideoPage(BaseVideoPage):  # pages matched by URL_REGEXP on www.ina.fr
+    URL_REGEXP = re.compile(r'http://www\.ina\.fr/(.+)\.html')
+
+    def create_id(self, id):  # IDs carry a 'www.' site prefix
+        return u'www.%s' % id
+
+    def get_date_and_duration(self):  # the text following the <h2> holds date and duration
+        qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1]
+        return self.parse_date_and_duration(qr.find('h2').tail.strip())
+
+    def get_title(self):
+        qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1]
+        return qr.find('h2').text.strip()
+
+    def get_description(self):
+        return self.parser.select(self.document.getroot(), 'div.container-global-qr')[1].find('div').find('p').text.strip()
+
+
+class BoutiqueVideoPage(BaseVideoPage):
+    URL_REGEXP = re.compile(r'http://boutique\.ina\.fr/video/(.+)\.html')
+
+    def create_id(self, id):
+        return u'boutique.%s' % id
+
     def get_description(self):
         el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0]
         if el is not None:
             return el.text.strip()
 
     def get_date_and_duration(self):
-        duration_regexp = re.compile('(.+) - ((.+)h)?((.+)min)?(.+)s')
         el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0]
         if el is not None:
-            m = duration_regexp.match(el.text.strip())
-            if m:
-                day, month, year = [int(s) for s in m.group(1).split('/')]
-                date = datetime.datetime(year, month, day)
-                duration = datetime.timedelta(hours=int(m.group(3) if m.group(3) is not None else 0),
-                                              minutes=int(m.group(5) if m.group(5) is not None else 0),
-                                              seconds=int(m.group(6)))
-                return date, duration
-            else:
-                raise BrokenPageError('Unable to parse date and duration')
+            return self.parse_date_and_duration(el.text.strip())
         else:
             raise BrokenPageError('Unable to find date and duration element')
 
@@ -85,8 +126,3 @@ class VideoPage(BasePage):
             return unicode(el.text.strip())
         else:
             return None
-
-    def get_url(self):
-        qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value'])
-        url = 'http://mp4.ina.fr/lecture/lire/id_notice/%s/token_notice/%s' % (qs['id_notice'][0], qs['token_notice'][0])
-        return url
diff --git a/weboob/backends/ina/video.py b/weboob/backends/ina/video.py
index f8c3bce4..8e000837 100644
--- a/weboob/backends/ina/video.py
+++ b/weboob/backends/ina/video.py
@@ -27,4 +27,10 @@ __all__ = ['InaVideo']
 class InaVideo(BaseVideo):
     @classmethod
     def id2url(cls, _id):
-        return 'http://boutique.ina.fr/video/%s.html' % _id
+        # IDs look like '<site>.<path>'; a missing or unknown site yields None.
+        if '.' not in _id:
+            return None
+        site, path = _id.split('.', 1)
+        templates = {'boutique': 'http://boutique.ina.fr/video/%s.html',
+                     'www': 'http://www.ina.fr/%s.html'}
+        return templates[site] % path if site in templates else None