From 62fc2a87c79fda8e16ed07348658ddb5970ad0cc Mon Sep 17 00:00:00 2001
From: Christophe Benz <christophe.benz@gmail.com>
Date: Mon, 12 Jul 2010 03:11:54 +0200
Subject: [PATCH] handle required fields and forbidden videos

---
 weboob/backends/youtube/backend.py        | 48 ++++++++++---
 weboob/backends/youtube/browser.py        | 11 ++-
 weboob/backends/youtube/pages.py          | 82 +++++++++++++++++++++++
 weboob/backends/youtube/pages/__init__.py | 18 -----
 weboob/backends/youtube/pages/video.py    | 64 ------------------
 5 files changed, 128 insertions(+), 95 deletions(-)
 create mode 100644 weboob/backends/youtube/pages.py
 delete mode 100644 weboob/backends/youtube/pages/__init__.py
 delete mode 100644 weboob/backends/youtube/pages/video.py

diff --git a/weboob/backends/youtube/backend.py b/weboob/backends/youtube/backend.py
index 597f3af0..10140d8c 100644
--- a/weboob/backends/youtube/backend.py
+++ b/weboob/backends/youtube/backend.py
@@ -21,8 +21,10 @@ import logging
 
 from weboob.capabilities.video import ICapVideo
 from weboob.tools.backend import BaseBackend
+from weboob.tools.misc import iter_fields
 
 from .browser import YoutubeBrowser
+from .pages import ForbiddenVideo
 from .video import YoutubeVideo
 
 
@@ -37,13 +39,25 @@ class YoutubeBackend(BaseBackend, ICapVideo):
     DESCRIPTION = 'Youtube videos website'
     LICENSE = 'GPLv3'
 
-    CONFIG = {}
     BROWSER = YoutubeBrowser
 
-    def get_video(self, _id):
-        return self.browser.get_video(_id)
+    def get_video(self, _id, video=None):
+        """Fetch video `_id`; if `video` is given, fill it in place and return it."""
+        try:
+            browser_video = self.browser.get_video(_id)
+        except ForbiddenVideo:
+            if video is not None:
+                raise  # when completing `video`, let the caller handle it
+            return None
+        if video is None:
+            return browser_video
+        # the browser returns None when the loaded page has no `video`: nothing to copy
+        for k, v in (iter_fields(browser_video) if browser_video is not None else ()):
+            if v and getattr(video, k) != v:
+                setattr(video, k, v)
+        return video
 
-    def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
+    def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
         import gdata.youtube.service
         yt_service = gdata.youtube.service.YouTubeService()
         query = gdata.youtube.service.YouTubeVideoQuery()
@@ -57,12 +71,26 @@ class YoutubeBackend(BaseBackend, ICapVideo):
                 author = entry.media.name.text.decode('utf-8').strip()
             else:
                 author = None
-            yield YoutubeVideo(entry.id.text.split('/')[-1].decode('utf-8'),
-                               title=entry.media.title.text.decode('utf-8').strip(),
-                               author=author,
-                               duration=datetime.timedelta(seconds=entry.media.duration.seconds.decode('utf-8').strip()),
-                               thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(),
-                               )
+            video = YoutubeVideo(entry.id.text.split('/')[-1].decode('utf-8'),
+                                 title=entry.media.title.text.decode('utf-8').strip(),
+                                 author=author,
+                                 duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())),
+                                 thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(),
+                                 )
+            if required_fields is not None:
+                missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
+                if missing_required_fields:
+                    logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
+                    try:
+                        self.get_video(video.id, video=video)
+                    except ForbiddenVideo, e:
+                        logging.debug(e)
+                        continue
+                    else:
+                        missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
+                        if missing_required_fields:
+                            raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
+            yield video
 
     def iter_page_urls(self, mozaic_url):
         raise NotImplementedError()
diff --git a/weboob/backends/youtube/browser.py b/weboob/backends/youtube/browser.py
index f456bed2..c84eb9c6 100644
--- a/weboob/backends/youtube/browser.py
+++ b/weboob/backends/youtube/browser.py
@@ -19,7 +19,7 @@
 from weboob.tools.browser import BaseBrowser
 from weboob.tools.browser.decorators import id2url
 
-from .pages import VideoPage
+from .pages import ForbiddenVideoPage, VerifyAgePage, VideoPage
 from .video import YoutubeVideo
 
 
@@ -28,10 +28,15 @@ __all__ = ['YoutubeBrowser']
 
 class YoutubeBrowser(BaseBrowser):
     DOMAIN = u'youtube.com'
-    PAGES = {'.*youtube\.com/watch\?v=(.+)': VideoPage,
+    PAGES = {'.*youtube\.com/watch\?v=(?P<id>.+)': VideoPage,
+             '.*youtube\.com/index\?ytsession=.+': ForbiddenVideoPage,
+             '.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage,
             }
 
     @id2url(YoutubeVideo.id2url)
     def get_video(self, url):
         self.location(url)
-        return self.page.video
+        # Not every page object carries a `video` attribute; when the
+        # loaded page has none, report None to the caller instead of
+        # failing with AttributeError.
+        return getattr(self.page, 'video', None)
diff --git a/weboob/backends/youtube/pages.py b/weboob/backends/youtube/pages.py
new file mode 100644
index 00000000..bace372e
--- /dev/null
+++ b/weboob/backends/youtube/pages.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010  Christophe Benz, Romain Bignon
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
+import re
+
+from weboob.tools.browser import BasePage, ExpectedElementNotFound
+
+from .video import YoutubeVideo
+
+
+__all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage']
+
+
+class ForbiddenVideo(Exception):
+    """Raised by ForbiddenVideoPage and VerifyAgePage when they are loaded."""
+
+
+class ForbiddenVideoPage(BasePage):
+    """Page carrying a `.yt-alert-content` message; loading it raises ForbiddenVideo."""
+    def on_loaded(self):
+        selector = '.yt-alert-content'
+        matches = self.document.getroot().cssselect(selector)
+        if not matches:
+            raise ExpectedElementNotFound(selector)
+        raise ForbiddenVideo((matches[0].text or u'').strip())  # .text may be None
+
+
+class VerifyAgePage(BasePage):  # age-verification pages are not handled yet
+    def on_loaded(self):  # always raises: such videos are reported as forbidden
+        raise ForbiddenVideo('verify age not implemented')
+
+
+class VideoPage(BasePage):
+    """Watch page; on load, builds `self.video` from the parsed document."""
+
+    # Captures the value of the `&t=` token found in the page's scripts.
+    VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)')
+
+    def on_loaded(self):
+        _id = self.group_dict['id']
+        self.video = YoutubeVideo(_id,
+                                  title=self.get_title(),
+                                  url=self.get_url(_id),
+                                  author=self.get_author(),
+                                  )
+
+    def _select_first(self, selector):
+        """Return the first element matching `selector` or raise ExpectedElementNotFound."""
+        matches = self.document.getroot().cssselect(selector)
+        if not matches:
+            raise ExpectedElementNotFound(selector)
+        return matches[0]
+
+    def get_author(self):
+        # .text is None when the node has no leading text; fall back to u''
+        return (self._select_first('a.watch-description-username strong').text or u'').strip()
+
+    def get_title(self):
+        return unicode(self._select_first('meta[name=title]').attrib['content']).strip()
+
+    def get_url(self, _id):
+        """Build the download URL with the last `&t=` token seen (None if no script has one)."""
+        video_signature = None
+        for data in self.document.getiterator('script'):
+            for m in self.VIDEO_SIGNATURE_REGEX.finditer(data.text or u''):
+                video_signature = m.group(1)
+        return u'http://www.youtube.com/get_video?video_id=%s&t=%s&fmt=18' % (_id, video_signature)
diff --git a/weboob/backends/youtube/pages/__init__.py b/weboob/backends/youtube/pages/__init__.py
deleted file mode 100644
index 6900826f..00000000
--- a/weboob/backends/youtube/pages/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright(C) 2010  Christophe Benz
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-from .video import VideoPage
diff --git a/weboob/backends/youtube/pages/video.py b/weboob/backends/youtube/pages/video.py
deleted file mode 100644
index 6b8cfea0..00000000
--- a/weboob/backends/youtube/pages/video.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright(C) 2010  Christophe Benz, Romain Bignon
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, version 3 of the License.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-
-import re
-from logging import warning
-
-from weboob.tools.browser import BasePage
-
-from ..video import YoutubeVideo
-
-
-__all__ = ['VideoPage']
-
-
-class VideoPage(BasePage):
-    URL_REGEX = re.compile(r"https?://[w\.]*youtube.com/watch\?v=(.+)")
-    VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)')
-
-    def on_loaded(self):
-        self.video = YoutubeVideo(self.get_id())
-        self.video.title = self.get_title()
-        self.video.url = self.get_url()
-        self.set_details(self.video)
-
-    def get_id(self):
-        m = self.URL_REGEX.match(self.url)
-        if m:
-            return m.group(1)
-        warning("Unable to parse ID")
-        return 0
-
-    def get_url(self):
-        video_signature = None
-        for data in self.document.getiterator('script'):
-            if not data.text:
-                continue
-            for m in re.finditer(self.VIDEO_SIGNATURE_REGEX, data.text):
-                video_signature = m.group(1)
-        return 'http://www.youtube.com/get_video?video_id=%s&t=%s&fmt=18' % (self.video.id, video_signature)
-
-    def get_title(self):
-        found = self.document.getroot().cssselect('meta[name=title]')
-        if found:
-            content = found[0].attrib['content']
-            return unicode(content).strip()
-        return u''
-
-    def set_details(self, v):
-        v.author = self.document.getroot().cssselect('a.watch-description-username strong')[0].text