From 62fc2a87c79fda8e16ed07348658ddb5970ad0cc Mon Sep 17 00:00:00 2001 From: Christophe Benz Date: Mon, 12 Jul 2010 03:11:54 +0200 Subject: [PATCH] handle required fields and forbidden videos --- weboob/backends/youtube/backend.py | 48 ++++++++++--- weboob/backends/youtube/browser.py | 11 ++- weboob/backends/youtube/pages.py | 82 +++++++++++++++++++++++ weboob/backends/youtube/pages/__init__.py | 18 ----- weboob/backends/youtube/pages/video.py | 64 ------------------ 5 files changed, 128 insertions(+), 95 deletions(-) create mode 100644 weboob/backends/youtube/pages.py delete mode 100644 weboob/backends/youtube/pages/__init__.py delete mode 100644 weboob/backends/youtube/pages/video.py diff --git a/weboob/backends/youtube/backend.py b/weboob/backends/youtube/backend.py index 597f3af0..10140d8c 100644 --- a/weboob/backends/youtube/backend.py +++ b/weboob/backends/youtube/backend.py @@ -21,8 +21,10 @@ import logging from weboob.capabilities.video import ICapVideo from weboob.tools.backend import BaseBackend +from weboob.tools.misc import iter_fields from .browser import YoutubeBrowser +from .pages import ForbiddenVideo from .video import YoutubeVideo @@ -37,13 +39,25 @@ class YoutubeBackend(BaseBackend, ICapVideo): DESCRIPTION = 'Youtube videos website' LICENSE = 'GPLv3' - CONFIG = {} BROWSER = YoutubeBrowser - def get_video(self, _id): - return self.browser.get_video(_id) + def get_video(self, _id, video=None): + try: + browser_video = self.browser.get_video(_id) + except ForbiddenVideo: + if video is None: + return None + else: + raise + if video is None: + return browser_video + else: + for k, v in iter_fields(browser_video): + if v and getattr(video, k) != v: + setattr(video, k, v) + return video - def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): + def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): import gdata.youtube.service yt_service = 
gdata.youtube.service.YouTubeService() query = gdata.youtube.service.YouTubeVideoQuery() @@ -57,12 +71,26 @@ class YoutubeBackend(BaseBackend, ICapVideo): author = entry.media.name.text.decode('utf-8').strip() else: author = None - yield YoutubeVideo(entry.id.text.split('/')[-1].decode('utf-8'), - title=entry.media.title.text.decode('utf-8').strip(), - author=author, - duration=datetime.timedelta(seconds=entry.media.duration.seconds.decode('utf-8').strip()), - thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(), - ) + video = YoutubeVideo(entry.id.text.split('/')[-1].decode('utf-8'), + title=entry.media.title.text.decode('utf-8').strip(), + author=author, + duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())), + thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(), + ) + if required_fields is not None: + missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) + if missing_required_fields: + logging.debug(u'Completing missing required fields: %s' % missing_required_fields) + try: + self.get_video(video.id, video=video) + except ForbiddenVideo, e: + logging.debug(e) + continue + else: + missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) + if missing_required_fields: + raise Exception(u'Could not load all required fields. 
Missing: %s' % missing_required_fields) + yield video def iter_page_urls(self, mozaic_url): raise NotImplementedError() diff --git a/weboob/backends/youtube/browser.py b/weboob/backends/youtube/browser.py index f456bed2..c84eb9c6 100644 --- a/weboob/backends/youtube/browser.py +++ b/weboob/backends/youtube/browser.py @@ -19,7 +19,7 @@ from weboob.tools.browser import BaseBrowser from weboob.tools.browser.decorators import id2url -from .pages import VideoPage +from .pages import ForbiddenVideoPage, VerifyAgePage, VideoPage from .video import YoutubeVideo @@ -28,10 +28,15 @@ __all__ = ['YoutubeBrowser'] class YoutubeBrowser(BaseBrowser): DOMAIN = u'youtube.com' - PAGES = {'.*youtube\.com/watch\?v=(.+)': VideoPage, + PAGES = {'.*youtube\.com/watch\?v=(?P<id>.+)': VideoPage, + '.*youtube\.com/index\?ytsession=.+': ForbiddenVideoPage, + '.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage, } @id2url(YoutubeVideo.id2url) def get_video(self, url): self.location(url) - return self.page.video + if hasattr(self.page, 'video'): + return self.page.video + else: + return None diff --git a/weboob/backends/youtube/pages.py b/weboob/backends/youtube/pages.py new file mode 100644 index 00000000..bace372e --- /dev/null +++ b/weboob/backends/youtube/pages.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010 Christophe Benz, Romain Bignon +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + +import re + +from weboob.tools.browser import BasePage, ExpectedElementNotFound + +from .video import YoutubeVideo + + +__all__ = ['ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage'] + + +class ForbiddenVideo(Exception): + pass + + +class ForbiddenVideoPage(BasePage): + def on_loaded(self): + selector = '.yt-alert-content' + try: + element = self.document.getroot().cssselect(selector)[0] + except IndexError: + raise ExpectedElementNotFound(selector) + raise ForbiddenVideo(element.text.strip()) + + +class VerifyAgePage(BasePage): + def on_loaded(self): + raise ForbiddenVideo('verify age not implemented') + + +class VideoPage(BasePage): + VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)') + + def on_loaded(self): + _id = self.group_dict['id'] + self.video = YoutubeVideo(_id, + title=self.get_title(), + url=self.get_url(_id), + author=self.get_author(), + ) + + def get_author(self): + selector = 'a.watch-description-username strong' + try: + element = self.document.getroot().cssselect(selector)[0] + except IndexError: + raise ExpectedElementNotFound(selector) + return element.text.strip() + + def get_title(self): + selector = 'meta[name=title]' + try: + element = self.document.getroot().cssselect(selector)[0] + except IndexError: + raise ExpectedElementNotFound(selector) + return unicode(element.attrib['content']).strip() + + def get_url(self, _id): + video_signature = None + for data in self.document.getiterator('script'): + if not data.text: + continue + for m in re.finditer(self.VIDEO_SIGNATURE_REGEX, data.text): + video_signature = m.group(1) + return u'http://www.youtube.com/get_video?video_id=%s&t=%s&fmt=18' % (_id, video_signature) diff --git a/weboob/backends/youtube/pages/__init__.py b/weboob/backends/youtube/pages/__init__.py deleted file mode 
100644 index 6900826f..00000000 --- a/weboob/backends/youtube/pages/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2010 Christophe Benz -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -from .video import VideoPage diff --git a/weboob/backends/youtube/pages/video.py b/weboob/backends/youtube/pages/video.py deleted file mode 100644 index 6b8cfea0..00000000 --- a/weboob/backends/youtube/pages/video.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2010 Christophe Benz, Romain Bignon -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- - -import re -from logging import warning - -from weboob.tools.browser import BasePage - -from ..video import YoutubeVideo - - -__all__ = ['VideoPage'] - - -class VideoPage(BasePage): - URL_REGEX = re.compile(r"https?://[w\.]*youtube.com/watch\?v=(.+)") - VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)') - - def on_loaded(self): - self.video = YoutubeVideo(self.get_id()) - self.video.title = self.get_title() - self.video.url = self.get_url() - self.set_details(self.video) - - def get_id(self): - m = self.URL_REGEX.match(self.url) - if m: - return m.group(1) - warning("Unable to parse ID") - return 0 - - def get_url(self): - video_signature = None - for data in self.document.getiterator('script'): - if not data.text: - continue - for m in re.finditer(self.VIDEO_SIGNATURE_REGEX, data.text): - video_signature = m.group(1) - return 'http://www.youtube.com/get_video?video_id=%s&t=%s&fmt=18' % (self.video.id, video_signature) - - def get_title(self): - found = self.document.getroot().cssselect('meta[name=title]') - if found: - content = found[0].attrib['content'] - return unicode(content).strip() - return u'' - - def set_details(self, v): - v.author = self.document.getroot().cssselect('a.watch-description-username strong')[0].text