handle required fields and forbidden videos

This commit is contained in:
Christophe Benz 2010-07-12 03:11:54 +02:00
commit 62fc2a87c7
5 changed files with 128 additions and 95 deletions

View file

@ -21,8 +21,10 @@ import logging
from weboob.capabilities.video import ICapVideo from weboob.capabilities.video import ICapVideo
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend
from weboob.tools.misc import iter_fields
from .browser import YoutubeBrowser from .browser import YoutubeBrowser
from .pages import ForbiddenVideo
from .video import YoutubeVideo from .video import YoutubeVideo
@ -37,13 +39,25 @@ class YoutubeBackend(BaseBackend, ICapVideo):
DESCRIPTION = 'Youtube videos website' DESCRIPTION = 'Youtube videos website'
LICENSE = 'GPLv3' LICENSE = 'GPLv3'
CONFIG = {}
BROWSER = YoutubeBrowser BROWSER = YoutubeBrowser
def get_video(self, _id): def get_video(self, _id, video=None):
return self.browser.get_video(_id) try:
browser_video = self.browser.get_video(_id)
except ForbiddenVideo:
if video is None:
return None
else:
raise
if video is None:
return browser_video
else:
for k, v in iter_fields(browser_video):
if v and getattr(video, k) != v:
setattr(video, k, v)
return video
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
import gdata.youtube.service import gdata.youtube.service
yt_service = gdata.youtube.service.YouTubeService() yt_service = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery() query = gdata.youtube.service.YouTubeVideoQuery()
@ -57,12 +71,26 @@ class YoutubeBackend(BaseBackend, ICapVideo):
author = entry.media.name.text.decode('utf-8').strip() author = entry.media.name.text.decode('utf-8').strip()
else: else:
author = None author = None
yield YoutubeVideo(entry.id.text.split('/')[-1].decode('utf-8'), video = YoutubeVideo(entry.id.text.split('/')[-1].decode('utf-8'),
title=entry.media.title.text.decode('utf-8').strip(), title=entry.media.title.text.decode('utf-8').strip(),
author=author, author=author,
duration=datetime.timedelta(seconds=entry.media.duration.seconds.decode('utf-8').strip()), duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())),
thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(), thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(),
) )
if required_fields is not None:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
try:
self.get_video(video.id, video=video)
except ForbiddenVideo, e:
logging.debug(e)
continue
else:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
yield video
def iter_page_urls(self, mozaic_url): def iter_page_urls(self, mozaic_url):
raise NotImplementedError() raise NotImplementedError()

View file

@ -19,7 +19,7 @@
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
from .pages import VideoPage from .pages import ForbiddenVideoPage, VerifyAgePage, VideoPage
from .video import YoutubeVideo from .video import YoutubeVideo
@ -28,10 +28,15 @@ __all__ = ['YoutubeBrowser']
class YoutubeBrowser(BaseBrowser): class YoutubeBrowser(BaseBrowser):
DOMAIN = u'youtube.com' DOMAIN = u'youtube.com'
PAGES = {'.*youtube\.com/watch\?v=(.+)': VideoPage, PAGES = {'.*youtube\.com/watch\?v=(?P<id>.+)': VideoPage,
'.*youtube\.com/index\?ytsession=.+': ForbiddenVideoPage,
'.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage,
} }
@id2url(YoutubeVideo.id2url) @id2url(YoutubeVideo.id2url)
def get_video(self, url): def get_video(self, url):
self.location(url) self.location(url)
return self.page.video if hasattr(self.page, 'video'):
return self.page.video
else:
return None

View file

@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Christophe Benz, Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import re
from weboob.tools.browser import BasePage, ExpectedElementNotFound
from .video import YoutubeVideo
__all__ = ['ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage']
class ForbiddenVideo(Exception):
    """Signal that a requested video page cannot be accessed.

    Raised while loading a page that stands in for the video, such as an
    alert page or an age-verification page.
    """
class ForbiddenVideoPage(BasePage):
    """Page served in place of a video that cannot be accessed.

    Loading this page never succeeds: the alert text found in the document
    is converted into a ForbiddenVideo exception.
    """

    def on_loaded(self):
        """Raise ForbiddenVideo carrying the alert message of the page.

        Raises:
            ExpectedElementNotFound: no ``.yt-alert-content`` element exists
                in the document.
            ForbiddenVideo: in every other case, with the stripped alert text
                as its message.
        """
        selector = '.yt-alert-content'
        matches = self.document.getroot().cssselect(selector)
        if not matches:
            raise ExpectedElementNotFound(selector)
        # An element's ``text`` is None when it has no leading text (for
        # instance when it starts with a child tag). Fall back to an empty
        # message so the ForbiddenVideo is still raised instead of an
        # unrelated AttributeError.
        message = (matches[0].text or u'').strip()
        raise ForbiddenVideo(message)
class VerifyAgePage(BasePage):
    """Page reached when the site asks for an age verification."""

    def on_loaded(self):
        """Refuse the page: passing the age verification is not supported.

        Raises:
            ForbiddenVideo: always.
        """
        reason = 'verify age not implemented'
        raise ForbiddenVideo(reason)
class VideoPage(BasePage):
    """Watch page of a single video.

    Once the page is loaded, the parsed video is available as ``self.video``.
    """

    # Captures the value of a ``t`` query parameter embedded in script text.
    VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)')

    def on_loaded(self):
        """Build ``self.video`` from the loaded document.

        The video id is read from ``self.group_dict['id']``.
        NOTE(review): presumably filled from the named group of the URL
        pattern that selected this page -- confirm in BasePage.

        Raises:
            ExpectedElementNotFound: the title, the author or the video
                signature cannot be found in the document.
        """
        _id = self.group_dict['id']
        self.video = YoutubeVideo(_id,
                                  title=self.get_title(),
                                  url=self.get_url(_id),
                                  author=self.get_author(),
                                  )

    def _select_first(self, selector):
        """Return the first element matching the CSS ``selector``.

        Raises:
            ExpectedElementNotFound: nothing in the document matches.
        """
        matches = self.document.getroot().cssselect(selector)
        if not matches:
            raise ExpectedElementNotFound(selector)
        return matches[0]

    def get_author(self):
        """Return the stripped text of the uploader name element.

        Returns an empty string when the element exists but has no text.

        Raises:
            ExpectedElementNotFound: the element is missing.
        """
        element = self._select_first('a.watch-description-username strong')
        # ``text`` is None for an element without leading text.
        return (element.text or u'').strip()

    def get_title(self):
        """Return the stripped ``content`` of the ``title`` meta tag.

        Raises:
            ExpectedElementNotFound: the meta tag is missing.
        """
        element = self._select_first('meta[name=title]')
        return unicode(element.attrib['content']).strip()

    def get_url(self, _id):
        """Return the download URL of the video identified by ``_id``.

        Every ``script`` element is scanned for VIDEO_SIGNATURE_REGEX; the
        last match found in the document is the signature that is used.

        Raises:
            ExpectedElementNotFound: no script contains a signature. Without
                this check the URL would silently contain ``t=None``.
        """
        video_signature = None
        for script in self.document.getiterator('script'):
            if not script.text:
                continue
            for match in self.VIDEO_SIGNATURE_REGEX.finditer(script.text):
                video_signature = match.group(1)
        if video_signature is None:
            raise ExpectedElementNotFound(self.VIDEO_SIGNATURE_REGEX.pattern)
        return u'http://www.youtube.com/get_video?video_id=%s&t=%s&fmt=18' % (_id, video_signature)

View file

@ -1,18 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Christophe Benz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .video import VideoPage

View file

@ -1,64 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Christophe Benz, Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import re
from logging import warning
from weboob.tools.browser import BasePage
from ..video import YoutubeVideo
__all__ = ['VideoPage']
class VideoPage(BasePage):
    """Watch page of a single video.

    Once the page is loaded, the parsed video is available as ``self.video``.
    """

    # Extracts the video id from the URL of the page.
    URL_REGEX = re.compile(r"https?://[w\.]*youtube.com/watch\?v=(.+)")
    # Captures the value of a ``t`` query parameter embedded in script text.
    VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)')

    def on_loaded(self):
        """Build ``self.video`` (id, title, url, author) from the document."""
        self.video = YoutubeVideo(self.get_id())
        self.video.title = self.get_title()
        self.video.url = self.get_url()
        self.set_details(self.video)

    def get_id(self):
        """Return the video id parsed from ``self.url``.

        Logs a warning and returns 0 when the URL does not match URL_REGEX.
        """
        m = self.URL_REGEX.match(self.url)
        if m:
            return m.group(1)
        warning("Unable to parse ID")
        return 0

    def get_url(self):
        """Return the download URL of ``self.video``.

        Every ``script`` element is scanned for VIDEO_SIGNATURE_REGEX; the
        last match found in the document is the signature that is used.
        Logs a warning and returns None when no signature is found, instead
        of formatting the literal ``None`` into the URL.
        """
        video_signature = None
        for script in self.document.getiterator('script'):
            if not script.text:
                continue
            for m in self.VIDEO_SIGNATURE_REGEX.finditer(script.text):
                video_signature = m.group(1)
        if video_signature is None:
            warning("Unable to find video signature")
            return None
        return 'http://www.youtube.com/get_video?video_id=%s&t=%s&fmt=18' % (self.video.id, video_signature)

    def get_title(self):
        """Return the stripped ``content`` of the ``title`` meta tag.

        Returns an empty unicode string when the tag is missing.
        """
        found = self.document.getroot().cssselect('meta[name=title]')
        if found:
            content = found[0].attrib['content']
            return unicode(content).strip()
        return u''

    def set_details(self, v):
        """Set ``v.author`` from the uploader name element of the page.

        When the element is missing or has no text, a warning is logged and
        ``v.author`` is left untouched, in line with the fallbacks of
        get_id() and get_title().
        """
        found = self.document.getroot().cssselect('a.watch-description-username strong')
        if found and found[0].text is not None:
            v.author = found[0].text
        else:
            warning("Unable to find author")