From d11084f05e4e50025efe4cf7cb65529f17e41512 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Sun, 24 Jul 2011 16:57:01 +0200 Subject: [PATCH] use browser to get file URL, and leave an error when it needs an age verification --- weboob/backends/youtube/backend.py | 42 ++++++-------------------- weboob/backends/youtube/browser.py | 6 ++++ weboob/backends/youtube/pages.py | 47 ++++++++++++++++++++++++++---- 3 files changed, 57 insertions(+), 38 deletions(-) diff --git a/weboob/backends/youtube/backend.py b/weboob/backends/youtube/backend.py index 50203c09..57e62946 100644 --- a/weboob/backends/youtube/backend.py +++ b/weboob/backends/youtube/backend.py @@ -46,17 +46,6 @@ class YoutubeBackend(BaseBackend, ICapVideo): BROWSER = YoutubeBrowser URL_RE = re.compile(r'^https?://(?:\w*\.?youtube\.com/(?:watch\?v=|v/)|youtu\.be\/|\w*\.?youtube\.com\/user\/\w+#p\/u\/\d+\/)([^\?&]+)') - AVAILABLE_FORMATS = [38, 37, 22, 45, 35, 34, 43, 18, 6, 5, 17, 13] - FORMAT_EXTENSIONS = { - 13: '3gp', - 17: 'mp4', - 18: 'mp4', - 22: 'mp4', - 37: 'mp4', - 38: 'video', # You actually don't know if this will be MOV, AVI or whatever - 43: 'webm', - 45: 'webm', - } def _entry2video(self, entry): """ @@ -72,35 +61,22 @@ class YoutubeBackend(BaseBackend, ICapVideo): video.author = to_unicode(entry.media.name.text.strip()) return video - def _set_video_url(self, video, format=18): + def _set_video_url(self, video): """ In the case of a download, if the user-chosen format is not available, the next available format will be used. Much of the code for this method is borrowed from youtubeservice.py of Cutetube http://maemo.org/packages/view/cutetube/. """ - player_url = YoutubeVideo.id2url(video.id) - html = urllib.urlopen(player_url).read() - html = ''.join(html.split()) - formats = {} - pos = html.find('","fmt_url_map":"') - if (pos != -1): - pos2 = html.find('"', pos + 17) - fmt_map = urllib.unquote(html[pos + 17:pos2]) + ',' - parts = fmt_map.split('|') - key = parts[0] - for p in parts[1:]: - idx = p.rfind(',') - value = p[:idx].replace('\\/', '/').replace('\u0026', '&').replace(',', '%2C') - formats[int(key)] = value - key = p[idx + 1:] - for format in self.AVAILABLE_FORMATS[self.AVAILABLE_FORMATS.index(format):]: - if format in formats: - video.url = formats.get(format) - video.ext = self.FORMAT_EXTENSIONS.get(format, 'flv') - return True + if video.url: + return - return False + player_url = YoutubeVideo.id2url(video.id) + with self.browser: + url, ext = self.browser.get_video_url(player_url) + + video.url = url + video.ext = ext def get_video(self, _id): m = self.URL_RE.match(_id) diff --git a/weboob/backends/youtube/browser.py b/weboob/backends/youtube/browser.py index 83259f5a..54142752 100644 --- a/weboob/backends/youtube/browser.py +++ b/weboob/backends/youtube/browser.py @@ -33,3 +33,9 @@ class YoutubeBrowser(BaseBrowser): r'.*youtube\.com/index\?ytsession=.+': ForbiddenVideoPage, r'.*youtube\.com/verify_age\?next_url=(?P.+)': VerifyAgePage, } + + def get_video_url(self, player_url): + self.location(player_url) + + assert self.is_on_page(VideoPage) + return self.page.get_video_url() diff --git a/weboob/backends/youtube/pages.py b/weboob/backends/youtube/pages.py index 9b47f18b..f5996330 100644 --- a/weboob/backends/youtube/pages.py +++ b/weboob/backends/youtube/pages.py @@ -18,8 +18,9 @@ # along with weboob. If not, see . -from weboob.tools.browser import BasePage +import urllib +from weboob.tools.browser import BasePage __all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage'] @@ -30,15 +31,51 @@ class ForbiddenVideo(Exception): class ForbiddenVideoPage(BasePage): - def get_video(self, video=None): + def on_loaded(self): element = self.parser.select(self.document.getroot(), '.yt-alert-content', 1) raise ForbiddenVideo(element.text.strip()) class VerifyAgePage(BasePage): - def get_video(self, video=None): - raise ForbiddenVideo('verify age not implemented') + def on_loaded(self): + raise ForbiddenVideo('This video or group may contain content that is inappropriate for some users') class VideoPage(BasePage): - pass + AVAILABLE_FORMATS = [38, 37, 22, 45, 35, 34, 43, 18, 6, 5, 17, 13] + FORMAT_EXTENSIONS = { + 13: '3gp', + 17: 'mp4', + 18: 'mp4', + 22: 'mp4', + 37: 'mp4', + 38: 'video', # You actually don't know if this will be MOV, AVI or whatever + 43: 'webm', + 45: 'webm', + } + + def get_video_url(self, format=18): + formats = {} + for script in self.parser.select(self.document.getroot(), 'script'): + text = script.text + if not text: + continue + pos = text.find('"fmt_url_map": "') + if pos >= 0: + pos2 = text.find('"', pos + 17) + fmt_map = urllib.unquote(text[pos + 17:pos2]) + ',' + parts = fmt_map.split('|') + key = parts[0] + for p in parts[1:]: + idx = p.rfind(',') + value = p[:idx].replace('\\/', '/').replace('\u0026', '&').replace(',', '%2C') + formats[int(key)] = value + key = p[idx + 1:] + break + for format in self.AVAILABLE_FORMATS[self.AVAILABLE_FORMATS.index(format):]: + if format in formats: + url = formats.get(format) + ext = self.FORMAT_EXTENSIONS.get(format, 'flv') + return url, ext + + return None, None