Use the browser to get the file URL, and raise an error when the video requires age verification

2011-07-24 16:57:01 +02:00 · 2011-07-24 16:57:01 +02:00 · d11084f05e
commit d11084f05e
parent ddda7f88e4
3 changed files with 57 additions and 38 deletions
--- a/weboob/backends/youtube/backend.py
+++ b/weboob/backends/youtube/backend.py
@ -46,17 +46,6 @@ class YoutubeBackend(BaseBackend, ICapVideo):
    BROWSER = YoutubeBrowser
    URL_RE = re.compile(r'^https?://(?:\w*\.?youtube\.com/(?:watch\?v=|v/)|youtu\.be\/|\w*\.?youtube\.com\/user\/\w+#p\/u\/\d+\/)([^\?&]+)')
    AVAILABLE_FORMATS = [38, 37, 22, 45, 35, 34, 43, 18, 6, 5, 17, 13]
    FORMAT_EXTENSIONS = {
        13: '3gp',
        17: 'mp4',
        18: 'mp4',
        22: 'mp4',
        37: 'mp4',
        38: 'video', # You actually don't know if this will be MOV, AVI or whatever
        43: 'webm',
        45: 'webm',
    }
    def _entry2video(self, entry):
        """
@ -72,35 +61,22 @@ class YoutubeBackend(BaseBackend, ICapVideo):
            video.author = to_unicode(entry.media.name.text.strip())
        return video
-    def _set_video_url(self, video, format=18):
+    def _set_video_url(self, video):
        """
        In the case of a download, if the user-chosen format is not
        available, the next available format will be used.
        Much of the code for this method is borrowed from youtubeservice.py of Cutetube
        http://maemo.org/packages/view/cutetube/.
        """
-        player_url = YoutubeVideo.id2url(video.id)
+        if video.url:
-        html = urllib.urlopen(player_url).read()
+            return
        html = ''.join(html.split())
        formats = {}
        pos = html.find('","fmt_url_map":"')
        if (pos != -1):
            pos2 = html.find('"', pos + 17)
            fmt_map = urllib.unquote(html[pos + 17:pos2]) + ','
            parts = fmt_map.split('|')
            key = parts[0]
            for p in parts[1:]:
                idx = p.rfind(',')
                value = p[:idx].replace('\\/', '/').replace('\u0026', '&').replace(',', '%2C')
                formats[int(key)] = value
                key = p[idx + 1:]
        for format in self.AVAILABLE_FORMATS[self.AVAILABLE_FORMATS.index(format):]:
            if format in formats:
                video.url = formats.get(format)
                video.ext = self.FORMAT_EXTENSIONS.get(format, 'flv')
                return True
-        return False
+        player_url = YoutubeVideo.id2url(video.id)
        with self.browser:
            url, ext = self.browser.get_video_url(player_url)
        video.url = url
        video.ext = ext
    def get_video(self, _id):
        m = self.URL_RE.match(_id)
--- a/weboob/backends/youtube/browser.py
+++ b/weboob/backends/youtube/browser.py
@ -33,3 +33,9 @@ class YoutubeBrowser(BaseBrowser):
             r'.*youtube\.com/index\?ytsession=.+': ForbiddenVideoPage,
             r'.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage,
            }
    def get_video_url(self, player_url):
        """
        Load the player page at player_url and hand back whatever the
        resulting page's own get_video_url() returns.

        An AssertionError is raised if the browser did not end up on a
        VideoPage after loading the URL.
        """
        self.location(player_url)
        assert self.is_on_page(VideoPage)
        current_page = self.page
        return current_page.get_video_url()
--- a/weboob/backends/youtube/pages.py
+++ b/weboob/backends/youtube/pages.py
@ -18,8 +18,9 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
-from weboob.tools.browser import BasePage
+import urllib
 from weboob.tools.browser import BasePage
 __all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage']
@ -30,15 +31,51 @@ class ForbiddenVideo(Exception):
 class ForbiddenVideoPage(BasePage):
-    def get_video(self, video=None):
+    def on_loaded(self):
        element = self.parser.select(self.document.getroot(), '.yt-alert-content', 1)
        raise ForbiddenVideo(element.text.strip())
 class VerifyAgePage(BasePage):
-    def get_video(self, video=None):
+    def on_loaded(self):
-        raise ForbiddenVideo('verify age not implemented')
+        raise ForbiddenVideo('This video or group may contain content that is inappropriate for some users')
 class VideoPage(BasePage):
-    pass
+    AVAILABLE_FORMATS = [38, 37, 22, 45, 35, 34, 43, 18, 6, 5, 17, 13]
    FORMAT_EXTENSIONS = {
        13: '3gp',
        17: 'mp4',
        18: 'mp4',
        22: 'mp4',
        37: 'mp4',
        38: 'video', # You actually don't know if this will be MOV, AVI or whatever
        43: 'webm',
        45: 'webm',
    }
    def get_video_url(self, format=18):
        """
        Find the direct URL of the video in the scripts of this page.

        The first <script> element whose text contains a "fmt_url_map"
        entry is parsed into a {format id: URL} mapping. Formats are then
        tried in AVAILABLE_FORMATS order, starting at the requested one, so
        when the requested format is missing the next available one is used.

        :param format: preferred format id; must be listed in
                       AVAILABLE_FORMATS
        :returns: (url, ext) for the first available format, or
                  (None, None) when no suitable format was found
        :raises ValueError: if format is not in AVAILABLE_FORMATS, or if a
                            format id found in the map is not an integer
        """
        # urllib.unquote only exists on Python 2; fall back to its Python 3
        # location so the parsing does not depend on the interpreter.
        try:
            from urllib import unquote
        except ImportError:
            from urllib.parse import unquote

        marker = '"fmt_url_map": "'
        formats = {}
        for script in self.parser.select(self.document.getroot(), 'script'):
            text = script.text
            if not text:
                continue

            pos = text.find(marker)
            if pos < 0:
                continue

            # The value starts right after the marker. Use the marker's real
            # length: a hard-coded offset that is one too large swallows the
            # first digit of the first format id.
            start = pos + len(marker)
            end = text.find('"', start)
            if end < 0:
                # No closing quote: malformed entry, try the other scripts.
                continue

            # The parsing below treats the map as "id|url,id|url,...".
            # Splitting on '|' leaves every URL glued to the following id
            # ("url,id"); the appended ',' makes the last URL fit that shape.
            parts = (unquote(text[start:end]) + ',').split('|')
            key = parts[0]
            for part in parts[1:]:
                idx = part.rfind(',')
                url = part[:idx].replace('\\/', '/')
                # '\\u0026' is the literal six-character JSON escape for '&'.
                url = url.replace('\\u0026', '&').replace(',', '%2C')
                formats[int(key)] = url
                key = part[idx + 1:]
            break

        first = self.AVAILABLE_FORMATS.index(format)
        for candidate in self.AVAILABLE_FORMATS[first:]:
            if candidate in formats:
                ext = self.FORMAT_EXTENSIONS.get(candidate, 'flv')
                return formats[candidate], ext

        return None, None