From d82cc615a3902917000df105619bd58e0a3127a7 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Tue, 4 Mar 2014 20:56:38 +0100 Subject: [PATCH] steal youtube-dl code to fix youtube module --- modules/youtube/backend.py | 2 +- modules/youtube/browser.py | 4 +- modules/youtube/pages.py | 953 ++++++++++++++++++++++++++++++++++--- 3 files changed, 900 insertions(+), 59 deletions(-) diff --git a/modules/youtube/backend.py b/modules/youtube/backend.py index 1b1d6de4..5958d06f 100644 --- a/modules/youtube/backend.py +++ b/modules/youtube/backend.py @@ -92,7 +92,7 @@ class YoutubeBackend(BaseBackend, ICapVideo, ICapCollection): player_url = YoutubeVideo.id2url(video.id) with self.browser: - url, ext = self.browser.get_video_url(player_url) + url, ext = self.browser.get_video_url(video, player_url) video.url = unicode(url) video.ext = unicode(ext) diff --git a/modules/youtube/browser.py b/modules/youtube/browser.py index b506e649..5aaa2963 100644 --- a/modules/youtube/browser.py +++ b/modules/youtube/browser.py @@ -48,8 +48,8 @@ class YoutubeBrowser(BaseBrowser): self.location('https://accounts.google.com/ServiceLogin?uilel=3&service=youtube&passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26nomobiletemp%3D1%26hl%3Den_US%26next%3D%252F&hl=en_US<mpl=sso') self.page.login(self.username, self.password) - def get_video_url(self, player_url): + def get_video_url(self, video, player_url): self.location(player_url + '&has_verified=1') assert self.is_on_page(VideoPage) - return self.page.get_video_url() + return self.page.get_video_url(video) diff --git a/modules/youtube/pages.py b/modules/youtube/pages.py index 28aca560..19f38154 100644 --- a/modules/youtube/pages.py +++ b/modules/youtube/pages.py @@ -20,8 +20,16 @@ # Some parts are taken from youtube-dl, licensed under the UNLICENSE. -from urlparse import parse_qsl +from urlparse import urlparse, parse_qs +import zlib import re +import os +import string +import struct +import collections +import traceback +import urllib +import io from weboob.capabilities.base import UserError from weboob.tools.browser import BasePage, BrokenPageError, BrowserIncorrectPassword @@ -87,85 +95,918 @@ class VerifyControversyPage(BaseYoutubePage): self.browser.submit() +def determine_ext(url, default_ext=u'unknown_video'): + guess = url.partition(u'?')[0].rpartition(u'.')[2] + if re.match(r'^[A-Za-z0-9]+$', guess): + return guess + else: + return default_ext + +_NO_DEFAULT = object() + + class VideoPage(BaseYoutubePage): - AVAILABLE_FORMATS = [38, 37, 45, 22, 43, 35, 34, 18, 6, 5, 17, 13] - FORMAT_EXTENSIONS = { - 13: '3gp', - 17: 'mp4', - 18: 'mp4', - 22: 'mp4', - 37: 'mp4', - 38: 'video', # You actually don't know if this will be MOV, AVI or whatever - 43: 'webm', - 45: 'webm', + _formats = { + '5': {'ext': 'flv', 'width': 400, 'height': 240}, + '6': {'ext': 'flv', 'width': 450, 'height': 270}, + '13': {'ext': '3gp'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720}, + '34': {'ext': 'flv', 'width': 640, 'height': 360}, + '35': {'ext': 'flv', 'width': 854, 'height': 480}, + '36': {'ext': '3gp', 'width': 320, 'height': 240}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072}, + '43': {'ext': 'webm', 'width': 640, 'height': 360}, + '44': {'ext': 'webm', 'width': 854, 'height': 480}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, + + + # 3d videos + '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, + + # Apple HTTP Live Streaming + '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40}, + '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40}, + '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40}, + '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40}, + '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, + '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'preference': -40}, + '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'preference': -40}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, + '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, + '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, + '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, + '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, + '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, + '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40}, + '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40}, + + # Dash webm audio + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, + '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50}, + + # RTMP (unnamed) + '_rtmp': {'protocol': 'rtmp'}, } - def _decrypt_signature(self, s): + def __init__(self, *args, **kwargs): + BasePage.__init__(self, *args, **kwargs) + self._player_cache = {} + + def _extract_signature_function(self, video_id, player_url, slen): + id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)\.(?P[a-z]+)$', + player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + # Read from filesystem cache + func_id = '%s_%s_%d' % (player_type, player_id, slen) + assert os.path.basename(func_id) == func_id + + if player_type == 'js': + code = self.browser.readurl(player_url) + res = self._parse_sig_js(code) + elif player_type == 'swf': + urlh = self.browser.openurl(player_url) + code = urlh.read() + res = self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + return res + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise BrokenPageError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + + r'=(?P.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise BrokenPageError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P[a-z]+)\[(?P.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P[a-zA-Z$]+)\((?P[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise BrokenPageError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P[a-z,]+)\){(?P[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise BrokenPageError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = multinames[trait_name_idx] + else: + raise BrokenPageError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count = u30() + for class_id in range(class_count): + name_idx = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + if searched_class_id is None: + raise BrokenPageError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + if method_idx in method_idxs: + m = Method(code, local_count) + methods[method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise BrokenPageError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], unicode) + assert isinstance(obj, unicode) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname == u'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == u'join': + assert len(args) == 1 + assert isinstance(args[0], unicode) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 79: # callpropvoid + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + u'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), unicode)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, player_url, age_gate=False): """Turn the encrypted s field into a working signature""" - if len(s) == 92: + if player_url is not None: + if player_url.startswith(u'//'): + player_url = u'https:' + player_url + try: + player_id = (player_url, len(s)) + if player_id not in self._player_cache: + func = self._extract_signature_function( + video_id, player_url, len(s) + ) + self._player_cache[player_id] = func + func = self._player_cache[player_id] + return func(s) + except Exception: + tb = traceback.format_exc() + raise BrokenPageError(u'Automatic signature extraction failed: %s' % tb) + + return self._static_decrypt_signature( + s, video_id, player_url, age_gate) + + def _static_decrypt_signature(self, s, video_id, player_url, age_gate): + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + + if len(s) == 93: + return s[86:29:-1] + s[88] + s[28:5:-1] + elif len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 91: + return s[84:27:-1] + s[86] + s[26:5:-1] elif len(s) == 90: return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] + elif len(s) == 89: + return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: - return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1] + return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[2:63] + s[82] + s[64:82] + s[63] + return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1] elif len(s) == 85: - return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21] + return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: - return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26] + return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1] elif len(s) == 83: - return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:] + return s[80:63:-1] + s[0] + s[62:0:-1] + s[63] elif len(s) == 82: - return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] + return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37] elif len(s) == 81: - return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81] + return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] + elif len(s) == 80: + return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80] + elif len(s) == 79: + return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] else: raise BrokenPageError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def get_video_url(self, format=38): - formats = {} - for script in self.parser.select(self.document.getroot(), 'script'): - text = script.text - if not text: - continue + def _extract_from_m3u8(self, manifest_url, video_id): + url_map = {} + def _get_urls(_manifest): + lines = _manifest.split('\n') + urls = filter(lambda l: l and not l.startswith('#'), + lines) + return urls + manifest = self.browser.readurl(manifest_url) + formats_urls = _get_urls(manifest) + for format_url in formats_urls: + itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') + url_map[itag] = format_url + return url_map - pattern = "ytplayer.config = " - pos = text.find(pattern) - if pos < 0: - continue + def get_video_url(self, video): + video_id = video.id + video_webpage = ' '.join([self.parser.tocleanstring(el) for el in self.document.xpath('//script')]) - sub = text[pos+len(pattern):].rstrip(';\n') - # json spec only allows \u - convert other escape sequences - sub = re.sub(r'\\x([a-f0-9]{2})', r'\u\1', sub) - sub = re.sub(r'\\U([a-f0-9]{4})([a-f0-9]{4})', r'\u\1\u\2', sub) - a = json.loads(sub) + # Attempt to extract SWF player URL + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + if mobj is not None: + player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) + else: + player_url = None - for part in a['args']['url_encoded_fmt_stream_map'].split(','): - args = dict(parse_qsl(part)) - url = args['url'] - if 'sig' in args: - signature = args['sig'] - elif 's' in args: - signature = self._decrypt_signature(args['s']) - formats[int(args['itag'])] = args['url'] + '&signature=' + signature + # Get video info + if re.search(r'player-age-gate-content">', video_webpage) is not None: + age_gate = True + # We simulate the access to the video from www.youtube.com/v/{video_id} + # this can be viewed without login into Youtube + data = urllib.urlencode({'video_id': video_id, + 'el': 'player_embedded', + 'gl': 'US', + 'hl': 'en', + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'asv': 3, + 'sts':'1588', + }) + video_info_url = 'https://www.youtube.com/get_video_info?' + data + video_info_webpage = self.browser.readurl(video_info_url) + video_info = parse_qs(video_info_webpage) + else: + age_gate = False + for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (video_id, el_type)) + video_info_webpage = self.browser.readurl(video_info_url) + video_info = parse_qs(video_info_webpage) + if 'token' in video_info: + break + if 'token' not in video_info: + if 'reason' in video_info: + raise BrokenPageError(u'YouTube said: %s' % video_info['reason'][0], expected=True) + else: + raise BrokenPageError(u'"token" parameter not in video info for unknown reason') - break + # Check for "rental" videos + if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: + raise UserError(u'"rental" videos not supported') - # choose the better format to use. - for format in self.AVAILABLE_FORMATS[self.AVAILABLE_FORMATS.index(format):]: - if format in formats: - url = formats.get(format) - ext = self.FORMAT_EXTENSIONS.get(format, 'flv') - return url, ext + # Decide which formats to download + try: + mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) + if not mobj: + raise ValueError('Could not find vevo ID') + ytplayer_config = json.loads(mobj.group(1)) + args = ytplayer_config['args'] + # Easy way to know if the 's' value is in url_encoded_fmt_stream_map + # this signatures are encrypted + if 'url_encoded_fmt_stream_map' not in args: + raise ValueError(u'No stream_map present') # caught below + re_signature = re.compile(r'[&,]s=') + m_s = re_signature.search(args['url_encoded_fmt_stream_map']) + if m_s is not None: + video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] + m_s = re_signature.search(args.get('adaptive_fmts', u'')) + if m_s is not None: + if 'adaptive_fmts' in video_info: + video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts'] + else: + video_info['adaptive_fmts'] = [args['adaptive_fmts']] + except ValueError: + pass - # check errors only here, in case the video url is available though - error = self.document.xpath('//h1[@id="unavailable-message"]') - if len(error) > 0: - raise ForbiddenVideo(unicode(error[0].text).strip()) + def _map_to_format_list(urlmap): + formats = [] + for itag, video_real_url in urlmap.items(): + dct = { + 'format_id': itag, + 'url': video_real_url, + 'player_url': player_url, + } + if itag in self._formats: + dct.update(self._formats[itag]) + formats.append(dct) + return formats - raise BrokenPageError('Unable to find file URL') + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): + self.report_rtmp_download() + formats = [{ + 'format_id': '_rtmp', + 'protocol': 'rtmp', + 'url': video_info['conn'][0], + 'player_url': player_url, + }] + elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: + encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] + if 'rtmpe%3Dyes' in encoded_url_map: + raise BrokenPageError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) + url_map = {} + for url_data_str in encoded_url_map.split(','): + url_data = parse_qs(url_data_str) + if 'itag' in url_data and 'url' in url_data: + url = url_data['url'][0] + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + if not age_gate: + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + player_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) + url += '&signature=' + signature + if 'ratebypass' not in url: + url += '&ratebypass=yes' + url_map[url_data['itag'][0]] = url + formats = _map_to_format_list(url_map) + elif video_info.get('hlsvp'): + manifest_url = video_info['hlsvp'][0] + url_map = self._extract_from_m3u8(manifest_url, video_id) + formats = _map_to_format_list(url_map) + else: + raise BrokenPageError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + + self._sort_formats(formats) + + return formats[0]['url'], formats[0]['ext'] + + def _sort_formats(self, formats): + if not formats: + raise BrokenPageError(u'No video formats found') + + def _formats_key(f): + # TODO remove the following workaround + if not f.get('ext') and 'url' in f: + f['ext'] = determine_ext(f['url']) + + preference = f.get('preference') + if preference is None: + proto = f.get('protocol') + if proto is None: + proto = urlparse(f.get('url', '')).scheme + + preference = 0 if proto in ['http', 'https'] else -0.1 + if f.get('ext') in ['f4f', 'f4m']: # Not yet supported + preference -= 0.5 + + if f.get('vcodec') == 'none': # audio only + ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a'] + ext_preference = 0 + try: + audio_ext_preference = ORDER.index(f['ext']) + except ValueError: + audio_ext_preference = -1 + else: + ORDER = [u'webm', u'flv', u'mp4'] + try: + ext_preference = ORDER.index(f['ext']) + except ValueError: + ext_preference = -1 + audio_ext_preference = 0 + + return ( + preference, + f.get('quality') if f.get('quality') is not None else -1, + f.get('height') if f.get('height') is not None else -1, + f.get('width') if f.get('width') is not None else -1, + ext_preference, + f.get('tbr') if f.get('tbr') is not None else -1, + f.get('vbr') if f.get('vbr') is not None else -1, + f.get('abr') if f.get('abr') is not None else -1, + audio_ext_preference, + f.get('filesize') if f.get('filesize') is not None else -1, + f.get('format_id'), + ) + formats.sort(key=_formats_key) + + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + RegexNotFoundError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, unicode, type(re.compile('')))): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if mobj: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif default is not _NO_DEFAULT: + return default + elif fatal: + raise BrokenPageError(u'Unable to extract %s' % name) + else: + self.logger.warning(u'unable to extract %s' % name) + return None