From a5eb1d789c8e6a851be6e05704c294b3d66efdbb Mon Sep 17 00:00:00 2001 From: Florent Date: Fri, 5 Sep 2014 15:48:52 +0200 Subject: [PATCH] Update part of the js interpreter --- modules/youtube/pages.py | 121 ++++++++++++++++++++++++++++++--------- 1 file changed, 95 insertions(+), 26 deletions(-) diff --git a/modules/youtube/pages.py b/modules/youtube/pages.py index 88fe01b9..31d396a0 100644 --- a/modules/youtube/pages.py +++ b/modules/youtube/pages.py @@ -187,7 +187,7 @@ class VideoPage(BaseYoutubePage): self._player_cache = {} def _extract_signature_function(self, video_id, player_url, slen): - id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)\.(?P[a-z]+)$', + id_m = re.match(r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', player_url) player_type = id_m.group('ext') player_id = id_m.group('id') @@ -214,6 +214,8 @@ class VideoPage(BaseYoutubePage): u'Initial JS player signature function name') functions = {} + objects = {} + code = jscode def argidx(varname): return string.lowercase.index(varname) @@ -245,12 +247,40 @@ class VideoPage(BaseYoutubePage): assign = lambda v: v expr = stmt[len(u'return '):] else: - raise BrokenPageError( - u'Cannot determine left side of statement in %r' % stmt) + # Try interpreting it as an expression + expr = stmt + assign = lambda v: v v = interpret_expression(expr, local_vars, allow_recursion) return assign(v) + def extract_object(objname): + obj = {} + obj_m = re.search( + (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + + r'\s*(?P([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\})*)' + + r'\}\s*;', + code) + fields = obj_m.group('fields') + # Currently, it only supports function definitions + fields_m = re.finditer( + r'(?P[a-zA-Z$0-9]+)\s*:\s*function' + r'\((?P[a-z,]+)\){(?P[^}]+)}', + fields) + for f in fields_m: + argnames = f.group('args').split(',') + obj[f.group('key')] = build_function(argnames, f.group('code')) + + return obj + + def build_function(argnames, code): + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in code.split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + def interpret_expression(expr, local_vars, allow_recursion): if expr.isdigit(): return int(expr) @@ -258,48 +288,87 @@ class VideoPage(BaseYoutubePage): if expr.isalpha(): return local_vars[expr] - m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + try: + return json.loads(expr) + except ValueError: + pass + + m = re.match(r'^(?P[a-zA-Z0-9_]+)\.(?P[^(]+)(?:\(+(?P[^()]*)\))?$', expr) if m: + variable = m.group('var') member = m.group('member') - val = local_vars[m.group('in')] - if member == 'split("")': - return list(val) - if member == 'join("")': - return u''.join(val) - if member == 'length': - return len(val) - if member == 'reverse()': - return val[::-1] - slice_m = re.match(r'slice\((?P.*)\)', member) - if slice_m: - idx = interpret_expression( - slice_m.group('idx'), local_vars, allow_recursion-1) - return val[idx:] + arg_str = m.group('args') + + if variable in local_vars: + obj = local_vars[variable] + else: + if variable not in objects: + objects[variable] = extract_object(variable) + obj = objects[variable] + + if arg_str is None: + # Member access + if member == 'length': + return len(obj) + return obj[member] + + assert expr.endswith(')') + # Function call + if arg_str == '': + argvals = tuple() + else: + argvals = tuple([ + interpret_expression(v, local_vars, allow_recursion) + for v in arg_str.split(',')]) + + if member == 'split': + assert argvals == ('',) + return list(obj) + if member == 'join': + assert len(argvals) == 1 + return argvals[0].join(obj) + if member == 'reverse': + assert len(argvals) == 0 + obj.reverse() + return obj + if member == 'slice': + assert len(argvals) == 1 + return obj[argvals[0]:] + if member == 'splice': + assert isinstance(obj, list) + index, howMany = argvals + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + return res + + return obj[member](argvals) m = re.match( r'^(?P[a-z]+)\[(?P.+)\]$', expr) if m: val = local_vars[m.group('in')] - idx = interpret_expression(m.group('idx'), local_vars, - allow_recursion-1) + idx = interpret_expression( + m.group('idx'), local_vars, allow_recursion - 1) return val[idx] m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) if m: - a = interpret_expression(m.group('a'), - local_vars, allow_recursion) - b = interpret_expression(m.group('b'), - local_vars, allow_recursion) + a = interpret_expression( + m.group('a'), local_vars, allow_recursion) + b = interpret_expression( + m.group('b'), local_vars, allow_recursion) return a % b m = re.match( r'^(?P[a-zA-Z$]+)\((?P[a-z0-9,]+)\)$', expr) if m: fname = m.group('func') + argvals = tuple([ + int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')]) if fname not in functions: functions[fname] = extract_function(fname) - argvals = [int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')] return functions[fname](argvals) raise BrokenPageError(u'Unsupported JS expression %r' % expr)