Update part of the js interpreter

This commit is contained in:
Florent 2014-09-05 15:48:52 +02:00
commit a5eb1d789c

View file

@ -187,7 +187,7 @@ class VideoPage(BaseYoutubePage):
self._player_cache = {} self._player_cache = {}
def _extract_signature_function(self, video_id, player_url, slen): def _extract_signature_function(self, video_id, player_url, slen):
id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$', id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
player_url) player_url)
player_type = id_m.group('ext') player_type = id_m.group('ext')
player_id = id_m.group('id') player_id = id_m.group('id')
@ -214,6 +214,8 @@ class VideoPage(BaseYoutubePage):
u'Initial JS player signature function name') u'Initial JS player signature function name')
functions = {} functions = {}
objects = {}
code = jscode
def argidx(varname):
    """Return the zero-based position of *varname* in the ASCII alphabet.

    ``string.ascii_lowercase`` is used rather than ``string.lowercase``:
    the latter is locale-dependent on Python 2 and does not exist on
    Python 3, while the ASCII constant is available on both.

    Raises ValueError if *varname* does not occur in ``a``-``z``.
    """
    return string.ascii_lowercase.index(varname)
@ -245,12 +247,40 @@ class VideoPage(BaseYoutubePage):
assign = lambda v: v assign = lambda v: v
expr = stmt[len(u'return '):] expr = stmt[len(u'return '):]
else: else:
raise BrokenPageError( # Try interpreting it as an expression
u'Cannot determine left side of statement in %r' % stmt) expr = stmt
assign = lambda v: v
v = interpret_expression(expr, local_vars, allow_recursion) v = interpret_expression(expr, local_vars, allow_recursion)
return assign(v) return assign(v)
def extract_object(objname):
    """Build a Python dict mirroring the JS object literal named *objname*.

    The definition is searched for in the enclosing ``code`` string. Each
    ``key: function(args){body}`` field becomes an entry mapping ``key`` to
    a callable produced by ``build_function``.

    Raises BrokenPageError if no definition of *objname* is found.
    """
    obj_m = re.search(
        (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) +
        r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\})*)' +
        r'\}\s*;',
        code)
    if obj_m is None:
        # Without this guard a missing object surfaced as an opaque
        # AttributeError on ``None.group``.
        raise BrokenPageError(
            u'Cannot find definition of JS object %r' % objname)
    fields = obj_m.group('fields')

    # Currently, it only supports function definitions
    fields_m = re.finditer(
        r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
        r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
        fields)
    obj = {}
    for f in fields_m:
        argnames = f.group('args').split(',')
        obj[f.group('key')] = build_function(argnames, f.group('code'))
    return obj
def build_function(argnames, code):
    """Wrap a JS function body as a Python callable.

    The returned callable takes one sequence of argument values, binds
    them positionally to *argnames*, runs every ``;``-separated statement
    of *code* through ``interpret_statement`` against those bindings, and
    returns the value produced by the last statement.
    """
    def call_js(args):
        # Bind positional values to their declared parameter names.
        scope = {}
        for name, value in zip(argnames, args):
            scope[name] = value

        # ``split`` always yields at least one item, so ``outcome`` is
        # always overwritten by a statement result before being returned.
        outcome = None
        for statement in code.split(';'):
            outcome = interpret_statement(statement, scope)
        return outcome

    return call_js
def interpret_expression(expr, local_vars, allow_recursion):
    """Evaluate a single JS expression and return the resulting Python value.

    The forms below are tried in order; the first one that matches wins:

    * a decimal integer literal;
    * a purely alphabetic name, looked up in *local_vars*;
    * any JSON literal (for instance ``""`` or ``-2``);
    * ``var.member`` or ``var.method(args)``, where ``var`` is either a
      local variable or a JS object resolved with ``extract_object`` and
      cached in ``objects``;
    * ``var[index]``;
    * ``a % b``;
    * ``func(args)``, resolved with ``extract_function`` and cached in
      ``functions``.

    Raises BrokenPageError when the expression is not supported, or when a
    built-in method is called with arguments this interpreter cannot
    handle. Explicit checks are used instead of ``assert`` so they still
    run under ``python -O``.
    """
    def _require(condition, what):
        # Reject unsupported input with the file's own error type.
        if not condition:
            raise BrokenPageError(
                u'Unsupported %s in JS expression %r' % (what, expr))

    if expr.isdigit():
        return int(expr)

    if expr.isalpha():
        return local_vars[expr]

    try:
        return json.loads(expr)
    except ValueError:
        # Not a JSON literal; fall through to the structural patterns.
        pass

    m = re.match(
        r'^(?P<var>[a-zA-Z0-9_]+)\.(?P<member>[^(]+)'
        r'(?:\(+(?P<args>[^()]*)\))?$', expr)
    if m:
        variable = m.group('var')
        member = m.group('member')
        arg_str = m.group('args')

        if variable in local_vars:
            obj = local_vars[variable]
        else:
            # Not a local: treat it as a JS object and parse it lazily.
            if variable not in objects:
                objects[variable] = extract_object(variable)
            obj = objects[variable]

        if arg_str is None:
            # Member access
            if member == 'length':
                return len(obj)
            return obj[member]

        # Function call
        _require(expr.endswith(')'), u'call syntax')
        if arg_str == '':
            argvals = ()
        else:
            argvals = tuple(
                interpret_expression(v, local_vars, allow_recursion)
                for v in arg_str.split(','))

        if member == 'split':
            _require(argvals == ('',), u'split() arguments')
            return list(obj)
        if member == 'join':
            _require(len(argvals) == 1, u'join() arguments')
            return argvals[0].join(obj)
        if member == 'reverse':
            _require(len(argvals) == 0, u'reverse() arguments')
            # In-place, like the JS method; the same list is returned.
            obj.reverse()
            return obj
        if member == 'slice':
            _require(len(argvals) == 1, u'slice() arguments')
            return obj[argvals[0]:]
        if member == 'splice':
            _require(isinstance(obj, list) and len(argvals) == 2,
                     u'splice() call')
            index, how_many = argvals
            # JS semantics: a negative start counts from the end (clamped
            # at 0) and a negative count removes nothing.
            if index < 0:
                index = max(len(obj) + index, 0)
            stop = index + max(how_many, 0)
            removed = obj[index:stop]
            del obj[index:stop]
            return removed

        # Anything else is a method of an extracted JS object.
        return obj[member](argvals)

    m = re.match(
        r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
    if m:
        val = local_vars[m.group('in')]
        idx = interpret_expression(
            m.group('idx'), local_vars, allow_recursion - 1)
        return val[idx]

    m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
    if m:
        a = interpret_expression(
            m.group('a'), local_vars, allow_recursion)
        b = interpret_expression(
            m.group('b'), local_vars, allow_recursion)
        return a % b

    m = re.match(
        r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
    if m:
        fname = m.group('func')
        # Arguments are resolved before the callee is extracted.
        argvals = tuple(
            int(v) if v.isdigit() else local_vars[v]
            for v in m.group('args').split(','))
        if fname not in functions:
            functions[fname] = extract_function(fname)
        return functions[fname](argvals)

    raise BrokenPageError(u'Unsupported JS expression %r' % expr)