folder reorganization

This commit is contained in:
cttynul
2019-04-23 14:32:53 +02:00
parent 659751b2f4
commit 8e7ee78a87
1195 changed files with 267003 additions and 2 deletions
+1
View File
@@ -0,0 +1 @@
__author__ = 'Piotrek'
+308
View File
@@ -0,0 +1,308 @@
from string import ascii_lowercase, digits
##################################
# Name templates for the placeholder identifiers that remove_constants
# substitutes for literals; %d is filled with a running counter.
StringName = u'PyJsConstantString%d_'
NumberName = u'PyJsConstantNumber%d_'
RegExpName = u'PyJsConstantRegExp%d_'
##################################
# Character classes used by the scanners in this module.
ALPHAS = set(ascii_lowercase + ascii_lowercase.upper())
NUMS = set(digits)
# NOTE(review): digits are included here, so this is wider than a real
# JS identifier start; remove_constants uses it to find the end of regexp flags.
IDENTIFIER_START = ALPHAS.union(NUMS)
# Characters that do_escape passes through unchanged after a backslash.
ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}
OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}
HEX = set('0123456789abcdefABCDEF')
# NOTE(review): the star import may rebind some of the names defined above
# (it must at least provide IDENTIFIER_PART) - confirm against utils.
from utils import *
# In this module a '.' also counts as part of an identifier.
IDENTIFIER_PART = IDENTIFIER_PART.union({'.'})
def _is_cancelled(source, n):
cancelled = False
k = 0
while True:
k += 1
if source[n - k] != '\\':
break
cancelled = not cancelled
return cancelled
def _ensure_regexp(source, n): #<- this function has to be improved
'''returns True if regexp starts at n else returns False
checks whether it is not a division '''
markers = '(+~"\'=[%:?!*^|&-,;/\\'
k = 0
while True:
k += 1
if n - k < 0:
return True
char = source[n - k]
if char in markers:
return True
if char != ' ' and char != '\n':
break
return False
def parse_num(source, start, charset):
    """Return the first index >= ``start`` whose character is not in ``charset``.

    Returns ``start`` unchanged when it is already past the end of
    ``source``, and ``len(source)`` when every remaining character
    belongs to ``charset``.
    """
    limit = len(source)
    pos = start
    while pos < limit and source[pos] in charset:
        pos += 1
    return pos
def parse_exponent(source, start):
    """Return the index just past an optional exponent starting at ``start``.

    When ``source[start]`` is not 'e'/'E' there is no exponent and ``start``
    is returned, unless an identifier character follows the number.
    Otherwise an optional sign and at least one digit are consumed.

    Raises SyntaxError for a malformed literal. Reaching the end of
    ``source`` is treated as the end of the literal instead of raising
    IndexError.
    """
    limit = len(source)
    if start >= limit:
        return start
    if source[start] not in ('e', 'E'):
        if source[start] in IDENTIFIER_PART:
            raise SyntaxError('Invalid number literal!')
        return start
    pos = start + 1
    if pos < limit and source[pos] in ('-', '+'):
        pos += 1
    first_digit = pos
    # we need at least one digit after the exponent marker
    while pos < limit and source[pos] in NUMS:
        pos += 1
    if pos == first_digit or (pos < limit
                              and source[pos] in IDENTIFIER_PART):
        raise SyntaxError('Invalid number literal!')
    return pos
def remove_constants(source):
    '''Replace string, regexp and number literals with identifiers and
    *remove comments*.

    Placeholder identifiers are built from StringName, RegExpName and
    NumberName with a shared running counter, e.g. PyJsConstantString0_.

    Returns a tuple ``(new_source, constants)`` where ``constants`` maps each
    placeholder identifier to the literal text it replaced.

    Single line and multiline comments are dropped. Pseudo comments
    (inside strings) are not removed. For example this line:
        var x = "/*PSEUDO COMMENT*/ TEXT //ANOTHER PSEUDO COMMENT"
    is kept as a single string constant.

    Raises SyntaxError for unescaped line terminators inside string or
    regexp literals and for malformed number literals.
    '''
    # pad so that source[n - 1] and source[n + 1] are always valid below
    source = ' ' + source + '\n'
    # every entry is (start, end, type): 0 string, 1 comment, 2 regexp, 3 number
    comments = []
    inside_comment, single_comment = False, False
    inside_single, inside_double = False, False
    inside_regexp = False
    quiting_regexp = False  # True once the closing '/' was seen (flags follow)
    regexp_class_count = 0
    n = 0
    while n < len(source):
        char = source[n]
        if char == '"' and not (inside_comment or inside_single
                                or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_double:
                    inside_double[1] = n + 1
                    comments.append(inside_double)
                    inside_double = False
                else:
                    inside_double = [n, None, 0]
        elif char == "'" and not (inside_comment or inside_double
                                  or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_single:
                    inside_single[1] = n + 1
                    comments.append(inside_single)
                    inside_single = False
                else:
                    inside_single = [n, None, 0]
        elif (inside_single or inside_double):
            if char in LINE_TERMINATOR:
                if _is_cancelled(source, n):
                    # escaped line break: skip it (CR LF counts as one)
                    if char == CR and source[n + 1] == LF:
                        n += 1
                    n += 1
                    continue
                else:
                    raise SyntaxError(
                        'Invalid string literal. Line terminators must be escaped!'
                    )
        else:
            if inside_comment:
                if single_comment:
                    if char in LINE_TERMINATOR:
                        inside_comment[1] = n
                        comments.append(inside_comment)
                        inside_comment = False
                        single_comment = False
                else:  # Multiline
                    if char == '/' and source[n - 1] == '*':
                        inside_comment[1] = n + 1
                        comments.append(inside_comment)
                        inside_comment = False
            elif inside_regexp:
                if not quiting_regexp:
                    if char in LINE_TERMINATOR:
                        raise SyntaxError(
                            'Invalid regexp literal. Line terminators cant appear!'
                        )
                    if _is_cancelled(source, n):
                        n += 1
                        continue
                    if char == '[':
                        regexp_class_count += 1
                    elif char == ']':
                        regexp_class_count = max(regexp_class_count - 1, 0)
                    elif char == '/' and not regexp_class_count:
                        quiting_regexp = True
                else:
                    # consuming flags; the literal ends at the first other char
                    if char not in IDENTIFIER_START:
                        inside_regexp[1] = n
                        comments.append(inside_regexp)
                        inside_regexp = False
            elif char == '/' and source[n - 1] == '/':
                single_comment = True
                inside_comment = [n - 1, None, 1]
            elif char == '*' and source[n - 1] == '/':
                inside_comment = [n - 1, None, 1]
            elif char == '/' and source[n + 1] not in ('/', '*'):
                if not _ensure_regexp(source, n):  #<- improve this one
                    n += 1
                    continue  #Probably just a division
                quiting_regexp = False
                inside_regexp = [n, None, 2]
            elif not (inside_comment or inside_regexp):
                if (char in NUMS and
                        source[n - 1] not in IDENTIFIER_PART) or char == '.':
                    if char == '.':
                        k = parse_num(source, n + 1, NUMS)
                        if k == n + 1:  # just a plain dot, not a number
                            n += 1
                            continue
                        k = parse_exponent(source, k)
                    elif char == '0' and source[n + 1] in {'x', 'X'}:
                        # Hex number probably
                        k = parse_num(source, n + 2, HEX)
                        if k == n + 2 or source[k] in IDENTIFIER_PART:
                            raise SyntaxError('Invalid hex literal!')
                    else:  # int, float, optionally with exponent
                        k = parse_num(source, n + 1, NUMS)
                        if source[k] == '.':
                            k = parse_num(source, k + 1, NUMS)
                        k = parse_exponent(source, k)
                    comments.append((n, k, 3))
                    n = k
                    continue
        n += 1
    # second pass: stitch the source back together around the found spans
    pieces = []
    start = 0
    count = 0
    constants = {}
    for end, next_start, typ in comments:
        pieces.append(source[start:end])
        start = next_start
        if typ == 0:  # String
            name = StringName
        elif typ == 1:  # comment
            continue
        elif typ == 2:  # regexp
            name = RegExpName
        elif typ == 3:  # number
            name = NumberName
        else:
            raise RuntimeError()
        identifier = name % count
        pieces.append(' ' + identifier + ' ')
        constants[identifier] = source[end:next_start]
        count += 1
    pieces.append(source[start:])
    res = ''.join(pieces)
    # normalise white space and line terminators
    for e in WHITE:
        res = res.replace(e, ' ')
    res = res.replace(CR + LF, '\n')
    for e in LINE_TERMINATOR:
        res = res.replace(e, '\n')
    return res.strip(), constants
def recover_constants(py_source, replacements):
    '''Replace placeholder identifiers in ``py_source`` with PyJs constants.

    ``replacements`` maps placeholder identifier -> original JS literal text.
    Identifiers starting with PyJsConstantRegExp become ``JsRegExp(<repr>)``,
    those starting with PyJsConstantString become ``Js(u<literal>)`` and
    anything else (numbers) becomes ``Js(<literal>)``; e.g. a number
    placeholder whose value is ``5`` is rendered as ``Js(5)``.

    All placeholders are substituted in a single pass, so the work no
    longer grows with one full ``str.replace`` per constant and text inside
    an already inserted literal is never rewritten by a later replacement.
    '''
    import re  # local import keeps this block self-contained

    if not replacements:
        return py_source

    def render(match):
        identifier = match.group(0)
        value = replacements[identifier]
        if identifier.startswith('PyJsConstantRegExp'):
            return 'JsRegExp(%s)' % repr(value)
        if identifier.startswith('PyJsConstantString'):
            return 'Js(u%s)' % unify_string_literals(value)
        return 'Js(%s)' % value

    # longest first so that a key can never shadow a longer key it prefixes
    pattern = re.compile('|'.join(
        re.escape(identifier)
        for identifier in sorted(replacements, key=len, reverse=True)))
    return pattern.sub(render, py_source)
def unify_string_literals(js_string):
    r"""Rewrite the escapes of a JS string literal so Python reads it the same way.

    For example the literal '\d' in JavaScript is interpreted as 'd'
    (the backslash is ignored) while Python would keep the backslash.
    Every backslash sequence is delegated to do_escape; all other
    characters are copied unchanged.
    """
    pieces = []
    pos = 0
    limit = len(js_string)
    while pos < limit:
        char = js_string[pos]
        if char == '\\':
            escaped, pos = do_escape(js_string, pos)
            pieces.append(escaped)
        else:
            pieces.append(char)
            pos += 1
    return ''.join(pieces)
def unify_regexp_literals(js):
    """Placeholder for regexp literal normalisation; not implemented.

    Currently does nothing and returns None.
    """
    pass
def do_escape(source, n):
    """Translate the JS escape sequence starting at the backslash ``source[n]``.

    Returns ``(text, next_pos)``: the Python-source text equivalent to the
    escape and the index of the first character after it.
    Raises SyntaxError for a truncated \\x or \\u escape.

    Reference used by the original author:
    http://www.javascriptkit.com/jsref/escapesequence.shtml
    """
    if not n + 1 < len(source):
        # Lone trailing backslash: not possible for quoted literals but can
        # happen in general. Return a pair like every other branch so that
        # callers can always unpack the result.
        return '', n + 1
    nxt = source[n + 1]
    if nxt in LINE_TERMINATOR:
        # line continuation; CR LF is consumed as a single terminator
        if nxt == CR and n + 2 < len(source) and source[n + 2] == LF:
            return source[n:n + 3], n + 3
        return source[n:n + 2], n + 2
    if nxt in ESCAPE_CHARS:
        return source[n:n + 2], n + 2
    if nxt in {'x', 'u'}:
        length = 4 if nxt == 'u' else 2
        n += 2
        end = parse_num(source, n, HEX)
        if end - n < length:
            raise SyntaxError('Invalid escape sequence!')
        return source[n - 2:n + length], n + length
    if nxt in OCTAL:
        n += 1
        end = parse_num(source, n, OCTAL)
        end = min(end, n + 3)  # cant be longer than 3
        # the max allowed is 377 (in octal), i.e. 255 in decimal
        max_num = 255
        num = 0
        len_parsed = 0
        for e in source[n:end]:
            cand = 8 * num + int(e)
            if cand > max_num:
                break
            num = cand
            len_parsed += 1
        # Return a fixed-width hex escape because python may want to parse
        # more digits: '\777' is one escape for python while js uses only
        # \77. Two digits are required; '\x5' is not a valid python escape.
        return '\\x%02x' % num, n + len_parsed
    # any other escaped character stands for itself
    return source[n + 1], n + 2
#####TEST######
if __name__ == '__main__':
    # Manual smoke test: run remove_constants on a (blank) sample.
    test = ('''
''')
    t, d = remove_constants(test)
    print('%s %s' % (t, d))
+83
View File
@@ -0,0 +1,83 @@
"""
exp_translate routine:
It takes a single line of JS code and returns a SINGLE line of Python code.
Note var is not present here because it was removed in previous stages. Also remove this useless void keyword
In case of parsing errors it must return the position of the error.
1. Convert all assignment operations to put operations, this may be hard :( DONE, wasn't that bad
2. Convert all gets and calls to get and callprop.
3. Convert unary operators like typeof, new, !, delete, ++, --
Delete can be handled by replacing last get method with delete.
4. Convert remaining operators that are not handled by python:
&&, || <= these should be easy simply replace && by and and || by or
=== and !==
comma operator , in, instanceof and finally :?
NOTES:
Strings and other literals are not present so each = means assignment
"""
from utils import *
from jsparser import *
def exps_translator(js):
    """Unfinished entry point of the expression translator.

    Only step 1 (assignment translation) is performed.
    NOTE(review): the translated text is discarded and None is returned;
    this looks like work in progress - confirm before relying on it.
    """
    #Check () {} and [] nums
    ass = assignment_translator(js)
# Step 1
def assignment_translator(js):
    """Translate the assignments in a comma separated list of expressions.

    ``js`` is split on every ',' and each part containing '=' is rewritten
    by bass_translator; other parts are kept as they are. The parts are
    joined with ',' again.
    """
    parts = js.split(',')
    return ','.join(
        bass_translator(part) if '=' in part else part for part in parts)
def bass_translator(s):
    """Translate a (possibly chained) JS assignment into ``put`` calls.

    Bracketed sub-expressions in () and [] are translated recursively first.
    The text is then split on '=' and folded from right to left: plain names
    become ``var.put('name', value)``, property targets become
    ``obj.put(prop, value)``. A single operator character before '=' that is
    a key of OP_METHODS (as in ``+=``) is passed as an extra argument.

    Raises SyntaxError for an invalid or empty left-hand side.
    """
    if '(' in s or '[' in s:
        converted = ''
        for e in bracket_split(s, ['()', '[]'], strip=False):
            if e[0] == '(':
                converted += '(' + bass_translator(e[1:-1]) + ')'
            elif e[0] == '[':
                converted += '[' + bass_translator(e[1:-1]) + ']'
            else:
                converted += e
        s = converted
    if '=' not in s:
        return s
    ass = reversed(s.split('='))
    res = next(ass)  # the right-most part is the assigned value
    for e in ass:
        if not e:
            # e.g. '=5' or 'a==b'; previously an IndexError on e[-1]
            raise SyntaxError('Invalid left-hand side in assignment')
        op = ''
        if e[-1] in OP_METHODS:  #increment assign like +=
            op = ', "' + e[-1] + '"'
            e = e[:-1]
        # (a) = 40 is valid so we need to transform '(a) ' to 'a'
        cand = e.strip('() ')
        if not is_property_accessor(cand):  # it is not a property assignment
            if not is_lval(cand) or is_internal(cand):
                raise SyntaxError('Invalid left-hand side in assignment')
            res = 'var.put(%s, %s%s)' % (repr(cand), res, op)
        elif cand[-1] == ']':  # property assignment via []
            c = list(bracket_split(cand, ['[]'], strip=False))
            meth = ''.join(c[:-1]).strip()
            # the key does not have to be a string so do not remove ()
            # because it can be a call
            prop = c[-1][1:-1].strip()
            res = '%s.put(%s, %s%s)' % (meth, prop, res, op)
        else:  # Prop set via '.'
            c = cand.rfind('.')
            meth, prop = cand[:c].strip(), cand[c + 1:].strip('() ')
            if not is_lval(prop):
                raise SyntaxError('Invalid left-hand side in assignment')
            res = '%s.put(%s, %s%s)' % (meth, repr(prop), res, op)
    return res
if __name__ == '__main__':
    # Manual smoke test of the assignment translator.
    print(bass_translator('3.ddsd = 40'))
+480
View File
@@ -0,0 +1,480 @@
"""This module translates JS flow into PY flow.
Translates:
IF ELSE
DO WHILE
WHILE
FOR 123
FOR iter
CONTINUE, BREAK, RETURN, LABEL, THROW, TRY, SWITCH
"""
from utils import *
from jsparser import *
from nodevisitor import exp_translator
import random
# Names of variables declared with `var`; filled by do_var/do_for and
# reset by translate_flow.
TO_REGISTER = []
# Templates for the exception class names used to emulate labelled
# continue/break; %s receives the hex-encoded label.
CONTINUE_LABEL = 'JS_CONTINUE_LABEL_%s'
BREAK_LABEL = 'JS_BREAK_LABEL_%s'
# Python snippets wrapped around a translated catch block. HOLDER, NAME and
# BLOCK are placeholders substituted textually in do_try.
PREPARE = '''HOLDER = var.own.get(NAME)\nvar.force_own_put(NAME, PyExceptionToJs(PyJsTempException))\n'''
RESTORE = '''if HOLDER is not None:\n var.own[NAME] = HOLDER\nelse:\n del var.own[NAME]\ndel HOLDER\n'''
TRY_CATCH = '''%stry:\nBLOCKfinally:\n%s''' % (PREPARE, indent(RESTORE))
def get_continue_label(label):
    """Return the exception class name that emulates ``continue <label>``.

    The label is hex-encoded so that any label text yields a valid Python
    identifier. binascii is used because ``str.encode('hex')`` only exists
    on Python 2.
    """
    import binascii
    encoded = binascii.hexlify(label.encode('utf-8')).decode('ascii')
    return CONTINUE_LABEL % str(encoded)
def get_break_label(label):
    """Return the exception class name that emulates ``break <label>``.

    The label is hex-encoded so that any label text yields a valid Python
    identifier. binascii is used because ``str.encode('hex')`` only exists
    on Python 2.
    """
    import binascii
    encoded = binascii.hexlify(label.encode('utf-8')).decode('ascii')
    return BREAK_LABEL % str(encoded)
def pass_until(source, start, tokens=(';', )):
    """Return the index just past the first character from ``tokens``.

    Scans ``source`` from ``start``. When no token is found the result is
    ``len(source) + 1`` (one past the end, plus one), which callers rely on.
    """
    pos = start
    limit = len(source)
    while pos < limit and source[pos] not in tokens:
        pos += 1
    return pos + 1
def do_bracket_exp(source, start, throw=True):
    """Translate the parenthesised expression found at ``start``.

    Returns ``(translated, next_pos)``. With ``throw`` set, a missing
    bracket or an empty translation raises SyntaxError. Without it, a
    missing bracket gives ``(None, start)`` (this used to fail with a
    TypeError on ``None[1:-1]``) and an empty translation also leaves the
    position at ``start``.
    """
    bra, cand = pass_bracket(source, start, '()')
    if not bra:
        if throw:
            raise SyntaxError('Missing bracket expression')
        return bra, start
    bra = exp_translator(bra[1:-1])
    if throw and not bra:
        raise SyntaxError('Empty bracket condition')
    return bra, (cand if bra else start)
def do_if(source, start):
    """Translate an ``if`` statement (with optional ``else``) at ``start``.

    Returns ``(python_code, next_pos)``. An ``else if`` chain becomes
    ``elif`` by prefixing the translated nested ``if`` with 'el'.
    Raises SyntaxError when the condition or a branch statement is missing.
    """
    start += 2  # pass this if
    bra, start = do_bracket_exp(source, start, throw=True)
    statement, start = do_statement(source, start)
    if statement is None:
        raise SyntaxError('Invalid if statement')
    translated = 'if %s:\n' % bra + indent(statement)
    elseif = except_keyword(source, start, 'else')
    is_elseif = False
    if elseif:
        start = elseif
        if except_keyword(source, start, 'if'):
            is_elseif = True
        elseif, start = do_statement(source, start)
        if elseif is None:
            raise SyntaxError('Invalid if statement)')
        if is_elseif:
            translated += 'el' + elseif
        else:
            translated += 'else:\n' + indent(elseif)
    return translated, start
def do_statement(source, start):
    """Translate the single statement that begins at or after ``start``.

    Leading white space is skipped. Returns ``(python_code, next_pos)``, or
    ``(None, pos)`` when the input is exhausted or a ``case``/``default``
    keyword is reached. Other ``do_`` functions raise instead of returning
    None.
    """
    start = pass_white(source, start)
    # start is the first position that is not a white space or \n
    if not start < len(source):  #if finished parsing return None
        return None, start
    if any(startswith_keyword(source[start:], e) for e in {'case', 'default'}):
        return None, start
    rest = source[start:]
    # check for statements that are uniquely defined by their keywords
    for key, meth in KEYWORD_METHODS.items():
        if rest.startswith(key):
            # the character after the keyword must be either EOF or not
            # in IDENTIFIER_PART
            if len(key) == len(rest) or rest[len(key)] not in IDENTIFIER_PART:
                return meth(source, start)
    if rest[0] == '{':  #Block
        return do_block(source, start)
    # Now only label and expression left
    cand = parse_identifier(source, start, False)
    if cand is not None:  # it can mean that its a label
        label, cand_start = cand
        cand_start = pass_white(source, cand_start)
        # bounds check: an identifier may be the last thing in the source
        if cand_start < len(source) and source[cand_start] == ':':
            return do_label(source, start)
    return do_expression(source, start)
def do_while(source, start):
    """Translate a ``while`` loop at ``start``; returns ``(code, next_pos)``.

    Raises SyntaxError when the loop body is missing.
    """
    start += 5  # pass while
    bra, start = do_bracket_exp(source, start, throw=True)
    statement, start = do_statement(source, start)
    if statement is None:
        raise SyntaxError('Missing statement to execute in while loop!')
    return 'while %s:\n' % bra + indent(statement), start
def do_dowhile(source, start):
    """Translate a ``do ... while (cond)`` loop at ``start``.

    Emitted as ``while 1:`` with the body followed by
    ``if not cond: break``. Returns ``(code, next_pos)``.
    Raises SyntaxError when the body or the ``while`` keyword is missing.
    """
    start += 2  # pass do
    statement, start = do_statement(source, start)
    if statement is None:
        raise SyntaxError('Missing statement to execute in do while loop!')
    start = except_keyword(source, start, 'while')
    if not start:
        raise SyntaxError('Missing while keyword in do-while loop')
    bra, start = do_bracket_exp(source, start, throw=True)
    statement += 'if not %s:\n' % bra + indent('break\n')
    return 'while 1:\n' + indent(statement), start
def do_block(source, start):
    """Translate a ``{...}`` block at ``start``; returns ``(code, next_pos)``.

    The statements inside the braces are translated one after another and
    concatenated. Raises SyntaxError when there is no block or when part of
    it could not be parsed.
    """
    bra, start = pass_bracket(source, start, '{}')
    if bra is None:
        raise SyntaxError('Missing block ( {code} )')
    code = ''
    # the trailing ';' terminates a last statement that has none
    bra = bra[1:-1] + ';'
    bra_pos = 0
    while bra_pos < len(bra):
        st, bra_pos = do_statement(bra, bra_pos)
        if st is None:
            break
        code += st
    bra_pos = pass_white(bra, bra_pos)
    if bra_pos < len(bra):
        raise SyntaxError('Block has more code that could not be parsed:\n' +
                          bra[bra_pos:])
    return code, start
def do_empty(source, start):
    """Translate an empty statement: emit ``pass`` and skip one character."""
    return 'pass\n', start + 1
def do_expression(source, start):
    """Translate an expression statement; returns ``(code, next_pos)``.

    The statement normally runs up to the next ';'. To emulate automatic
    semicolon insertion, when the candidate text fails to translate or
    compile the end is moved back to an earlier line terminator and the
    translation is retried. When no earlier terminator is left, the last
    error is re-raised.
    """
    start = pass_white(source, start)
    end = pass_until(source, start, tokens=(';', ))
    if end == start + 1:  #empty statement
        return 'pass\n', end
    # AUTOMATIC SEMICOLON INSERTION FOLLOWS
    # we will try to parse as much as possible, inserting ; in place of
    # the last new line in case of error
    rev = False  # reversed candidate text, built lazily on first failure
    rpos = 0
    while True:
        try:
            code = source[start:end].rstrip(';')
            cand = exp_translator(code) + '\n', end
            compile(cand[0], '', 'exec')  # only checks that it is valid python
            return cand
        except Exception:
            # The retry lives inside the handler so that the bare `raise`
            # below always has an active exception to re-raise.
            if not rev:
                rev = source[start:end][::-1]
            lpos = rpos
            while True:
                rpos = pass_until(rev, rpos, LINE_TERMINATOR)
                if rpos >= len(rev):
                    raise
                # any() instead of filter(): a filter object is always
                # truthy on Python 3
                if any(x not in SPACE for x in rev[lpos:rpos]):
                    break
            end = start + len(rev) - rpos + 1
def do_var(source, start):
    """Translate a ``var`` statement at ``start``; returns ``(code, next_pos)``.

    Every declared name is appended to TO_REGISTER. Declarations with an
    initializer are translated as expressions; when there is none the code
    is ``pass``. Raises SyntaxError when a name is followed by something
    other than '='.
    """
    #todo auto ; insertion
    start += 3  #pass var
    end = pass_until(source, start, tokens=(';', ))
    # defs is the list of defined vars with optional initializer
    defs = argsplit(source[start:end - 1])
    code = ''
    for de in defs:
        var, var_end = parse_identifier(de, 0, True)
        TO_REGISTER.append(var)
        var_end = pass_white(de, var_end)
        if var_end < len(de):
            # we have something more to parse... It has to start with =
            if de[var_end] != '=':
                raise SyntaxError(
                    'Unexpected initializer in var statement. Expected "=", got "%s"'
                    % de[var_end])
            code += exp_translator(de) + '\n'
    if not code.strip():
        code = 'pass\n'
    return code, end
def do_label(source, start):
    """Translate a labelled statement at ``start``; returns ``(code, next_pos)``.

    Labelled break/continue are emulated with exceptions: the statement is
    wrapped in try/except for the break label class, and loop bodies are
    additionally wrapped for the continue label class. The exception classes
    are defined in front of the statement.
    Raises SyntaxError when no statement follows the label.
    """
    label, end = parse_identifier(source, start)
    end = pass_white(source, end)
    #now source[end] must be :
    assert source[end] == ':'
    end += 1
    inside, end = do_statement(source, end)
    if inside is None:
        raise SyntaxError('Missing statement after label')
    defs = ''
    if inside.startswith('while ') or inside.startswith(
            'for ') or inside.startswith('#for'):
        # we have to add continue label as well...
        # 3 or 1 since #for loop type has more lines before real for.
        sep = 1 if not inside.startswith('#for') else 3
        cont_label = get_continue_label(label)
        temp = inside.split('\n')
        injected = 'try:\n' + '\n'.join(temp[sep:])
        injected += 'except %s:\n pass\n' % cont_label
        inside = '\n'.join(temp[:sep]) + '\n' + indent(injected)
        defs += 'class %s(Exception): pass\n' % cont_label
    break_label = get_break_label(label)
    inside = 'try:\n%sexcept %s:\n pass\n' % (indent(inside), break_label)
    defs += 'class %s(Exception): pass\n' % break_label
    return defs + inside, end
def do_for(source, start):
    """Translate a ``for`` loop at ``start``; returns ``(code, next_pos)``.

    ``for (init; cond; step)`` becomes a '#for JS loop' comment line, the
    init code and a ``while`` loop whose body ends with the step code.
    ``for (x in y)`` becomes a Python ``for`` that stores each item with
    ``var.put``. Names declared with ``var`` are added to TO_REGISTER.
    Raises SyntaxError for a missing header/body or a malformed header.
    """
    start += 3  # pass for
    bra, start = pass_bracket(source, start, '()')
    if bra is None:
        # previously crashed with a TypeError further down
        raise SyntaxError('Invalid for statement')
    inside, start = do_statement(source, start)
    if inside is None:
        raise SyntaxError('Missing statement after for')
    bra = bra[1:-1]
    if ';' in bra:
        init = argsplit(bra, ';')
        if len(init) != 3:
            raise SyntaxError('Invalid for statement')
        args = []
        for i, item in enumerate(init):
            end = pass_white(item, 0)
            if end == len(item):
                # empty clause; an empty condition means "always true"
                args.append('' if i != 1 else '1')
                continue
            if not i and except_keyword(item, end, 'var') is not None:
                # var statement
                args.append(do_var(item, end)[0])
                continue
            args.append(do_expression(item, end)[0])
        return '#for JS loop\n%swhile %s:\n%s%s\n' % (
            args[0], args[1].strip(), indent(inside), indent(args[2])), start
    # iteration
    end = pass_white(bra, 0)
    register = False
    if bra[end:].startswith('var '):
        end += 3
        end = pass_white(bra, end)
        register = True
    name, end = parse_identifier(bra, end)
    if register:
        TO_REGISTER.append(name)
    end = pass_white(bra, end)
    # bounds check: nothing may follow the `in` keyword at all
    if bra[end:end + 2] != 'in' or (end + 2 < len(bra)
                                    and bra[end + 2] in IDENTIFIER_PART):
        raise SyntaxError('Invalid "for x in y" statement')
    end += 2  # pass in
    exp = exp_translator(bra[end:])
    res = 'for temp in %s:\n' % exp
    res += indent('var.put(%s, temp)\n' % repr(name)) + indent(inside)
    return res, start
# todo - IMPORTANT
def do_continue(source, start, name='continue'):
    """Translate ``continue``/``break`` (chosen by ``name``) at ``start``.

    Without a label the keyword is emitted unchanged. With a label the
    matching label exception is raised instead. Returns ``(code, next_pos)``.
    Raises SyntaxError when the label is not followed by ';'.
    """
    start += len(name)  #pass continue
    start = pass_white(source, start)
    if start < len(source) and source[start] == ';':
        return '%s\n' % name, start + 1
    # labeled statement or error
    label, start = parse_identifier(source, start)
    start = pass_white(source, start)
    if start < len(source) and source[start] != ';':
        raise SyntaxError('Missing ; after label name in %s statement' % name)
    if name == 'continue':
        exc_name = get_continue_label(label)
    else:
        exc_name = get_break_label(label)
    return 'raise %s("%s")\n' % (exc_name, name), start + 1
def do_break(source, start):
    """Translate a ``break`` statement; shares its logic with do_continue."""
    return do_continue(source, start, 'break')
def do_return(source, start):
    """Translate a ``return`` statement at ``start``; returns ``(code, next_pos)``.

    A bare ``return`` yields ``var.get('undefined')``. When no ';' follows,
    the statement extends to the end of ``source``. The old test
    ``end == -1`` could never be true after adding 1 to ``find``'s result,
    which left the end position at 0.
    """
    start += 6  # pass return
    end = source.find(';', start) + 1
    if not end:
        end = len(source)
    trans = exp_translator(source[start:end].rstrip(';'))
    return 'return %s\n' % (trans if trans else "var.get('undefined')"), end
# todo later?- Also important
def do_throw(source, start):
    """Translate a ``throw`` statement at ``start``; returns ``(code, next_pos)``.

    The thrown value is converted with JsToPyException, stored in
    PyJsTempException and raised. Raises SyntaxError when nothing is thrown.
    """
    start += 5  # pass throw
    end = source.find(';', start) + 1
    if not end:  # no ';' found: take everything up to the end
        end = len(source)
    trans = exp_translator(source[start:end].rstrip(';'))
    if not trans:
        raise SyntaxError('Invalid throw statement: nothing to throw')
    res = 'PyJsTempException = JsToPyException(%s)\nraise PyJsTempException\n' % trans
    return res, end
def do_try(source, start):
    """Translate ``try``/``catch``/``finally`` at ``start``; returns ``(code, next_pos)``.

    The catch identifier is bound through the TRY_CATCH template, which
    saves the previous value of that name in a uniquely named holder
    variable and restores it afterwards.
    Raises SyntaxError when neither catch nor finally follows, or when the
    catch header is missing or malformed.
    """
    import binascii  # str.encode('hex') is Python 2 only

    start += 3  # pass try
    block, start = do_block(source, start)
    result = 'try:\n%s' % indent(block)
    catch = except_keyword(source, start, 'catch')
    if catch:
        bra, catch = pass_bracket(source, catch, '()')
        if bra is None:
            raise SyntaxError('Invalid content of catch statement')
        bra = bra[1:-1]
        identifier, bra_end = parse_identifier(bra, 0)
        hex_name = str(
            binascii.hexlify(identifier.encode('utf-8')).decode('ascii'))
        # integer bound: randrange rejects floats on current Python 3
        holder = 'PyJsHolder_%s_%d' % (hex_name, random.randrange(10 ** 8))
        identifier = repr(identifier)
        bra_end = pass_white(bra, bra_end)
        if bra_end < len(bra):
            raise SyntaxError('Invalid content of catch statement')
        result += 'except PyJsException as PyJsTempException:\n'
        block, catch = do_block(source, catch)
        # fill in except ( catch ) block and remember to recover holder
        # variable to its previous state
        result += indent(
            TRY_CATCH.replace('HOLDER', holder).replace(
                'NAME', identifier).replace('BLOCK', indent(block)))
        # only meaningful when a catch exists; max(None, int) fails on py3
        start = max(catch, start)
    final = except_keyword(source, start, 'finally')
    if not (final or catch):
        raise SyntaxError(
            'Try statement has to be followed by catch or finally')
    if not final:
        return result, start
    # translate finally statement
    block, start = do_block(source, final)
    return result + 'finally:\n%s' % indent(block), start
def do_debugger(source, start):
    """Translate ``debugger`` into ``pass``; an optional ';' is consumed."""
    start += 8  # pass debugger
    end = pass_white(source, start)
    if end < len(source) and source[end] == ';':
        end += 1
    return 'pass\n', end  #ignore errors...
# todo automatic ; insertion. fuck this crappy feature
# Least important
def do_switch(source, start):
    """Translate a ``switch`` statement at ``start``; returns ``(code, next_pos)``.

    The result is a ``while 1:`` loop that runs once. CONDITION holds the
    switch value and SWITCHED becomes True after the first matching case so
    that later cases fall through. ``default`` is emitted as ``if True:``.
    Raises SyntaxError for a missing () or {} part or a case without ':'.
    """
    start += 6  # pass switch
    code = 'while 1:\n' + indent('SWITCHED = False\nCONDITION = (%s)\n')
    # parse value of check
    val, start = pass_bracket(source, start, '()')
    if val is None:
        raise SyntaxError('Missing () after switch statement')
    if not val.strip():
        raise SyntaxError('Missing content inside () after switch statement')
    code = code % exp_translator(val)
    bra, start = pass_bracket(source, start, '{}')
    if bra is None:
        raise SyntaxError('Missing block {} after switch statement')
    bra_pos = 0
    bra = bra[1:-1] + ';'
    while True:
        case = except_keyword(bra, bra_pos, 'case')
        default = except_keyword(bra, bra_pos, 'default')
        assert not (case and default)
        if not (case or default):
            break
        # this ?: expression makes things much harder....
        case_code = None
        if case:
            case_code = 'if SWITCHED or PyJsStrictEq(CONDITION, %s):\n'
            # we are looking for a first : with count 1.
            # ? gives -1 and : gives +1.
            count = 0
            for pos, e in enumerate(bra[case:], case):
                if e == '?':
                    count -= 1
                elif e == ':':
                    count += 1
                    if count == 1:
                        break
            else:
                raise SyntaxError(
                    'Missing : token after case in switch statement')
            # switch {case CONDITION: statements}
            case_condition = exp_translator(bra[case:pos])
            case_code = case_code % case_condition
            case = pos + 1
        if default:
            case = except_token(bra, default, ':')
            case_code = 'if True:\n'
        # now parse case statements (things after ':' )
        cand, case = do_statement(bra, case)
        while cand:
            case_code += indent(cand)
            cand, case = do_statement(bra, case)
        case_code += indent('SWITCHED = True\n')
        code += indent(case_code)
        bra_pos = case
    # prevent infinite loop :)
    code += indent('break\n')
    return code, start
def do_pyimport(source, start):
    """Translate the ``pyimport <name>`` extension statement at ``start``.

    Emits ``import <name> as PyImport_<name>`` followed by a
    ``var.pyimport`` call. Returns ``(code, next_pos)``.
    Raises SyntaxError when the import line does not compile.
    """
    start += 8  # pass pyimport
    lib, start = parse_identifier(source, start)
    jlib = 'PyImport_%s' % lib
    code = 'import %s as %s\n' % (lib, jlib)
    #check whether valid lib name...
    try:
        compile(code, '', 'exec')
    except Exception:  # was a bare except, which also caught KeyboardInterrupt
        raise SyntaxError(
            'Invalid Python module name (%s) in pyimport statement' % lib)
    # var.pyimport will handle module conversion to PyJs object
    code += 'var.pyimport(%s, %s)\n' % (repr(lib), jlib)
    return code, start
def do_with(source, start):
    """``with`` is not supported: always raises NotImplementedError."""
    raise NotImplementedError('With statement is not implemented yet :(')
# Dispatch table used by do_statement: maps the keyword that starts a
# statement to the function translating it. Each function takes
# (source, start) and returns (python_code, next_pos).
KEYWORD_METHODS = {
'do': do_dowhile,
'while': do_while,
'if': do_if,
'throw': do_throw,
'return': do_return,
'continue': do_continue,
'break': do_break,
'try': do_try,
'for': do_for,
'switch': do_switch,
'var': do_var,
'debugger': do_debugger, # this one does not do anything
'with': do_with,
'pyimport': do_pyimport
}
#Also not specific statements (harder to detect)
# Block {}
# Expression or Empty Statement
# Label
#
# Its easy to recognize block but harder to distinguish between label and expression statement
def translate_flow(source):
    """Translate JS control flow in ``source`` to Python.

    Source cant have arrays, object, constant or function literals.
    Returns ``(python_source, variables_to_register)``. The module level
    TO_REGISTER list is reset on every call and returned as the second item.
    """
    global TO_REGISTER
    TO_REGISTER = []
    return do_block('{%s}' % source, 0)[0], TO_REGISTER
if __name__ == '__main__':
    # Manual smoke tests.
    #print do_dowhile('do {} while(k+f)', 0)[0]
    #print 'e: "%s"'%do_expression('++(c?g:h); mj', 0)[0]
    print(translate_flow('a; yimport test')[0])
+98
View File
@@ -0,0 +1,98 @@
"""This module removes JS functions from source code"""
from jsparser import *
from utils import *
# Template for the placeholder that replaces a function expression;
# %d is filled with INLINE_COUNT.
INLINE_NAME = 'PyJsLvalInline%d_'
# Running counter of replaced function expressions (see reset_inline_count).
INLINE_COUNT = 0
# Keywords after which a named function is an expression, not a statement.
PRE_EXP_STARTS = {
'return', 'new', 'void', 'throw', 'typeof', 'in', 'instanceof'
}
# Characters that may directly precede a function *statement*.
PRE_ALLOWED = IDENTIFIER_PART.union({';', '{', '}', ']', ')', ':'})
INCREMENTS = {'++', '--'}
def reset_inline_count():
    """Reset the module level counter used to name inline functions to 0."""
    global INLINE_COUNT
    INLINE_COUNT = 0
def remove_functions(source, all_inline=False):
    """Remove function definitions from ``source``.

    Returns ``(new_source, hoisted, inline)``:
      * ``hoisted`` maps the name of each function statement to
        ``(block, args)``; the statement is removed from the source.
      * ``inline`` maps ``'<placeholder>@<name>'`` to ``(block, args)`` for
        every function expression; the expression is replaced by the
        placeholder built from INLINE_NAME. ``name`` is empty for anonymous
        functions.
    With ``all_inline`` every function is treated as an expression.
    Raises SyntaxError for a missing argument list or body and for
    duplicate argument names.
    """
    global INLINE_COUNT
    inline = {}
    hoisted = {}
    n = 0
    limit = len(source) - 9  # 8 is length of 'function'
    res = ''
    last = 0
    while n < limit:
        if n and source[n - 1] in IDENTIFIER_PART:
            # in the middle of an identifier, cannot be the keyword
            n += 1
            continue
        if source[n:n + 8] == 'function' and source[n +
                                                    8] not in IDENTIFIER_PART:
            if source[:n].rstrip().endswith('.'):
                # allow function as a property name :)
                n += 1
                continue
            if source[n + 8:].lstrip().startswith(':'):
                # allow functions inside objects...
                n += 1
                continue
            entered = n
            res += source[last:n]
            name = ''
            n = pass_white(source, n + 8)
            # bounds check: the source may end right after the keyword
            if n < len(source) and source[n] in IDENTIFIER_START:
                # hoisted function
                name, n = parse_identifier(source, n)
            args, n = pass_bracket(source, n, '()')
            if not args:
                raise SyntaxError('Function misses bracket with argnames ()')
            args = args.strip('() \n')
            args = tuple(parse_identifier(e, 0)[0]
                         for e in argsplit(args)) if args else ()
            if len(args) - len(set(args)):
                # I know its legal in JS but python does not allow
                # duplicate argnames. I will not work around it
                raise SyntaxError(
                    'Function has duplicate argument names. Its not legal in this implementation. Sorry.'
                )
            block, n = pass_bracket(source, n, '{}')
            if not block:
                raise SyntaxError(
                    'Function does not have any code block to execute')
            mixed = False  # named function expression flag
            if name and not all_inline:
                # Here I will distinguish between named function expression
                # (mixed) and a function statement
                before = source[:entered].rstrip()
                if any(endswith_keyword(before, e) for e in PRE_EXP_STARTS):
                    mixed = True
                elif before and before[-1] not in PRE_ALLOWED and not before[
                        -2:] in INCREMENTS:
                    mixed = True
                else:
                    # its a function statement.
                    # todo remove label if present!
                    hoisted[name] = block, args
            if not name or mixed or all_inline:
                # its a function expression (can be both named and not named)
                INLINE_COUNT += 1
                iname = INLINE_NAME % INLINE_COUNT  # inline name
                res += ' ' + iname
                # the real name is kept after '@' because it has to be added
                # to the func scope
                inline['%s@%s' % (iname, name)] = block, args
            last = n
        else:
            n += 1
    res += source[last:]
    return res, hoisted, inline
if __name__ == '__main__':
    # Manual smoke test.
    print(remove_functions(
        '5+5 function n (functiona ,functionaj) {dsd s, dsdd}'))
+326
View File
@@ -0,0 +1,326 @@
"""
The process of translating JS will go like that: # TOP = 'imports and scope set'
1. Remove all the comments
2. Replace number, string and regexp literals with markers
4. Remove global Functions and move their translation to the TOP. Also add register code there.
5. Replace inline functions with lvals
6. Remove List and Object literals and replace them with lvals
7. Find and remove var declarations, generate python register code that would go on TOP.
Here we should be left with global code only where 1 line of js code = 1 line of python code.
Routine translating this code should be called glob_translate:
1. Search for outer structures and translate them using glob and inside using exps_translate
exps_translate routine:
1. Remove outer {}
2. Split lines at ;
3. Convert line by line using exp_translate
4. In case of error in 3 try to insert ; according to ECMA rules and repeat 3.
exp_translate routine:
It takes a single line of JS code and returns a SINGLE line of Python code.
Note var is not present here because it was removed in previous stages.
In case of parsing errors it must return the position of the error.
1. Convert all assignment operations to put operations, this may be hard :(
2. Convert all gets and calls to get and callprop.
3. Convert unary operators like typeof, new, !, delete.
Delete can be handled by replacing last get method with delete.
4. Convert remaining operators that are not handled by python eg: === and ,
lval format PyJsLvalNR
marker PyJs(TYPE_NAME)(NR)
TODO
1. Number literal replacement
2. Array literal replacement
3. Object literal replacement
5. Function replacement
4. Literal replacement translators
"""
from utils import *
# Maps a JS binary operator to the name of the Python special method
# implementing it.
OP_METHODS = {
'*': '__mul__',
'/': '__div__',
'%': '__mod__',
'+': '__add__',
'-': '__sub__',
'<<': '__lshift__',
'>>': '__rshift__',
'&': '__and__',
'^': '__xor__',
'|': '__or__'
}
def dbg(source, path=r'C:\Users\Piotrek\Desktop\dbg.py'):
    """Best-effort dump of ``source`` to a debug file.

    ``path`` defaults to the original author's location. It is a raw
    string because ``\\U`` in a normal literal is a syntax error on
    Python 3. I/O errors are ignored on purpose: the dump is only a
    debugging aid and must never break translation.
    """
    try:
        with open(path, 'w') as f:
            f.write(source)
    except (IOError, OSError):
        pass
def indent(lines, ind=4):
    """Indent every line of ``lines`` by ``ind`` spaces.

    Trailing spaces of the text are stripped, so text ending with a newline
    does not end with a dangling indentation. The leading indentation is
    always emitted, even for an empty string.
    """
    pad = ind * ' '
    return pad + lines.replace('\n', '\n' + pad).rstrip(' ')
def inject_before_lval(source, lval, code):
    """Insert ``code`` on its own line(s) before the line containing ``lval``.

    ``lval`` must occur exactly once in ``source``; otherwise the source is
    dumped with dbg, the lval is printed and RuntimeError is raised.
    The inserted code gets the indentation of the line holding ``lval``.
    """
    occurrences = source.count(lval)
    if occurrences > 1:
        dbg(source)
        print('')
        print(lval)
        raise RuntimeError('To many lvals (%s)' % lval)
    elif not occurrences:
        dbg(source)
        print('')
        print(lval)
        raise RuntimeError('No lval found "%s"' % lval)
    end = source.index(lval)
    inj = source.rfind('\n', 0, end)  # -1 when lval is on the first line
    # measure the indentation of the line that contains lval
    ind = inj
    while source[ind + 1] == ' ':
        ind += 1
    ind -= inj
    return source[:inj + 1] + indent(code, ind) + source[inj + 1:]
def bracket_split(source, brackets=('()', '{}', '[]'), strip=False):
    """Yield ``source`` split into top-level bracketed and unbracketed pieces.

    ``brackets`` lists two-character opener/closer pairs. Only the bracket
    kind that opened a group is tracked until it closes again. With
    ``strip`` the outer brackets are removed from bracketed pieces.
    DOES NOT YIELD EMPTY STRINGS (an empty bracket content can only be
    yielded if strip=True). An unclosed group is yielded as part of the
    trailing text.
    """
    openers = [pair[0] for pair in brackets]
    depth = 0
    last = 0
    for pos, char in enumerate(source):
        if not depth and char in openers:
            depth = 1
            start = pos
            opener, closer = brackets[openers.index(char)]
        elif depth:
            if char == opener:
                depth += 1
            elif char == closer:
                depth -= 1
                if not depth:
                    if source[last:start]:
                        yield source[last:start]
                    last = pos + 1
                    # strip is a bool: shifts both ends by one when True
                    yield source[start + strip:pos + 1 - strip]
    if source[last:]:
        yield source[last:]
def pass_bracket(source, start, bracket='()'):
    """Return the bracket group at ``start`` and the position after it.

    ``source[start:]`` may begin with optional white space followed by the
    bracket group. Returns ``(text_with_brackets, next_pos)`` or
    ``(None, None)`` when no such group starts there.
    """
    pieces = bracket_split(source[start:], [bracket], False)
    try:
        cand = next(pieces)
    except StopIteration:
        return None, None
    if not cand.strip():  #white space...
        try:
            res = next(pieces)
            return res, start + len(cand) + len(res)
        except StopIteration:
            return None, None
    elif cand[-1] == bracket[1]:
        return cand, start + len(cand)
    else:
        return None, None
def startswith_keyword(start, keyword):
    """Return True if ``start`` (leading white space ignored) begins with
    ``keyword`` as a whole word, i.e. not followed by an identifier char."""
    start = start.lstrip()
    if not start.startswith(keyword):
        return False
    if len(keyword) < len(start) and start[len(keyword)] in IDENTIFIER_PART:
        return False
    return True
def endswith_keyword(ending, keyword):
    """Tell whether ``ending`` (ignoring trailing white space) finishes with ``keyword``
    as a whole word, i.e. not directly preceded by an identifier character."""
    text = ending.rstrip()
    if not text.endswith(keyword):
        return False
    if len(text) > len(keyword):
        preceding = text[-len(keyword) - 1]
        if preceding in IDENTIFIER_PART:
            return False
    return True
def pass_white(source, start):
    """Return the first index >= ``start`` whose character is not in SPACE
    (or the index where scanning ran off the end of ``source``)."""
    pos = start
    limit = len(source)
    while pos < limit and source[pos] in SPACE:
        pos += 1
    return pos
def except_token(source, start, token, throw=True):
    """Expect the single-character ``token`` after optional white space.

    Returns the position just after the token when it is found. Otherwise
    raises SyntaxError if ``throw`` is true, else returns None.
    """
    pos = pass_white(source, start)
    found = pos < len(source) and source[pos] == token
    if found:
        return pos + 1
    if not throw:
        return None
    raise SyntaxError('Missing token. Expected %s' % token)
def except_keyword(source, start, keyword):
    """Expect ``keyword`` as a whole word after optional white space.

    Returns the position just after the keyword, or None when the keyword
    is absent or is immediately followed by an identifier character.
    """
    begin = pass_white(source, start)
    stop = begin + len(keyword)
    if stop > len(source) or source[begin:stop] != keyword:
        return None
    if stop < len(source) and source[stop] in IDENTIFIER_PART:
        return None
    return stop
def parse_identifier(source, start, throw=True):
    """Skip white space from ``start`` and read one identifier.

    Returns ``(identifier, end_position)``. When no valid identifier is
    found, raises SyntaxError if ``throw`` is true, else returns None.
    """
    begin = pass_white(source, start)
    if begin >= len(source):
        if throw:
            raise SyntaxError('Missing identifier!')
        return None
    if source[begin] not in IDENTIFIER_START:
        if throw:
            raise SyntaxError('Invalid identifier start: "%s"' % source[begin])
        return None
    stop = begin + 1
    while stop < len(source) and source[stop] in IDENTIFIER_PART:
        stop += 1
    name = source[begin:stop]
    if is_valid_lval(name):
        return name, stop
    if throw:
        raise SyntaxError('Invalid identifier name: "%s"' % name)
    return None
def argsplit(args, sep=','):
    """Split ``args`` at every ``sep`` that is not inside (), [] or {}.

    Pass the argument text *without* its surrounding brackets. Also used
    for array and object elements. Always returns at least one piece.
    """
    consumed = 0     # total length of the chunks already walked
    piece_start = 0  # index in args where the current piece begins
    pieces = []
    for chunk in bracket_split(args, brackets=['()', '[]', '{}']):
        if chunk[0] not in ('(', '[', '{'):
            for offset, char in enumerate(chunk):
                if char != sep:
                    continue
                cut = consumed + offset
                pieces.append(args[piece_start:cut])
                piece_start = cut + 1
        consumed += len(chunk)
    pieces.append(args[piece_start:])
    return pieces
def split_add_ops(text):
    """Split ``text`` at binary + and - operators, yielding operands and operators.

    Operands are *not* translated. Example result: ['op1', '+', 'op2', '-', 'op3'].
    A + or - only counts as binary when something other than white space,
    + or - was seen since the last split or since the last *, / or %.
    """
    # Hide ++ and -- behind placeholders; text normally contains neither.
    masked = text.replace('++', '##').replace('--', '@@')

    def unmask(piece):
        return piece.replace('##', '++').replace('@@', '--')

    operand_seen = False
    begin = 0
    for pos, char in enumerate(masked):
        if char in ('+', '-'):
            if operand_seen:
                yield unmask(masked[begin:pos])
                yield char
                begin = pos + 1
                operand_seen = False
        elif char in ('/', '*', '%'):
            operand_seen = False
        elif char != ' ':
            operand_seen = True
    yield unmask(masked[begin:])
def split_at_any(text,
                 lis,
                 translate=False,
                 not_before=(),
                 not_after=(),
                 validitate=None):
    """Split ``text`` at any of the separators in ``lis``, yielding fragments and separators.

    text       -- string to split.
    lis        -- iterable of separator strings; longer ones are tried first.
    translate  -- optional callable applied to every fragment (separators
                  are yielded untouched).
    not_before -- no split is attempted at a position where the text so far
                  ends with one of these strings.
    not_after  -- if the text after a candidate separator starts with one
                  of these strings, the position is skipped.
    validitate -- optional callable ``(sep, before, after)``; a candidate is
                  ignored when it returns a false value.

    The last fragment is always yielded, even when it is empty.
    """
    # sorted() works on a copy, so the caller's list is left in its original
    # order, and any iterable (e.g. a dict key view) is accepted.
    candidates = sorted(lis, key=len, reverse=True)
    last = 0
    n = 0
    text_len = len(text)
    while n < text_len:
        if any(text[:n].endswith(blocker) for blocker in not_before):
            n += 1
            continue
        for e in candidates:
            s = len(e)
            if s + n > text_len:
                continue
            if validitate and not validitate(e, text[:n], text[n + s:]):
                continue
            # NOTE: checked before the separator itself is compared, exactly
            # as the original implementation did.
            if any(text[n + s:].startswith(blocker) for blocker in not_after):
                n += 1
                break
            if e == text[n:n + s]:
                yield text[last:n] if not translate else translate(
                    text[last:n])
                yield e
                n += s
                last = n
                break
        else:
            # no candidate matched at this position
            n += 1
    yield text[last:n] if not translate else translate(text[last:n])
def split_at_single(text, sep, not_before=[], not_after=[]):
    """Like ``text.split(sep)``, except that a separator is ignored when the
    fragment before it ends with an entry of ``not_before`` or the text
    after it starts with an entry of ``not_after``."""
    total, width = len(text), len(sep)
    begin = 0  # start of the fragment currently being collected
    pos = 0
    while pos < total:
        if text[pos:pos + width] == sep:
            blocked = (any(text[begin:pos].endswith(p) for p in not_before)
                       or any(text[pos + width:].startswith(q)
                              for q in not_after))
            if not blocked:
                yield text[begin:pos]
                begin = pos + width
                pos = begin - 1
        pos += 1
    yield text[begin:]
+562
View File
@@ -0,0 +1,562 @@
from jsparser import *
from utils import *
import re
from utils import *
# NOTE: all white space sent to this module must be ' ', so no '\n'.
# Maps the placeholder names created by bracket_replace to the bracketed
# source text they stand for; exp_translator resets it on every call.
REPL = {}
# KNOWN PROBLEMS
#  <<=, >>=, >>>= are not handled.
#  They are unusual so they are not fixed for now. a++ +b works fine and a+++++b (a++ + ++b) does not work even in V8
# Regex for a plain '=': not preceded by '=', '!', '<' or '>' and not followed by '='.
ASSIGNMENT_MATCH = '(?<!=|!|<|>)=(?!=)'
def unary_validitator(keyword, before, after):
    """Validator for split_at_any: accept ``keyword`` as a unary operator here?

    Symbolic operators are always accepted. An operator ending in an
    identifier character is rejected when nothing follows it or when it
    touches an identifier character on either side.
    """
    if keyword[-1] not in IDENTIFIER_PART:
        return True
    if not after or after[0] in IDENTIFIER_PART:
        return False
    # The original author was unsure whether this left-hand check is right.
    if before and before[-1] in IDENTIFIER_PART:
        return False
    return True
def comb_validitator(keyword, before, after):
    """Validator for split_at_any: 'in' and 'instanceof' only count as
    operators when they do not touch an identifier character on either
    side. Every other keyword is accepted."""
    if keyword not in ('instanceof', 'in'):
        return True
    if before and before[-1] in IDENTIFIER_PART:
        return False
    if after and after[0] in IDENTIFIER_PART:
        return False
    return True
def bracket_replace(code):
    """Replace every top-level (...) and [...] group in ``code`` with a placeholder.

    A [...] group becomes '#PYJSREPL<n>{' and a (...) group (which may be a
    function call) becomes '@PYJSREPL<n>}', where <n> is the current size of
    REPL. The original text is stored in REPL under the placeholder name.
    """
    pieces = []
    for chunk in bracket_split(code, ['()', '[]'], False):
        opener = chunk[0]
        if opener == '[':
            placeholder = '#PYJSREPL' + str(len(REPL)) + '{'
        elif opener == '(':
            placeholder = '@PYJSREPL' + str(len(REPL)) + '}'
        else:
            pieces.append(chunk)
            continue
        REPL[placeholder] = chunk
        pieces.append(placeholder)
    return ''.join(pieces)
class NodeVisitor:
    """Translates one JavaScript expression (held in ``self.code``) to Python source text.

    ``translate`` peels off the outermost operator and recurses through the
    module-level ``trans`` helper for the operands.
    """

    def __init__(self, code):
        """Store the expression text to translate."""
        self.code = code

    def rl(self, lis, op):
        """performs this operation on a list from *right to left*
        op must take 2 args
        a,b,c => op(a, op(b, c))"""
        it = reversed(lis)
        res = trans(it.next())
        for e in it:
            e = trans(e)
            res = op(e, res)
        return res

    def lr(self, lis, op):
        """performs this operation on a list from *left to right*
        op must take 2 args
        a,b,c => op(op(a, b), c)"""
        it = iter(lis)
        res = trans(it.next())
        for e in it:
            e = trans(e)
            res = op(res, e)
        return res

    def translate(self):
        """Translates outer operation and calls translate on inner operation.
        Returns fully translated code.

        Stages, in order: comma operator, ?: conditional, assignment,
        binary operators (one precedence group at a time), unary operators,
        a single bracketed expression, and finally property/call chains.
        Raises SyntaxError for input it cannot translate."""
        if not self.code:
            return ''
        # Hide top-level () and [] groups behind placeholders stored in REPL.
        new = bracket_replace(self.code)
        #Check comma operator:
        cand = new.split(',')  #every comma in new must be an operator
        if len(cand) > 1:  #LR
            return self.lr(cand, js_comma)
        #Check = operator:
        # dont split at != or !== or == or === or <= or >=
        #note <<=, >>= or this >>> will NOT be supported
        # maybe I will change my mind later
        # Handle the ?: conditional operator
        if '?' in new:
            cond_ind = new.find('?')
            tenary_start = 0
            # Find the end of the last plain '=' located before the first '?'.
            for ass in re.finditer(ASSIGNMENT_MATCH, new):
                cand = ass.span()[1]
                if cand < cond_ind:
                    tenary_start = cand
                else:
                    break
            # NOTE(review): actual_tenary is never used below; the split on
            # the next line runs over the whole of ``new``, including any
            # assignment prefix. Possibly actual_tenary was meant -- confirm.
            actual_tenary = new[tenary_start:]
            spl = ''.join(split_at_any(new, [':', '?'], translate=trans))
            tenary_translation = transform_crap(spl)
            # Translate the assignment part with a marker in place of the
            # conditional, then substitute the translated conditional.
            assignment = new[:tenary_start] + ' PyJsConstantTENARY'
            return trans(assignment).replace('PyJsConstantTENARY',
                                             tenary_translation)
        cand = list(split_at_single(new, '=', ['!', '=', '<', '>'], ['=']))
        if len(cand) > 1:  # RL
            it = reversed(cand)
            res = trans(it.next())
            for e in it:
                e = e.strip()
                if not e:
                    raise SyntaxError('Missing left-hand in assignment!')
                op = ''
                # A trailing operator turns '=' into a compound assignment.
                # OP_METHODS is not defined in the visible part of this
                # file -- presumably it lists those operators; verify.
                if e[-2:] in OP_METHODS:
                    op = ',' + e[-2:].__repr__()
                    e = e[:-2]
                elif e[-1:] in OP_METHODS:
                    op = ',' + e[-1].__repr__()
                    e = e[:-1]
                e = trans(e)
                #Now replace last get method with put and change args
                c = list(bracket_split(e, ['()']))
                beg, arglist = ''.join(c[:-1]).strip(), c[-1].strip(
                )  #strips just to make sure... I will remove it later
                if beg[-4:] != '.get':
                    raise SyntaxError('Invalid left-hand side in assignment')
                beg = beg[0:-3] + 'put'
                arglist = arglist[0:-1] + ', ' + res + op + ')'
                res = beg + arglist
            return res
        #Now check remaining 2 arg operators that are not handled by python
        #They all have Left to Right (LR) associativity
        order = [OR, AND, BOR, BXOR, BAND, EQS, COMPS, BSHIFTS, ADDS, MULTS]
        # actually we dont need OR and AND because they can be handled easier. But just for fun
        # '<' and '>' are passed as both not_before and not_after below.
        dangerous = ['<', '>']
        for typ in order:
            # ADDS needs a special splitter since + and - can also be unary operators (+/++ or -/--)
            if '+' in typ:
                cand = list(split_add_ops(new))
            else:
                #dont translate. cant start or end on dangerous op.
                cand = list(
                    split_at_any(
                        new,
                        typ.keys(),
                        False,
                        dangerous,
                        dangerous,
                        validitate=comb_validitator))
            if not len(cand) > 1:
                continue
            # cand alternates operand, operator, operand, ...; fold it left to right.
            n = 1
            res = trans(cand[0])
            if not res:
                raise SyntaxError("Missing operand!")
            while n < len(cand):
                e = cand[n]
                if not e:
                    raise SyntaxError("Missing operand!")
                if n % 2:
                    op = typ[e]
                else:
                    res = op(res, trans(e))
                n += 1
            return res
        #Now replace unary operators - only they are left
        cand = list(
            split_at_any(
                new, UNARY.keys(), False, validitate=unary_validitator))
        if len(cand) > 1:  #contains unary operators
            if '++' in cand or '--' in cand:  #it cant contain both ++ and --
                if '--' in cand:
                    op = '--'
                    meths = js_post_dec, js_pre_dec
                else:
                    op = '++'
                    meths = js_post_inc, js_pre_inc
                pos = cand.index(op)
                if cand[pos - 1].strip():  # post increment
                    a = cand[pos - 1]
                    meth = meths[0]
                elif cand[pos + 1].strip():  #pre increment
                    a = cand[pos + 1]
                    meth = meths[1]
                else:
                    raise SyntaxError('Invalid use of ++ operator')
                if cand[pos + 2:]:
                    raise SyntaxError('Too many operands')
                operand = meth(trans(a))
                cand = cand[:pos - 1]
            # now last cand should be operand and every other odd element should be empty
            else:
                operand = trans(cand[-1])
                del cand[-1]
            # Apply the remaining operators from the innermost (right-most) outwards.
            for i, e in enumerate(reversed(cand)):
                if i % 2:
                    if e.strip():
                        raise SyntaxError('Too many operands')
                else:
                    operand = UNARY[e](operand)
            return operand
        #Replace brackets
        if new[0] == '@' or new[0] == '#':
            if len(
                    list(bracket_split(new, ('#{', '@}')))
            ) == 1:  # we have only one bracket, otherwise pseudobracket like @@....
                assert new in REPL
                if new[0] == '#':
                    raise SyntaxError(
                        '[] cant be used as brackets! Use () instead.')
                return '(' + trans(REPL[new][1:-1]) + ')'
        #Replace function calls and prop getters
        # 'now' must be a reference like: a or b.c.d but it can have also calls or getters ( for example a["b"](3))
        #From here @@ means a function call and ## means get operation (note they dont have to present)
        it = bracket_split(new, ('#{', '@}'))
        res = []
        for e in it:
            if e[0] != '#' and e[0] != '@':
                res += [x.strip() for x in e.split('.')]
            else:
                res += [e.strip()]
        # res[0] can be inside @@ (name)...
        # Drop empty fragments; res is indexed below, so this relies on
        # filter() returning a list as it does on Python 2.
        res = filter(lambda x: x, res)
        if is_internal(res[0]):
            out = res[0]
        elif res[0][0] in {'#', '@'}:
            out = '(' + trans(REPL[res[0]][1:-1]) + ')'
        elif is_valid_lval(
                res[0]) or res[0] in {'this', 'false', 'true', 'null'}:
            out = 'var.get(' + res[0].__repr__() + ')'
        else:
            if is_reserved(res[0]):
                raise SyntaxError('Unexpected reserved word: "%s"' % res[0])
            raise SyntaxError('Invalid identifier: "%s"' % res[0])
        if len(res) == 1:
            return out
        n = 1
        while n < len(res):  #now every func call is a prop call
            e = res[n]
            if e[0] == '@':  # direct call
                out += trans_args(REPL[e])
                n += 1
                continue
            args = False  #assume not prop call
            if n + 1 < len(res) and res[n + 1][0] == '@':  #prop call
                # Drop the opening '(' so the property name can be prepended.
                args = trans_args(REPL[res[n + 1]])[1:]
                if args != ')':
                    args = ',' + args
            if e[0] == '#':
                prop = trans(REPL[e][1:-1])
            else:
                if not is_lval(e):
                    raise SyntaxError('Invalid identifier: "%s"' % e)
                prop = e.__repr__()
            if args:  # prop call
                n += 1
                out += '.callprop(' + prop + args
            else:  #prop get
                out += '.get(' + prop + ')'
            n += 1
        return out
def js_comma(a, b):
    """Emit Python source for the JS comma operator ``a, b``."""
    return 'PyJsComma(%s,%s)' % (a, b)
def js_or(a, b):
    """Emit Python source for JS ``a || b``."""
    return '(%s or %s)' % (a, b)
def js_bor(a, b):
    """Emit Python source for JS ``a | b``."""
    return '(%s|%s)' % (a, b)
def js_bxor(a, b):
    """Emit Python source for JS ``a ^ b``."""
    return '(%s^%s)' % (a, b)
def js_band(a, b):
    """Emit Python source for JS ``a & b``."""
    return '(%s&%s)' % (a, b)
def js_and(a, b):
    """Emit Python source for JS ``a && b``."""
    return '(%s and %s)' % (a, b)
def js_strict_eq(a, b):
    """Emit Python source for JS ``a === b``."""
    return 'PyJsStrictEq(%s,%s)' % (a, b)
def js_strict_neq(a, b):
    """Emit Python source for JS ``a !== b``."""
    return 'PyJsStrictNeq(%s,%s)' % (a, b)
# Python chains comparisons while JS does not: 2==2==True is false in
# Python but true in JS, so every comparison is wrapped in brackets.
def js_abstract_eq(a, b):
    """Emit Python source for JS ``a == b``."""
    return '(%s==%s)' % (a, b)
# Bracketed for the same reason as ==.
def js_abstract_neq(a, b):
    """Emit Python source for JS ``a != b``."""
    return '(%s!=%s)' % (a, b)
def js_lt(a, b):
    """Emit Python source for JS ``a < b``."""
    return '(%s<%s)' % (a, b)
def js_le(a, b):
    """Emit Python source for JS ``a <= b``."""
    return '(%s<=%s)' % (a, b)
def js_ge(a, b):
    """Emit Python source for JS ``a >= b``."""
    return '(%s>=%s)' % (a, b)
def js_gt(a, b):
    """Emit Python source for JS ``a > b``."""
    return '(%s>%s)' % (a, b)
def js_in(a, b):
    """Emit Python source for JS ``a in b`` (a ``contains`` call on ``b``)."""
    return '%s.contains(%s)' % (b, a)
def js_instanceof(a, b):
    """Emit Python source for JS ``a instanceof b``."""
    return '%s.instanceof(%s)' % (a, b)
def js_lshift(a, b):
    """Emit Python source for JS ``a << b``."""
    return '(%s<<%s)' % (a, b)
def js_rshift(a, b):
    """Emit Python source for JS ``a >> b``."""
    return '(%s>>%s)' % (a, b)
def js_shit(a, b):
    """Emit Python source for JS ``a >>> b`` (a ``PyJsBshift`` call)."""
    return 'PyJsBshift(%s,%s)' % (a, b)
def js_add(a, b):
    """Emit Python source for JS ``a + b``.

    Bracketed to simplify the later handling of unary + and ++.
    """
    return '({0}+{1})'.format(a, b)
def js_sub(a, b):
    """Emit Python source for JS ``a - b`` (bracketed, like js_add)."""
    return '({0}-{1})'.format(a, b)
def js_mul(a, b):
    """Emit Python source for JS ``a * b``."""
    return '(%s*%s)' % (a, b)
def js_div(a, b):
    """Emit Python source for JS ``a / b``."""
    return '(%s/%s)' % (a, b)
def js_mod(a, b):
    """Emit Python source for JS ``a % b``."""
    return ''.join(('(', a, '%', b, ')'))
def js_typeof(a):
    """Emit Python source for JS ``typeof a``.

    A bare ``var.get(...)`` lookup gets ``throw=False`` added to its
    arguments before ``.typeof()`` is appended.
    """
    parts = list(bracket_split(a, ('()', )))
    if len(parts) != 2 or parts[0] != 'var.get':
        return a + '.typeof()'
    getter, call_args = parts
    return getter + call_args[:-1] + ',throw=False).typeof()'
def js_void(a):
    """Emit Python source for JS ``void a`` (the operand in brackets)."""
    return '(%s)' % (a, )
def js_new(a):
    """Emit Python source for JS ``new a`` from the already translated operand.

    The first bracket group decides the shape of the result:
    * the final argument list of a trailing ``.get`` -> ``a.create()``;
    * a group that follows another bracket group -> ``.create`` is inserted
      in front of it, turning that call into the constructor call;
    * the arguments of ``.callprop`` -> rewritten to
      ``.get(prop).create(remaining args)``.
    Anything else falls back to ``a.create()``.
    """
    cands = list(bracket_split(a, ('()', )))
    lim = len(cands)
    if lim >= 2:
        for n, c in enumerate(cands):
            if c[0] != '(':
                continue
            prev = cands[n - 1]
            if prev.endswith('.get') and n + 1 >= lim:  # last get operation.
                break
            if prev[0] == '(':
                head = ''.join(cands[:n])
                rest = ''.join(cands[n + 1:])
                return head + '.create' + c + rest
            if prev == '.callprop':
                beg = ''.join(cands[:n - 1])
                args = argsplit(c[1:-1], ',')
                create = '.get(%s).create(%s)' % (args[0], ','.join(args[1:]))
                return beg + create + ''.join(cands[n + 1:])
    return a + '.create()'
def js_delete(a):
    """Emit Python source for JS ``delete a`` by turning the last ``.get(...)``
    of the translated operand into ``.delete(...)``.

    Raises SyntaxError when the operand does not end with a ``.get`` call.
    """
    parts = list(bracket_split(a, ['()']))
    target = ''.join(parts[:-1]).strip()
    arglist = parts[-1].strip()
    if not target.endswith('.get'):
        raise SyntaxError('Invalid delete operation')
    return target[:-3] + 'delete' + arglist
def js_neg(a):
    """Emit Python source for JS unary ``-a``."""
    return '(-%s)' % (a, )
def js_pos(a):
    """Emit Python source for JS unary ``+a``."""
    return '(+%s)' % (a, )
def js_inv(a):
    """Emit Python source for JS ``~a``."""
    return '(~%s)' % (a, )
def js_not(a):
    """Emit Python source for JS ``!a`` (a ``.neg()`` call on the operand)."""
    return '%s.neg()' % (a, )
def postfix(a, inc, post):
    """Emit Python source for ++/-- applied to the translated reference ``a``.

    a    -- translated operand; its last call must be a ``get``.
    inc  -- true for ++, false for --.
    post -- true for the postfix form; the emitted expression then undoes
            the step so it evaluates to the old value.

    Raises SyntaxError when ``a`` does not end with a ``get`` call.
    """
    pieces = list(bracket_split(a, ('()', )))
    getter, call_args = pieces[-2], pieces[-1]
    if not getter.endswith('get'):
        raise SyntaxError('Invalid ++ or -- operation.')
    if inc:
        step, undo = '+', '-'
    else:
        step, undo = '-', '+'
    pieces[-2] = getter[:-3] + 'put'
    pieces[-1] = '(%s,%s%sJs(1))' % (call_args[1:-1], a, step)
    updated = ''.join(pieces)
    if post:
        return '(%s%sJs(1))' % (updated, undo)
    return updated
def js_pre_inc(a):
    """Emit Python source for JS ``++a``."""
    return postfix(a, inc=True, post=False)
def js_post_inc(a):
    """Emit Python source for JS ``a++``."""
    return postfix(a, inc=True, post=True)
def js_pre_dec(a):
    """Emit Python source for JS ``--a``."""
    return postfix(a, inc=False, post=False)
def js_post_dec(a):
    """Emit Python source for JS ``a--``."""
    return postfix(a, inc=False, post=True)
# Operator tables: each maps a JS operator token to the function that emits
# the Python source for it. NodeVisitor.translate tries the binary groups in
# the order OR, AND, BOR, BXOR, BAND, EQS, COMPS, BSHIFTS, ADDS, MULTS.
OR = {'||': js_or}
AND = {'&&': js_and}
BOR = {'|': js_bor}
BXOR = {'^': js_bxor}
BAND = {'&': js_band}
EQS = {
    '===': js_strict_eq,
    '!==': js_strict_neq,
    '==': js_abstract_eq,  # we need == and != too. Read a note above method
    '!=': js_abstract_neq
}
#Since JS does not have chained comparisons we need to implement all cmp methods.
COMPS = {
    '<': js_lt,
    '<=': js_le,
    '>=': js_ge,
    '>': js_gt,
    'instanceof': js_instanceof,  #todo change to validitate
    'in': js_in
}
BSHIFTS = {'<<': js_lshift, '>>': js_rshift, '>>>': js_shit}
ADDS = {'+': js_add, '-': js_sub}
MULTS = {'*': js_mul, '/': js_div, '%': js_mod}
#Note: ++ and -- map to None because each of them has 2 different methods
# (prefix and postfix); the correct one is chosen in the translate function
UNARY = {
    'typeof': js_typeof,
    'void': js_void,
    'new': js_new,
    'delete': js_delete,
    '!': js_not,
    '-': js_neg,
    '+': js_pos,
    '~': js_inv,
    '++': None,
    '--': None
}
def transform_crap(code):  #needs some more tests
    """Rewrite JS ``cond ? a : b`` text into Python ``(a if cond else b)``.

    The right-most '?' is rewritten first and the process repeats on the
    result until no '?' is left. Raises SyntaxError when a '?' has no ':'
    after it.
    """
    while True:
        question = code.rfind('?')
        if question == -1:
            return code
        colon = code.find(':', question)
        if colon == -1:
            raise SyntaxError('Invalid ?: syntax (probably missing ":" )')
        # The condition starts after the last ':' before the '?', or after
        # the first earlier '?', whichever lies further right.
        cond_start = max(
            code.rfind(':', 0, question), code.find('?', 0, question)) + 1
        else_end = code.find(':', colon + 1)
        if else_end == -1:
            else_end = len(code)
        formula = ''.join(('(', code[question + 1:colon], ' if ',
                           code[cond_start:question], ' else ',
                           code[colon + 1:else_end], ')'))
        code = code[:cond_start] + formula + code[else_end:]
from code import InteractiveConsole
#e = InteractiveConsole(globals()).interact()
import traceback
def trans(code):
    """Translate one JS expression (white space stripped) to Python source."""
    visitor = NodeVisitor(code.strip())
    return visitor.translate().strip()
#todo finish this trans args
def trans_args(code):
    """Translate a bracketed JS argument list to a bracketed Python one.

    The outer brackets are dropped, nested brackets are hidden behind
    placeholders, and each comma-separated argument is translated.
    """
    inner = bracket_replace(code.strip()[1:-1])
    translated = [trans(arg) for arg in inner.split(',')]
    return '(' + ','.join(translated) + ')'
# Number of expressions handed to exp_translator so far.
EXP = 0


def exp_translator(code):
    """Translate a single JS expression to Python source.

    Resets the REPL placeholder table, bumps the EXP counter and replaces
    newlines with spaces before translating. ``code`` must not contain
    '@', ';' or '#' (checked with assert). Errors raised by the translation
    propagate unchanged.
    """
    global REPL, EXP
    EXP += 1
    REPL = {}
    code = code.replace('\n', ' ')
    for forbidden in ('@', ';', '#'):
        assert forbidden not in code
    return trans(code)
if __name__ == '__main__':
    # Small smoke test: the same expression three times, then a delete.
    for _ in xrange(3):
        print(exp_translator('jk = kk.ik++'))
    #First line translated with PyJs: PyJsStrictEq(PyJsAdd((Js(100)*Js(50)),Js(30)), Js("5030")), yay!
    print(exp_translator('delete a.f'))
File diff suppressed because one or more lines are too long
+300
View File
@@ -0,0 +1,300 @@
""" This module removes all objects/arrays from JS source code and replace them with LVALS.
It also has a function translating a removed object/array to Python code.
Use this module just after removing constants. Later move on to removing functions"""
# Name templates for the placeholders that stand in for removed object and
# array literals; %d is filled with a running count.
OBJECT_LVAL = 'PyJsLvalObject%d_'
ARRAY_LVAL = 'PyJsLvalArray%d_'
from utils import *
from jsparser import *
from nodevisitor import exp_translator
import functions
from flow import KEYWORD_METHODS
def FUNC_TRANSLATOR(*_unused):
    """Placeholder that always raises RuntimeError.

    set_func_translator replaces it with the real function translator.
    """
    message = 'Remember to set func translator. Thank you.'
    raise RuntimeError(message)
def set_func_translator(ftrans):
    """Install ``ftrans`` as this module's FUNC_TRANSLATOR."""
    globals()['FUNC_TRANSLATOR'] = ftrans
def is_empty_object(n, last):
    """Tell whether an empty ``{}`` is an object literal rather than an empty block.

    n    -- the text inside the braces (block or object).
    last -- the code in front of the braces.

    True only when ``n`` is blank and ``last`` is non-blank and does not end
    with ')' or ';'.
    """
    if n.strip():
        return False
    preceding = last.strip()
    return bool(preceding) and preceding[-1] not in (')', ';')
# todo refine this function
def is_object(n, last):
    """n may be the inside of block or object.
    last is the code before object

    Heuristically decides whether ``{n}`` is an object literal (True) or a
    code block (False)."""
    if is_empty_object(n, last):
        return True
    if not n.strip():
        return False
    #Object contains lines of code so it cant be an object
    if len(argsplit(n, ';')) > 1:
        return False
    cands = argsplit(n, ',')
    if not cands[-1].strip():
        return True  # {xxxx,} empty after last , it must be an object
    for cand in cands:
        cand = cand.strip()
        # separate each candidate element at : in dict and check whether they are correct...
        kv = argsplit(cand, ':')
        if len(
                kv
        ) > 2:  # force kv to length 2: the value itself may contain ':' (e.g. a ?: expression)
            kv = kv[0], ':'.join(kv[1:])
        if len(kv) == 2:
            # key value pair, check whether not label or ?:
            k, v = kv
            if not is_lval(k.strip()):
                return False
            v = v.strip()
            if v.startswith('function'):
                continue
            #will fail on label... {xxx: while {}}
            # NOTE(review): v[0] raises IndexError when the value is empty
            # (e.g. "a:") -- confirm whether that input can reach here.
            if v[0] == '{':  # value cant be a code block
                return False
            for e in KEYWORD_METHODS:
                # if v starts with any statement then return false
                if v.startswith(e) and len(e) < len(v) and v[len(
                        e)] not in IDENTIFIER_PART:
                    return False
        # Anything without ':' must be a getter or setter definition.
        elif not (cand.startswith('set ') or cand.startswith('get ')):
            return False
    return True
def is_array(last):
    """Tell whether a ``[...]`` preceded by ``last`` is an array literal
    rather than a property getter.

    It is an array after one of the listed keywords, at the start of the
    code, or after anything that does not end with ')', ']' or an
    identifier character.
    """
    preceding = last.strip()
    keywords = ('return', 'new', 'void', 'throw', 'typeof', 'in',
                'instanceof')
    for keyword in keywords:
        if endswith_keyword(preceding, keyword):
            return True
    if not preceding:
        return True
    final = preceding[-1]
    return final not in (')', ']') and final not in IDENTIFIER_PART
def remove_objects(code, count=1):
    """ This function replaces objects with OBJECTS_LVALS, returns new code, replacement dict and count.
        count arg is the number that should be added to the LVAL of the first replaced object
    """
    replacements = {}  #replacement dict
    br = bracket_split(code, ['{}', '[]'])
    res = ''
    last = ''
    for e in br:
        #test whether e is an object
        if e[0] == '{':
            # Process the inside first; its results are only kept when e
            # turns out to be a code block.
            n, temp_rep, cand_count = remove_objects(e[1:-1], count)
            # if e was not an object then n should not contain any :
            if is_object(n, last):
                #e was an object: store it whole; nested objects are handled
                # later, when this object is translated
                res += ' ' + OBJECT_LVAL % count
                replacements[OBJECT_LVAL % count] = e
                count += 1
            else:
                # e was just a code block but could contain objects inside
                res += '{%s}' % n
                count = cand_count
                replacements.update(temp_rep)
        elif e[0] == '[':
            if is_array(last):
                res += e  # will be translated later
            else:  # prop get
                n, rep, count = remove_objects(e[1:-1], count)
                res += '[%s]' % n
                replacements.update(rep)
        else:  # e does not contain any objects
            res += e
        last = e  #needed to recognise an empty object {}
    return res, replacements, count
def remove_arrays(code, count=1):
    """Replace array literals in ``code`` with ARRAY_LVAL placeholders.

    Returns ``(new_code, replacements, count)``: ``replacements`` maps each
    placeholder to the array text it replaced and ``count`` is the number
    for the next placeholder.
    *NOTE* has to be called AFTER remove_objects.
    """
    pieces = []
    previous = ''
    replacements = {}
    for chunk in bracket_split(code, ['[]']):
        if chunk[0] != '[':
            pieces.append(chunk)
        elif is_array(previous):
            name = ARRAY_LVAL % count
            pieces.append(' ' + name)
            replacements[name] = chunk
            count += 1
        else:
            # Pseudo array (property getter); it can still contain true
            # arrays, e.g. a[['d'][3]] has 2 pseudo and 1 true array.
            inner, nested, count = remove_arrays(chunk[1:-1], count)
            pieces.append('[%s]' % inner)
            replacements.update(nested)
        previous = chunk
    return ''.join(pieces), replacements, count
def translate_object(obj, lval, obj_count=1, arr_count=1):
    """Translate the JS object literal ``obj`` (with braces) to Python code.

    The code assigns ``Js({...})`` to ``lval``; definitions of nested
    functions, objects and arrays are placed in front of it and
    getter/setter definitions after it.
    Returns ``(code, obj_count, arr_count)`` with the updated counters.
    Raises SyntaxError on a malformed literal."""
    obj = obj[1:-1]  # remove {} from both ends
    obj, obj_rep, obj_count = remove_objects(obj, obj_count)
    obj, arr_rep, arr_count = remove_arrays(obj, arr_count)
    # functions can be defined inside objects. exp translator cant translate them.
    # we have to remove them and translate with func translator
    # its better explained in translate_array function
    obj, hoisted, inline = functions.remove_functions(obj, all_inline=True)
    assert not hoisted
    gsetters_after = ''
    keys = argsplit(obj)
    res = []
    for i, e in enumerate(keys, 1):
        e = e.strip()
        if e.startswith('set '):
            gsetters_after += translate_setter(lval, e)
        elif e.startswith('get '):
            gsetters_after += translate_getter(lval, e)
        elif ':' not in e:
            if i < len(keys
                       ):  # can happen legally only in the last element {3:2,}
                raise SyntaxError('Unexpected "," in Object literal')
            break
        else:  #Not getter, setter or elision
            spl = argsplit(e, ':')
            if len(spl) < 2:
                raise SyntaxError('Invalid Object literal: ' + e)
            try:
                key, value = spl
            except:  #len(spl)> 2
                # The value itself contained ':'; glue it back together.
                print 'Unusual case ' + repr(e)
                key = spl[0]
                value = ':'.join(spl[1:])
            key = key.strip()
            if is_internal(key):
                key = '%s.to_string().value' % key
            else:
                key = repr(key)
            value = exp_translator(value)
            if not value:
                raise SyntaxError('Missing value in Object literal')
            res.append('%s:%s' % (key, value))
    res = '%s = Js({%s})\n' % (lval, ','.join(res)) + gsetters_after
    # translate all the nested objects (including removed earlier functions)
    for nested_name, nested_info in inline.iteritems():  # functions
        nested_block, nested_args = nested_info
        new_def = FUNC_TRANSLATOR(nested_name, nested_block, nested_args)
        res = new_def + res
    # NOTE: the loops below rebind the lval and obj parameters.
    for lval, obj in obj_rep.iteritems():  #objects
        new_def, obj_count, arr_count = translate_object(
            obj, lval, obj_count, arr_count)
        # add nested object definition BEFORE the code built so far
        res = new_def + res
    for lval, obj in arr_rep.iteritems():  # arrays
        new_def, obj_count, arr_count = translate_array(
            obj, lval, obj_count, arr_count)
        # add nested array definition BEFORE the code built so far
        res = new_def + res
    return res, obj_count, arr_count
def translate_setter(lval, setter):
    """Translate a JS object-literal setter (``set name(arg) {...}``) to Python code.

    lval   -- name of the object the setter belongs to.
    setter -- source text of the setter, starting with 'set '.

    Returns code that defines a function called ``setter`` and registers it
    on ``lval`` with ``define_own_property``. Raises SyntaxError when the
    text cannot be parsed as exactly one function or the function does not
    take exactly one argument.
    """
    func = 'function' + setter[3:]
    try:
        _, data, _ = functions.remove_functions(func)
        if not data or len(data) > 1:
            raise Exception()
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not turned into a SyntaxError.
        raise SyntaxError('Could not parse setter: ' + setter)
    # data holds exactly one entry; next(iter(...)) also works where
    # dict.keys() is not indexable.
    prop = next(iter(data))
    body, args = data[prop]
    if len(args) != 1:  #setter must have exactly 1 argument
        raise SyntaxError('Invalid setter. It must take exactly 1 argument.')
    # The generated function is named 'setter', which the descriptor
    # emitted below refers to.
    res = FUNC_TRANSLATOR('setter', body, args)
    res += "%s.define_own_property(%s, {'set': setter})\n" % (lval, repr(prop))
    return res
def translate_getter(lval, getter):
    """Translate a JS object-literal getter (``get name() {...}``) to Python code.

    lval   -- name of the object the getter belongs to.
    getter -- source text of the getter, starting with 'get '.

    Returns code that defines a function called ``getter`` and registers it
    on ``lval`` with ``define_own_property``. Raises SyntaxError when the
    text cannot be parsed as exactly one function or the function takes any
    argument.
    """
    func = 'function' + getter[3:]
    try:
        _, data, _ = functions.remove_functions(func)
        if not data or len(data) > 1:
            raise Exception()
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not turned into a SyntaxError.
        raise SyntaxError('Could not parse getter: ' + getter)
    # data holds exactly one entry; next(iter(...)) also works where
    # dict.keys() is not indexable.
    prop = next(iter(data))
    body, args = data[prop]
    if len(args) != 0:  #getter must have exactly 0 arguments
        raise SyntaxError('Invalid getter. It must take exactly 0 argument.')
    res = FUNC_TRANSLATOR('getter', body, args)
    # The function generated above is named 'getter'; the descriptor used to
    # reference 'setter', which is either undefined at that point or a
    # setter left over from an earlier definition.
    res += "%s.define_own_property(%s, {'get': getter})\n" % (lval, repr(prop))
    return res
def translate_array(array, lval, obj_count=1, arr_count=1):
    """Translate the JS array literal ``array`` (e.g. [1,2,3]) to Python code.

    The code assigns ``Js([...])`` to ``lval`` and should be put before the
    place where ``lval`` is used. Definitions of functions, objects and
    arrays nested in the literal are placed in front of the assignment.
    Returns ``(code, obj_count, arr_count)`` with the updated counters.
    """
    inner = array[1:-1]
    inner, obj_rep, obj_count = remove_objects(inner, obj_count)
    inner, arr_rep, arr_count = remove_arrays(inner, arr_count)
    # Functions cannot be defined inside a Python literal, so they are
    # removed here (all treated as inline) and defined separately below.
    inner, hoisted, inline = functions.remove_functions(inner, all_inline=True)
    assert not hoisted
    elements = []
    for element in argsplit(inner, ','):
        # placeholders such as PyJsLvalInline are left as they are
        translated = exp_translator(element.replace('\n', ''))
        elements.append(translated or 'None')
    definition = '%s = Js([%s])\n' % (lval, ','.join(elements))
    # Each nested definition is prepended, so it precedes the array itself.
    for nested_name, (nested_block, nested_args) in inline.items():
        definition = FUNC_TRANSLATOR(nested_name, nested_block,
                                     nested_args) + definition
    for nested_lval, nested_obj in obj_rep.items():
        new_def, obj_count, arr_count = translate_object(
            nested_obj, nested_lval, obj_count, arr_count)
        definition = new_def + definition
    for nested_lval, nested_arr in arr_rep.items():
        new_def, obj_count, arr_count = translate_array(
            nested_arr, nested_lval, obj_count, arr_count)
        definition = new_def + definition
    return definition, obj_count, arr_count
if __name__ == '__main__':
    # Manual smoke test of the array and object detection.
    test = 'a = {404:{494:19}}; b = 303; if () {f={:}; { }}'
    #print remove_objects(test)
    #print list(bracket_split(' {}'))
    sample = 'typeof a&&!db.test(a)&&!ib[(bb.exec(a)||["",""], [][[5][5]])[1].toLowerCase()])'
    print('')
    print(remove_arrays(sample))
    print(is_object('', ')'))
+4
View File
@@ -0,0 +1,4 @@
from jsparser import *
from utils import *
# maybe I will try rewriting my parser in the future... Tokenizer makes things much easier and faster, unfortunately I
# did not know anything about parsers when I was starting this project so I invented my own.
+151
View File
@@ -0,0 +1,151 @@
from flow import translate_flow
from constants import remove_constants, recover_constants
from objects import remove_objects, remove_arrays, translate_object, translate_array, set_func_translator
from functions import remove_functions, reset_inline_count
from jsparser import inject_before_lval, indent, dbg
# Default prologue that translate_js puts in front of the translated program.
TOP_GLOBAL = '''from js2py.pyjs import *\nvar = Scope( JS_BUILTINS )\nset_global_object(var)\n'''
def translate_js(js, top=TOP_GLOBAL):
    """Translate JavaScript source ``js`` to equivalent Python source.

    ``top`` is the prologue placed in front of the generated code.
    """
    # Take literals out step by step: constants, objects, arrays, functions.
    stripped, constants = remove_constants(js)
    stripped, objects, obj_count = remove_objects(stripped)
    stripped, arrays, arr_count = remove_arrays(stripped)
    reset_inline_count()
    stripped, hoisted, inline = remove_functions(stripped)
    # Translate flow and expressions of what is left.
    py_seed, to_register = translate_flow(stripped)
    # Register declared variables and the names of hoisted functions.
    names = to_register + list(hoisted.keys())
    header = top + 'var.registers(%s)\n' % str(names)
    # Hoisted functions are defined at the very top of the program.
    hoisted_defs = []
    for nested_name, (nested_block, nested_args) in hoisted.items():
        quoted = repr(nested_name)
        hoisted_defs.append(
            translate_func('PyJsLvalTempHoisted', nested_block, nested_args))
        hoisted_defs.append('PyJsLvalTempHoisted.func_name = %s\n' % quoted)
        hoisted_defs.append('\nvar.put(%s, PyJsLvalTempHoisted)\n' % quoted)
    # Inline functions are defined right before the line that uses them.
    for nested_name, (nested_block, nested_args) in inline.items():
        definition = translate_func(nested_name, nested_block, nested_args)
        py_seed = inject_before_lval(py_seed,
                                     nested_name.split('@')[0], definition)
    # Hoisted definitions contain literals that still have to be recovered.
    py_seed = ''.join(hoisted_defs) + py_seed
    for arr_lval, arr_code in arrays.items():
        translation, obj_count, arr_count = translate_array(
            arr_code, arr_lval, obj_count, arr_count)
        py_seed = inject_before_lval(py_seed, arr_lval, translation)
    for obj_lval, obj_code in objects.items():
        translation, obj_count, arr_count = translate_object(
            obj_code, obj_lval, obj_count, arr_count)
        py_seed = inject_before_lval(py_seed, obj_lval, translation)
    return header + recover_constants(py_seed, constants)
def translate_func(name, block, args):
    """Translate a function and all functions nested in it to Python code.

    name  -- name of the function (global functions will be available under
             var while inline ones are available directly under this name).
             Inline names have the form LVAL_NAME@REAL_NAME.
    block -- code of the function (*with* brackets {}).
    args  -- names of the arguments the function takes.
    """
    real_name = ''
    if name.startswith('PyJsLvalInline'):
        name, real_name = name.split('@')
    arglist = (', '.join(args) + ', ') if args else ''
    header = '@Js\ndef %s(%sthis, arguments, var=var):\n' % (name, arglist)
    # Entries of the dictionary literal that becomes the local scope.
    scope_items = ["'this':this, 'arguments':arguments"]
    scope_items.extend('%s:%s' % (repr(arg), arg) for arg in args)
    if real_name:
        scope_items.append('%s:%s' % (repr(real_name), name))
    parts = [
        header,
        indent('var = Scope({%s}, var)\n' % ', '.join(scope_items))
    ]
    block, nested_hoisted, nested_inline = remove_functions(block)
    py_code, to_register = translate_flow(block)
    # Register variables declared with var and names of hoisted functions.
    to_register += nested_hoisted.keys()
    if to_register:
        parts.append(indent('var.registers(%s)\n' % str(to_register)))
    # Definitions of hoisted functions go on top of the body.
    for nested_name, (nested_block, nested_args) in nested_hoisted.items():
        quoted = repr(nested_name)
        parts.append(
            indent(
                translate_func('PyJsLvalTempHoisted', nested_block,
                               nested_args)))
        parts.append(indent('PyJsLvalTempHoisted.func_name = %s\n' % quoted))
        parts.append(indent('var.put(%s, PyJsLvalTempHoisted)\n' % quoted))
    # Definitions of inline functions are injected just before their usage.
    for nested_name, (nested_block, nested_args) in nested_inline.items():
        definition = translate_func(nested_name, nested_block, nested_args)
        py_code = inject_before_lval(py_code,
                                     nested_name.split('@')[0], definition)
    if py_code.strip():
        parts.append(indent(py_code))
    return ''.join(parts)
set_func_translator(translate_func)
#print inject_before_lval(' chuj\n moj\n lval\nelse\n', 'lval', 'siema\njestem piter\n')
import time
#print time.time()
#print translate_js('if (1) console.log("Hello, World!"); else if (5) console.log("Hello world?");')
#print time.time()
# Sample JavaScript program that is translated when this module runs as a script.
t = """
var x = [1,2,3,4,5,6];
for (var e in x) {console.log(e); delete x[3];}
console.log(5 in [1,2,3,4,5]);
"""
# Template for the generated script: %s receives the translated code, which
# runs inside a try block that prints the traceback on failure.
SANDBOX = '''
import traceback
try:
%s
except:
print traceback.format_exc()
print
raw_input('Press Enter to quit')
'''
if __name__ == '__main__':
    # test with jq if works then it really works :)
    #with open('jq.js', 'r') as f:
    #jq = f.read()
    #res = translate_js(jq)
    translated = translate_js(t)
    dbg(SANDBOX % indent(translated))
    print('Done')
+91
View File
@@ -0,0 +1,91 @@
import sys
import unicodedata
from collections import defaultdict
def is_lval(t):
    """Tell whether ``t`` is shaped like an identifier.

    The first character must be in IDENTIFIER_START and all others in
    IDENTIFIER_PART. Does not check whether ``t`` is reserved or internal.
    Returns False for an empty ``t``.
    """
    if not t:
        return False
    chars = iter(t)
    # Builtin next() works on Python 2.6+ and 3; the iterator method .next()
    # exists on Python 2 only.
    if next(chars) not in IDENTIFIER_START:
        return False
    return all(ch in IDENTIFIER_PART for ch in chars)
def is_valid_lval(t):
    """Decide whether t may be used as a JavaScript identifier name.

    t qualifies when it is not one of the translator's internal names
    (is_internal), is spelled like an identifier (is_lval) and is not a
    reserved word such as var, function or if (RESERVED_NAMES).
    """
    return not is_internal(t) and is_lval(t) and t not in RESERVED_NAMES
def is_plval(t):
    """Return True when t starts with the 'PyJsLval' prefix."""
    prefix = 'PyJsLval'
    return t.startswith(prefix)
def is_marker(t):
    """Return True when t starts with 'PyJsMarker' or 'PyJsConstant'."""
    # str.startswith accepts a tuple of alternative prefixes.
    return t.startswith(('PyJsMarker', 'PyJsConstant'))
def is_internal(t):
    """Return True when t is a name reserved for the translator itself.

    That is: an lval placeholder (is_plval), a marker/constant name
    (is_marker), or 'var', which names the scope variable.
    """
    generated = any(check(t) for check in (is_plval, is_marker))
    return generated or t == 'var'  # var is a scope var
def is_property_accessor(t):
    """Return True when t contains a '[' or a '.' anywhere."""
    return any(symbol in t for symbol in ('[', '.'))
def is_reserved(t):
    """Return True when t is one of the names in RESERVED_NAMES."""
    reserved = t in RESERVED_NAMES
    return reserved
# Recipe reference (presumably for the U_CATEGORIES table built below):
# http://stackoverflow.com/questions/14245893/efficiently-list-all-characters-in-a-given-unicode-category

# Format-control characters.
BOM = u'\uFEFF'  # byte order mark
ZWNJ = u'\u200C'  # zero width non-joiner
ZWJ = u'\u200D'  # zero width joiner

# White space characters.
TAB = u'\u0009'  # horizontal tab
VT = u'\u000B'  # vertical tab
FF = u'\u000C'  # form feed
SP = u'\u0020'  # space
NBSP = u'\u00A0'  # no-break space

# Line terminator characters.
LF = u'\u000A'  # line feed
CR = u'\u000D'  # carriage return
LS = u'\u2028'  # line separator
PS = u'\u2029'  # paragraph separator
# Table mapping each Unicode general-category code (e.g. 'Lu', 'Nd') to the
# list of characters in that category, in ascending code point order.
# Thank you Martijn Pieters!
#
# unichr exists only on Python 2; on Python 3 the builtin chr already covers
# the whole code point range, so pick the converter by interpreter version.
_char_from_code = chr if sys.version_info[0] >= 3 else unichr
U_CATEGORIES = defaultdict(list)
# The loop variable c stays bound at module level afterwards, as it always has.
for c in map(_char_from_code, range(sys.maxunicode + 1)):
    U_CATEGORIES[unicodedata.category(c)].append(c)

# Character classes used by the identifier definitions, built from the
# category lists above.
UNICODE_LETTER = set(U_CATEGORIES['Lu'] + U_CATEGORIES['Ll'] +
                     U_CATEGORIES['Lt'] + U_CATEGORIES['Lm'] +
                     U_CATEGORIES['Lo'] + U_CATEGORIES['Nl'])
UNICODE_COMBINING_MARK = set(U_CATEGORIES['Mn'] + U_CATEGORIES['Mc'])
UNICODE_DIGIT = set(U_CATEGORIES['Nd'])
UNICODE_CONNECTOR_PUNCTUATION = set(U_CATEGORIES['Pc'])
# Characters allowed at the start of an identifier: any Unicode letter, '$'
# or '_'.  (A unicode escape sequence may also start an identifier, but a
# character set cannot express that, so it is not covered here.)
IDENTIFIER_START = UNICODE_LETTER | {'$', '_'}
# Characters allowed after the first one: everything in IDENTIFIER_START plus
# combining marks, digits, connector punctuation and the two zero-width
# joiner characters.
IDENTIFIER_PART = (IDENTIFIER_START
                   | UNICODE_COMBINING_MARK
                   | UNICODE_DIGIT
                   | UNICODE_CONNECTOR_PUNCTUATION
                   | {ZWJ, ZWNJ})
# Characters of Unicode category Zs (this is the category list itself, not a
# copy of it).
USP = U_CATEGORIES['Zs']
# JavaScript keywords.
KEYWORD = {
    'break', 'do', 'instanceof', 'typeof',
    'case', 'else', 'new', 'var',
    'catch', 'finally', 'return', 'void',
    'continue', 'for', 'switch', 'while',
    'debugger', 'function', 'this', 'with',
    'default', 'if', 'throw', 'delete',
    'in', 'try',
}
# Words reserved for future use.
FUTURE_RESERVED_WORD = {
    'class', 'enum', 'extends', 'super',
    'const', 'export', 'import',
}
# Every name that cannot be used as an identifier: the keywords, the future
# reserved words and the null / boolean literals.
RESERVED_NAMES = KEYWORD | FUTURE_RESERVED_WORD | {'null', 'false', 'true'}
# White space characters: the explicitly named ones plus every character of
# Unicode category Zs (USP).
WHITE = {TAB, VT, FF, SP, NBSP, BOM}.union(USP)
# Characters that terminate a line.
LINE_TERMINATOR = {LF, CR, LS, PS}
# The same line terminators as a list; element order follows the set's
# iteration order.
LLINE_TERMINATOR = list(LINE_TERMINATOR)
# All white space characters followed by all line terminators, joined into a
# single string (order inside each part follows set iteration order).
# NOTE(review): no use of x is visible in this section -- confirm whether
# anything relies on it before renaming or removing it.
x = ''.join(WHITE) + ''.join(LINE_TERMINATOR)
# Every white space or line terminator character.
SPACE = WHITE.union(LINE_TERMINATOR)
# The line terminators plus the two-character CR LF sequence.
LINE_TERMINATOR_SEQUENCE = LINE_TERMINATOR.union({CR + LF})