Actualizados
- allcalidad: Cambio de dominio - animeflv: Correción - streamcloud - Actualización interna
This commit is contained in:
@@ -0,0 +1,308 @@
|
||||
from string import ascii_lowercase, digits
|
||||
##################################
|
||||
StringName = u'PyJsConstantString%d_'
|
||||
NumberName = u'PyJsConstantNumber%d_'
|
||||
RegExpName = u'PyJsConstantRegExp%d_'
|
||||
##################################
|
||||
ALPHAS = set(ascii_lowercase + ascii_lowercase.upper())
|
||||
NUMS = set(digits)
|
||||
IDENTIFIER_START = ALPHAS.union(NUMS)
|
||||
ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}
|
||||
OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}
|
||||
HEX = set('0123456789abcdefABCDEF')
|
||||
from utils import *
|
||||
IDENTIFIER_PART = IDENTIFIER_PART.union({'.'})
|
||||
|
||||
|
||||
def _is_cancelled(source, n):
|
||||
cancelled = False
|
||||
k = 0
|
||||
while True:
|
||||
k += 1
|
||||
if source[n - k] != '\\':
|
||||
break
|
||||
cancelled = not cancelled
|
||||
return cancelled
|
||||
|
||||
|
||||
def _ensure_regexp(source, n): #<- this function has to be improved
|
||||
'''returns True if regexp starts at n else returns False
|
||||
checks whether it is not a division '''
|
||||
markers = '(+~"\'=[%:?!*^|&-,;/\\'
|
||||
k = 0
|
||||
while True:
|
||||
k += 1
|
||||
if n - k < 0:
|
||||
return True
|
||||
char = source[n - k]
|
||||
if char in markers:
|
||||
return True
|
||||
if char != ' ' and char != '\n':
|
||||
break
|
||||
return False
|
||||
|
||||
|
||||
def parse_num(source, start, charset):
|
||||
"""Returns a first index>=start of chat not in charset"""
|
||||
while start < len(source) and source[start] in charset:
|
||||
start += 1
|
||||
return start
|
||||
|
||||
|
||||
def parse_exponent(source, start):
|
||||
"""returns end of exponential, raises SyntaxError if failed"""
|
||||
if not source[start] in {'e', 'E'}:
|
||||
if source[start] in IDENTIFIER_PART:
|
||||
raise SyntaxError('Invalid number literal!')
|
||||
return start
|
||||
start += 1
|
||||
if source[start] in {'-', '+'}:
|
||||
start += 1
|
||||
FOUND = False
|
||||
# we need at least one dig after exponent
|
||||
while source[start] in NUMS:
|
||||
FOUND = True
|
||||
start += 1
|
||||
if not FOUND or source[start] in IDENTIFIER_PART:
|
||||
raise SyntaxError('Invalid number literal!')
|
||||
return start
|
||||
|
||||
|
||||
def remove_constants(source):
|
||||
'''Replaces Strings and Regexp literals in the source code with
|
||||
identifiers and *removes comments*. Identifier is of the format:
|
||||
|
||||
PyJsStringConst(String const number)_ - for Strings
|
||||
PyJsRegExpConst(RegExp const number)_ - for RegExps
|
||||
|
||||
Returns dict which relates identifier and replaced constant.
|
||||
|
||||
Removes single line and multiline comments from JavaScript source code
|
||||
Pseudo comments (inside strings) will not be removed.
|
||||
|
||||
For example this line:
|
||||
var x = "/*PSEUDO COMMENT*/ TEXT //ANOTHER PSEUDO COMMENT"
|
||||
will be unaltered'''
|
||||
source = ' ' + source + '\n'
|
||||
comments = []
|
||||
inside_comment, single_comment = False, False
|
||||
inside_single, inside_double = False, False
|
||||
inside_regexp = False
|
||||
regexp_class_count = 0
|
||||
n = 0
|
||||
while n < len(source):
|
||||
char = source[n]
|
||||
if char == '"' and not (inside_comment or inside_single
|
||||
or inside_regexp):
|
||||
if not _is_cancelled(source, n):
|
||||
if inside_double:
|
||||
inside_double[1] = n + 1
|
||||
comments.append(inside_double)
|
||||
inside_double = False
|
||||
else:
|
||||
inside_double = [n, None, 0]
|
||||
elif char == "'" and not (inside_comment or inside_double
|
||||
or inside_regexp):
|
||||
if not _is_cancelled(source, n):
|
||||
if inside_single:
|
||||
inside_single[1] = n + 1
|
||||
comments.append(inside_single)
|
||||
inside_single = False
|
||||
else:
|
||||
inside_single = [n, None, 0]
|
||||
elif (inside_single or inside_double):
|
||||
if char in LINE_TERMINATOR:
|
||||
if _is_cancelled(source, n):
|
||||
if char == CR and source[n + 1] == LF:
|
||||
n += 1
|
||||
n += 1
|
||||
continue
|
||||
else:
|
||||
raise SyntaxError(
|
||||
'Invalid string literal. Line terminators must be escaped!'
|
||||
)
|
||||
else:
|
||||
if inside_comment:
|
||||
if single_comment:
|
||||
if char in LINE_TERMINATOR:
|
||||
inside_comment[1] = n
|
||||
comments.append(inside_comment)
|
||||
inside_comment = False
|
||||
single_comment = False
|
||||
else: # Multiline
|
||||
if char == '/' and source[n - 1] == '*':
|
||||
inside_comment[1] = n + 1
|
||||
comments.append(inside_comment)
|
||||
inside_comment = False
|
||||
elif inside_regexp:
|
||||
if not quiting_regexp:
|
||||
if char in LINE_TERMINATOR:
|
||||
raise SyntaxError(
|
||||
'Invalid regexp literal. Line terminators cant appear!'
|
||||
)
|
||||
if _is_cancelled(source, n):
|
||||
n += 1
|
||||
continue
|
||||
if char == '[':
|
||||
regexp_class_count += 1
|
||||
elif char == ']':
|
||||
regexp_class_count = max(regexp_class_count - 1, 0)
|
||||
elif char == '/' and not regexp_class_count:
|
||||
quiting_regexp = True
|
||||
else:
|
||||
if char not in IDENTIFIER_START:
|
||||
inside_regexp[1] = n
|
||||
comments.append(inside_regexp)
|
||||
inside_regexp = False
|
||||
elif char == '/' and source[n - 1] == '/':
|
||||
single_comment = True
|
||||
inside_comment = [n - 1, None, 1]
|
||||
elif char == '*' and source[n - 1] == '/':
|
||||
inside_comment = [n - 1, None, 1]
|
||||
elif char == '/' and source[n + 1] not in ('/', '*'):
|
||||
if not _ensure_regexp(source, n): #<- improve this one
|
||||
n += 1
|
||||
continue #Probably just a division
|
||||
quiting_regexp = False
|
||||
inside_regexp = [n, None, 2]
|
||||
elif not (inside_comment or inside_regexp):
|
||||
if (char in NUMS and
|
||||
source[n - 1] not in IDENTIFIER_PART) or char == '.':
|
||||
if char == '.':
|
||||
k = parse_num(source, n + 1, NUMS)
|
||||
if k == n + 1: # just a stupid dot...
|
||||
n += 1
|
||||
continue
|
||||
k = parse_exponent(source, k)
|
||||
elif char == '0' and source[n + 1] in {
|
||||
'x', 'X'
|
||||
}: #Hex number probably
|
||||
k = parse_num(source, n + 2, HEX)
|
||||
if k == n + 2 or source[k] in IDENTIFIER_PART:
|
||||
raise SyntaxError('Invalid hex literal!')
|
||||
else: #int or exp or flot or exp flot
|
||||
k = parse_num(source, n + 1, NUMS)
|
||||
if source[k] == '.':
|
||||
k = parse_num(source, k + 1, NUMS)
|
||||
k = parse_exponent(source, k)
|
||||
comments.append((n, k, 3))
|
||||
n = k
|
||||
continue
|
||||
n += 1
|
||||
res = ''
|
||||
start = 0
|
||||
count = 0
|
||||
constants = {}
|
||||
for end, next_start, typ in comments:
|
||||
res += source[start:end]
|
||||
start = next_start
|
||||
if typ == 0: # String
|
||||
name = StringName
|
||||
elif typ == 1: # comment
|
||||
continue
|
||||
elif typ == 2: # regexp
|
||||
name = RegExpName
|
||||
elif typ == 3: # number
|
||||
name = NumberName
|
||||
else:
|
||||
raise RuntimeError()
|
||||
res += ' ' + name % count + ' '
|
||||
constants[name % count] = source[end:next_start]
|
||||
count += 1
|
||||
res += source[start:]
|
||||
# remove this stupid white space
|
||||
for e in WHITE:
|
||||
res = res.replace(e, ' ')
|
||||
res = res.replace(CR + LF, '\n')
|
||||
for e in LINE_TERMINATOR:
|
||||
res = res.replace(e, '\n')
|
||||
return res.strip(), constants
|
||||
|
||||
|
||||
def recover_constants(py_source,
|
||||
replacements): #now has n^2 complexity. improve to n
|
||||
'''Converts identifiers representing Js constants to the PyJs constants
|
||||
PyJsNumberConst_1_ which has the true value of 5 will be converted to PyJsNumber(5)'''
|
||||
for identifier, value in replacements.iteritems():
|
||||
if identifier.startswith('PyJsConstantRegExp'):
|
||||
py_source = py_source.replace(identifier,
|
||||
'JsRegExp(%s)' % repr(value))
|
||||
elif identifier.startswith('PyJsConstantString'):
|
||||
py_source = py_source.replace(
|
||||
identifier, 'Js(u%s)' % unify_string_literals(value))
|
||||
else:
|
||||
py_source = py_source.replace(identifier, 'Js(%s)' % value)
|
||||
return py_source
|
||||
|
||||
|
||||
def unify_string_literals(js_string):
|
||||
"""this function parses the string just like javascript
|
||||
for example literal '\d' in JavaScript would be interpreted
|
||||
as 'd' - backslash would be ignored and in Pyhon this
|
||||
would be interpreted as '\\d' This function fixes this problem."""
|
||||
n = 0
|
||||
res = ''
|
||||
limit = len(js_string)
|
||||
while n < limit:
|
||||
char = js_string[n]
|
||||
if char == '\\':
|
||||
new, n = do_escape(js_string, n)
|
||||
res += new
|
||||
else:
|
||||
res += char
|
||||
n += 1
|
||||
return res
|
||||
|
||||
|
||||
def unify_regexp_literals(js):
|
||||
pass
|
||||
|
||||
|
||||
def do_escape(source, n):
|
||||
"""Its actually quite complicated to cover every case :)
|
||||
http://www.javascriptkit.com/jsref/escapesequence.shtml"""
|
||||
if not n + 1 < len(source):
|
||||
return '' # not possible here but can be possible in general case.
|
||||
if source[n + 1] in LINE_TERMINATOR:
|
||||
if source[n + 1] == CR and n + 2 < len(source) and source[n + 2] == LF:
|
||||
return source[n:n + 3], n + 3
|
||||
return source[n:n + 2], n + 2
|
||||
if source[n + 1] in ESCAPE_CHARS:
|
||||
return source[n:n + 2], n + 2
|
||||
if source[n + 1] in {'x', 'u'}:
|
||||
char, length = ('u', 4) if source[n + 1] == 'u' else ('x', 2)
|
||||
n += 2
|
||||
end = parse_num(source, n, HEX)
|
||||
if end - n < length:
|
||||
raise SyntaxError('Invalid escape sequence!')
|
||||
#if length==4:
|
||||
# return unichr(int(source[n:n+4], 16)), n+4 # <- this was a very bad way of solving this problem :)
|
||||
return source[n - 2:n + length], n + length
|
||||
if source[n + 1] in OCTAL:
|
||||
n += 1
|
||||
end = parse_num(source, n, OCTAL)
|
||||
end = min(end, n + 3) # cant be longer than 3
|
||||
# now the max allowed is 377 ( in octal) and 255 in decimal
|
||||
max_num = 255
|
||||
num = 0
|
||||
len_parsed = 0
|
||||
for e in source[n:end]:
|
||||
cand = 8 * num + int(e)
|
||||
if cand > max_num:
|
||||
break
|
||||
num = cand
|
||||
len_parsed += 1
|
||||
# we have to return in a different form because python may want to parse more...
|
||||
# for example '\777' will be parsed by python as a whole while js will use only \77
|
||||
return '\\' + hex(num)[1:], n + len_parsed
|
||||
return source[n + 1], n + 2
|
||||
|
||||
|
||||
#####TEST######
|
||||
|
||||
if __name__ == '__main__':
|
||||
test = ('''
|
||||
''')
|
||||
|
||||
t, d = remove_constants(test)
|
||||
print t, d
|
||||
Reference in New Issue
Block a user