1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
|
#!/usr/bin/env python2
#
# Scan potential external strings from ECMAScript and C files.
#
# Very simplistic example with a lot of limitations:
#
# - Doesn't handle multiple variables in a variable declaration
#
# - Only extracts strings from C files, these may correspond to
# Duktape/C bindings (but in many cases don't)
#
import base64
import json
import os
import re
import sys
# Set of scanned strings, kept as a dict whose keys are the strings (the
# values are always True).  scan() adds entries, main() reads the keys.
strmap = {}
# ECMAScript function declaration; group 1 is the function name
re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)
# ECMAScript variable declaration; group 1 is the variable name
# XXX: doesn't handle multiple variables
re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)
# ECMAScript variable assignment; group 1 is the word preceding the '='
re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)
# ECMAScript dotted property reference (also matches numbers like
# '4.0', which are separately rejected below); group 1 is the whole
# dotted chain
re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)
# Matches a string consisting only of digits; used to reject the numeric
# matches produced by re_propref
re_digits = re.compile(r'^\d+$', re.UNICODE)
# ECMAScript or C string literal; group 1 is the literal including its
# surrounding quotes.  Escaped quotes and escaped backslashes are accepted
# inside the literal.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)
def strDecode(x):
    """Decode a quoted string literal into a Unicode string.

    x is the literal text including its surrounding single or double
    quotes.  Returns the decoded string, or None if the literal cannot be
    parsed (a diagnostic is then written to stderr and the literal is
    ignored).
    """
    # Need to decode hex, unicode, and other escapes.  Python syntax
    # is close enough to C and ECMAScript so use eval for now.
    #
    # XXX(security): eval() is applied to text taken from the scanned
    # files, so only run this tool on trusted sources.  Something like
    # ast.literal_eval() would be a safer way to parse the literal.
    try:
        return eval('u' + x)  # interpret as unicode string
    except Exception:
        # Best effort: a literal that isn't valid Python syntax is skipped.
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None
def scan(f, fn):
    """Collect candidate strings from one source file into strmap.

    f is an iterable of lines as UTF-8 encoded byte strings (e.g. a file
    opened in binary mode) and fn is the file name, which selects the scan
    rules: names ending in '.c' only contribute double-quoted string
    literals, anything else is scanned with the ECMAScript rules
    (function names, variable declarations and assignments, dotted
    property references, and both kinds of string literal).
    """
    # Scan rules depend on file type
    is_c_source = fn.endswith('.c')
    if is_c_source:
        ident_patterns = ()
        literal_patterns = (re_strlit_dquot,)
    else:
        ident_patterns = (re_funcname, re_vardecl, re_varassign)
        literal_patterns = (re_strlit_dquot, re_strlit_squot)

    for raw in f:
        # Assume input data is UTF-8
        text = raw.decode('utf-8')

        # Plain identifiers: group 1 of each match is recorded as-is
        for pattern in ident_patterns:
            for found in pattern.finditer(text):
                strmap[found.group(1)] = True

        # Dotted property references: every component is recorded
        if not is_c_source:
            for found in re_propref.finditer(text):
                components = found.group(1).split('.')
                if re_digits.match(components[0]) is not None:
                    # Probably a number ('4.0' or such)
                    continue
                for component in components:
                    strmap[component] = True

        # String literals: recorded in decoded form, undecodable ones skipped
        for pattern in literal_patterns:
            for found in pattern.finditer(text):
                decoded = strDecode(found.group(1))
                if decoded is None:
                    continue
                strmap[decoded] = True
def main():
    """Scan every file named on the command line and print the result.

    Each command line argument is opened in binary mode and passed to
    scan().  The strings collected in strmap are then printed to stdout as
    a JSON document with two parallel, sorted lists: the strings
    themselves and their base64-encoded UTF-8 form.
    """
    for fn in sys.argv[1:]:
        # 'with' closes the file even if scan() raises
        with open(fn, 'rb') as f:
            scan(f, fn)

    # sorted() works whether keys() returns a list or a view
    strs = sorted(strmap)

    # base64.b64encode() never inserts line breaks, so long strings are
    # emitted as a single unbroken base64 token (the 'base64' string
    # codec used previously wrapped its output every 76 characters).
    # Decoding to ASCII keeps the values JSON-serializable as text.
    strs_base64 = [
        base64.b64encode(s.encode('utf-8')).decode('ascii') for s in strs
    ]

    doc = {
        # Strings as Unicode strings
        'scanned_strings': strs,
        # Strings as base64-encoded UTF-8 data, which should be ready
        # to be used in C code (Duktape internal string representation
        # is UTF-8)
        'scanned_strings_base64': strs_base64
    }
    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))


if __name__ == '__main__':
    main()
|