[go: up one dir, main page]

File: scan_strings.py

package info (click to toggle)
duktape 2.3.0-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 20,496 kB
  • sloc: ansic: 203,676; python: 5,856; makefile: 476; cpp: 205
file content (135 lines) | stat: -rw-r--r-- 3,846 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python2
#
#  Scan potential external strings from ECMAScript and C files.
#
#  Very simplistic example with a lot of limitations:
#
#    - Doesn't handle multiple variables in a variable declaration
#
#    - For C files, only double-quoted string literals are extracted;
#      these may correspond to Duktape/C bindings (but in many cases don't)
#

import os
import sys
import re
import json

# Accumulates every scanned string across all input files.  Used as a
# set: scan() stores each found string as a key with the value True.
strmap = {}

# ECMAScript function declaration: captures the identifier that follows
# the 'function' keyword.
re_funcname = re.compile(r'function\s+(\w+)', re.UNICODE)

# ECMAScript variable declaration: captures the identifier that follows
# the 'var' keyword.
# XXX: doesn't handle multiple variables
re_vardecl = re.compile(r'var\s+(\w+)', re.UNICODE)

# ECMAScript variable assignment: captures the identifier immediately
# to the left of an '=' sign.
# NOTE(review): the pattern does not look past the first '=', so it also
# matches the left operand of '==' / '===' comparisons.
re_varassign = re.compile(r'(\w+)\s*=\s*', re.UNICODE)

# ECMAScript dotted property reference (also matches numbers like
# '4.0', which are separately rejected below).  re_digits is the test
# used for that rejection: it matches a string made only of digits.
re_propref = re.compile(r'(\w+(?:\.\w+)+)', re.UNICODE)
re_digits = re.compile(r'^\d+$', re.UNICODE)

# ECMAScript or C string literal, double-quoted and single-quoted
# variants.  The capture group includes the surrounding quotes; escaped
# quotes and escaped backslashes are allowed inside the literal.
re_strlit_dquot = re.compile(r'("(?:\\"|\\\\|[^"])*")', re.UNICODE)
re_strlit_squot = re.compile(r'(\'(?:\\\'|\\\\|[^\'])*\')', re.UNICODE)

def strDecode(x):
    """Decode a quoted string literal token into a Unicode string.

    x is the literal exactly as it appears in the scanned source,
    including its surrounding quotes.  Returns the decoded string, or
    None (after writing a warning to stderr) when the literal cannot be
    parsed.
    """
    # Need to decode hex, unicode, and other escapes.  Python syntax
    # is close enough to C and ECMAScript so use eval for now.
    #
    # NOTE(review): eval() is applied to text taken from the scanned
    # files.  The callers in this file only pass regex-matched string
    # literals, but this should not be run on untrusted input as-is;
    # ast.literal_eval() would be a safer replacement to evaluate.

    try:
        return eval('u' + x)  # interpret as unicode string
    except Exception:
        # Best-effort: an unparseable literal is reported and skipped.
        # Catching Exception (rather than a bare except) lets SystemExit
        # and KeyboardInterrupt propagate instead of being swallowed.
        sys.stderr.write('Failed to parse: ' + repr(x) + ', ignoring\n')
        return None

def scan(f, fn):
    """Collect candidate strings from one input file into strmap.

    f is iterated line by line; each line is decoded as UTF-8, so it is
    expected to yield byte strings.  fn is the file name and is used only
    to pick the scan rules: names ending in '.c' get double-quoted string
    literals only, every other file gets the full ECMAScript rule set.

    Nothing is returned; every string found is recorded as a key of the
    module-level strmap.
    """
    global strmap

    # Scan rules depend on file type
    c_source = (fn[-2:] == '.c')
    if c_source:
        name_patterns = ()
        literal_patterns = (re_strlit_dquot,)
    else:
        name_patterns = (re_funcname, re_vardecl, re_varassign)
        literal_patterns = (re_strlit_dquot, re_strlit_squot)

    for raw_line in f:
        # Assume input data is UTF-8
        text = raw_line.decode('utf-8')

        # Function names, declared variables, assignment targets
        for pattern in name_patterns:
            for found in pattern.finditer(text):
                strmap[found.group(1)] = True

        # Dotted property references: record each component separately
        if not c_source:
            for found in re_propref.finditer(text):
                segments = found.group(1).split('.')
                if re_digits.match(segments[0]) is not None:
                    # Probably a number ('4.0' or such)
                    continue
                for segment in segments:
                    strmap[segment] = True

        # String literals; ones that fail to decode are skipped
        for pattern in literal_patterns:
            for found in pattern.finditer(text):
                decoded = strDecode(found.group(1))
                if decoded is None:
                    continue
                strmap[decoded] = True

def main():
    """Scan the files named on the command line and print a JSON report.

    Each argument in sys.argv[1:] is opened in binary mode and passed to
    scan().  The collected strings are then sorted and printed to stdout
    as a JSON document with two parallel lists: the strings themselves
    and their base64-encoded UTF-8 form.
    """
    # Local import: base64 is only needed for the report below.
    import base64

    for fn in sys.argv[1:]:
        # 'with' closes the file even if scan() raises part-way through.
        with open(fn, 'rb') as f:
            scan(f, fn)

    strs = []
    strs_base64 = []
    doc = {
        # Strings as Unicode strings
        'scanned_strings': strs,

        # Strings as base64-encoded UTF-8 data, which should be ready
        # to be used in C code (Duktape internal string representation
        # is UTF-8)
        'scanned_strings_base64': strs_base64
    }
    for s in sorted(strmap):
        strs.append(s)
        # b64encode() emits one unbroken line.  The 'base64' string codec
        # used previously wraps its output every 76 characters, which
        # left embedded newlines in the encoded form of long strings.
        t = base64.b64encode(s.encode('utf-8')).decode('ascii')
        strs_base64.append(t)

    print(json.dumps(doc, indent=4, ensure_ascii=True, sort_keys=True))

# Run the scanner only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()