Skip to content

Commit 2f6c2ac

Browse files
committed
Add bap.noeval_parser, tests, pytest/tox configs
1 parent ab38d35 commit 2f6c2ac

File tree

4 files changed

+790
-0
lines changed

4 files changed

+790
-0
lines changed

conftest.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
'''pytest configuration module'''
2+
import pytest # pylint: disable=import-error
3+
4+
# configure setup to skip slow tests by default (without --slow flag)
5+
def pytest_runtest_setup(item):
    """Skip tests if they are marked as slow and --slow is not given

    The ``slow`` mark is looked up with ``item.get_closest_marker`` when the
    item provides it, because newer pytest releases no longer expose marks as
    attributes of the test function.  Items without that method fall back to
    the legacy attribute lookup on ``item.obj``.
    """
    find_marker = getattr(item, 'get_closest_marker', None)
    if find_marker is not None:
        is_slow = find_marker('slow') is not None
    else:
        # legacy pytest: marks were attached to the test function itself
        is_slow = bool(getattr(item.obj, 'slow', None))
    if is_slow and not item.config.getoption('slow'):
        pytest.skip('slow tests not requested')
9+
10+
# add '--slow' flag to enable the slow tests, but default to False/disabled
11+
def pytest_addoption(parser):
    '''Register the --slow command line flag, which is off unless given'''
    slow_flag = dict(action='store_true', default=False,
                     help='Also run slow tests')
    parser.addoption('--slow', **slow_flag)
15+

src/bap/noeval_parser.py

Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
#! /usr/bin/env python3
2+
'''
3+
Parser for ADT string from bap that does not use eval
4+
5+
The naive eval-based version runs into out-of-memory conditions on large files
6+
'''
7+
import gc
8+
import sys
9+
import time
10+
11+
from . import bir
12+
13+
14+
def toint(string, start, end):
    '''
    Convert substring string[start:end] to integer/long without eval

    The substring may be surrounded by whitespace, may carry a trailing 'L'
    (Python 2 long suffix) and may start with '0x'/'0X' for base 16;
    otherwise base 10 is assumed.

    Raises ValueError if the text is not an integer literal of that form.
    '''
    # strip both ends so a suffix/prefix check is not defeated by whitespace
    istr = string[start:end].strip()

    if sys.version_info >= (3,): # then longs don't exist
        of_str = int
        if istr.endswith('L'):
            istr = istr.rstrip('L')
    elif istr.endswith('L'):
        of_str = long # pylint: disable=undefined-variable
    else:
        of_str = int
    if istr[:2].lower() == '0x':
        return of_str(istr, 16)
    return of_str(istr)
35+
36+
def setup_progress(totalitems):
    '''
    Generate functions to help track execution progress

    Returns the pair (interval, progress):
      interval() -> seconds elapsed since the last progress() call (or since
                    setup_progress() itself if progress() was never called)
      progress(itemsdone) -> (itemprogress, remaining), see progress()
    '''
    # one-element lists act as mutable cells shared by the closures below
    last_itemsdone = [0]
    last_timedone = [time.time()]

    def s_to_hms(remain_s):
        '''
        Convert seconds to (hours, minutes, seconds)
        '''
        # divmod keeps everything integral; plain '/' is true division on
        # Python 3 and would yield fractional hours and zero minutes
        remain_m, remain_s = divmod(remain_s, 60)
        remain_h, remain_m = divmod(remain_m, 60)
        return remain_h, remain_m, remain_s

    def progress(itemsdone):
        '''
        Convert itemsdone of totalitems into tuple with elements:
        1. tuple describing progress in units: (percent done, done, total)
        2. remaining time from s_to_hms(), estimated from the rate observed
           since the previous call; (-1, -1, -1) when no new items were
           completed since then
        '''
        itemprogress = (100.0*itemsdone/totalitems, itemsdone, totalitems)
        itemsleft = totalitems - itemsdone
        idelta = itemsdone - last_itemsdone[0]
        last_itemsdone[0] = itemsdone
        timedone = time.time()
        tdelta = timedone - last_timedone[0]
        last_timedone[0] = timedone
        if idelta > 0:
            s_per = tdelta / idelta
            remain_s = int(itemsleft * s_per)
            return itemprogress, s_to_hms(remain_s)
        return itemprogress, (-1, -1, -1)

    def interval():
        '''
        Return time since last progress() call
        '''
        return time.time() - last_timedone[0]

    return interval, progress
76+
77+
def _try_update_parent(parent, objs, stk):
78+
k = stk.pop() # pop the just evaluated item
79+
del objs[k] # preemtively remove since this is the most likely case
80+
if stk:
81+
pparent = objs[stk[-1]]
82+
assert isinstance(pparent, dict)
83+
assert pparent, 'parent is empty'
84+
assert pparent['typ'] != 'int', 'parent wrong type: %r' % (pparent['typ'])
85+
assert 'children' in pparent
86+
pparent['children'].append(parent)
87+
else: # put things back (unlikely)
88+
stk.append(k)
89+
objs[k] = parent
90+
91+
def _parse_str(in_c, in_s, i, objs, stk):
    '''
    Parse the double-quoted string starting at in_s[i]

    The text up to the next double-quote is unescaped, stored as the value of
    the top stack entry and handed to its parent; a fresh empty entry is then
    pushed for whatever follows.  Returns the index just past the closing
    quote.

    Raises ParserInputError if there is no closing double-quote.
    '''
    del in_c # unused
    endpos = in_s.find('"', i+1)
    if endpos < 0:
        raise ParserInputError("mismatched double-quote")
    k = stk[-1]
    assert all((in_s[_k] in (' ', '\t', '\n') for _k in range(k, i))), \
        'pre quote is not whitespace at [%d..%d)' % (k, i)
    raw = in_s[i+1:endpos]
    if sys.version_info > (3,):
        # unicode_escape decodes bytes as Latin-1, so encoding as UTF-8 would
        # garble literal non-ASCII characters.  Encode as Latin-1 instead and
        # let backslashreplace turn anything beyond it into \u/\U escapes
        # that the decoder restores.
        parent = objs[k] = raw.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    else:
        parent = objs[k] = raw.decode('string_escape')
    ## try adding the new item to its parent
    _try_update_parent(parent, objs, stk)
    # next obj
    i = endpos+1
    stk.append(i)
    objs[i] = {}
    return i
111+
112+
def _parse_finished(in_c, in_s, i, objs, stk):
113+
del in_c # unused
114+
# close an int, or make sure top object is empty and pop/return
115+
k = stk.pop()
116+
top = objs[k]
117+
del objs[k] # remove from hash
118+
if top: # must be an int
119+
assert isinstance(top, dict)
120+
if top.get('typ', None) != 'd':
121+
raise ParserInputError('Incomplete input stream')
122+
try:
123+
objs[k] = toint(in_s, k, i)
124+
except ValueError:
125+
raise ParserInputError("Integer expected between [%d..%d)" % (k, i))
126+
# push it back
127+
stk.append(k) # this is unlikely so put the extra work here
128+
return
129+
130+
def _parse_end(in_c, in_s, i, objs, stk):
    '''
    Handle ',', ')' or ']' at in_s[i]

    The top stack entry is finished first: a pending integer is converted
    and, if the entry is non-empty, it is appended to its parent's children.
    For ',' a fresh empty entry is pushed and parsing continues.  For a
    closing bracket the parent itself is completed -- as a list, a tuple, or
    a call of the bir attribute named by the parent's 'typ' -- and handed to
    its own parent.  Returns the index of the next character to read.

    Raises ParserInputError when there is no enclosing container, the closing
    bracket does not match the opening one, or an integer cannot be parsed.
    '''
    # NOTE(review): 'typedb' is not defined anywhere in the visible source,
    # so as far as can be seen this branch is taken on every call.
    if 'typedb' not in globals(): # first time through this function
        # Need access to bap.bir namespace, but avoid circular import
        global bir # pylint: disable=global-variable-not-assigned,invalid-name
        from .bap import bir
        # potential optimization: define a module-level typedb dict here to
        # cache the getattr(bir, name) lookups done below
    # pop last object
    k = stk.pop()
    top = objs[k]
    del objs[k] # remove from hash
    # look at parent
    if not stk:
        raise ParserInputError('Mismatched input stream')
    j = stk[-1]
    parent = objs[j]
    assert isinstance(parent, dict)
    assert parent, 'parent is empty'
    assert parent['typ'] != 'int', 'parent wrong type: %r' % (parent['typ'])
    assert 'children' in parent
    if top: # add to parent if non empty
        # make real int before appending
        if top['typ'] == 'd': # int
            try:
                top = toint(in_s, k, i)
            except ValueError:
                # report the start index k; 'top' is still a dict here and
                # cannot be formatted with %d
                raise ParserInputError("Integer expected between [%d..%d)" % (k, i))
        parent['children'].append(top)
    if in_c == ',': # add blank object and move on
        # next obj
        i = i+1
        stk.append(i)
        objs[i] = {}
        return i
    # we are ending a tuple/list/app do it
    ptyp = parent['typ']
    if ptyp == '[':
        if in_c != ']':
            raise ParserInputError('close %r and open %r mismatch' % (in_c, ptyp))
        parent = objs[j] = parent.get('children', []) # pylint: disable=redefined-variable-type
    elif ptyp == '(':
        if in_c != ')':
            raise ParserInputError('close %r and open %r mismatch' % (in_c, ptyp))
        parent = objs[j] = tuple(parent.get('children', ())) # pylint: disable=redefined-variable-type
    else:
        # anything else is the name of a callable in the bir namespace
        name = ptyp
        parent = objs[j] = getattr(bir, name)(*parent.get('children', ())) # pylint: disable=redefined-variable-type
    # now add to parent if exists
    _try_update_parent(parent, objs, stk)
    # next obj
    i = i+1
    stk.append(i)
    objs[i] = {}
    return i
191+
192+
def _parse_start(in_c, in_s, i, objs, stk):
193+
k = stk[-1]
194+
top = objs[k]
195+
if top: # not empty means app
196+
name_start = top['start'] # avoids whitespace issue
197+
name = in_s[name_start:i] # could just strip?
198+
top['typ'] = name
199+
else:
200+
top['typ'] = in_c # list or tuple
201+
top['children'] = []
202+
# next obj
203+
i = i+1
204+
stk.append(i)
205+
objs[i] = {}
206+
return i
207+
208+
def _parse_any(in_c, in_s, i, objs, stk):
209+
del in_s # unused
210+
# look at top to determine type
211+
top = objs[stk[-1]]
212+
if not top: # empty, so need to make type choice between int and app
213+
if in_c.isdigit():
214+
top['typ'] = 'd'
215+
elif in_c in (' ', "\t", "\n"): # ignore whitespace
216+
pass # no setting, skipping whitespace
217+
else:
218+
top['typ'] = 'a'
219+
top['start'] = i # needed since whitespace might make the stack index off
220+
else:
221+
pass # type choice is already made and this char is not interesting
222+
i = i + 1 # keep going!
223+
return i
224+
225+
# Dispatch table from a significant input character to its handler; any
# character not listed here is handled by _parse_any.
_parse_functions = {'"': _parse_str} # pylint: disable=invalid-name
_parse_functions.update(dict.fromkeys((')', ']', ','), _parse_end))
_parse_functions.update(dict.fromkeys(('(', '['), _parse_start))
233+
234+
def _parser(in_s, logger=None):
    '''
    Main no-eval parser implementation

    Walks in_s one significant character at a time, dispatching through
    _parse_functions (falling back to _parse_any).  `stk` holds the start
    indices of the items currently being built and `objs` maps each of those
    indices to the item's state; the finished root ends up in objs[0].

    If logger is given, progress is reported through logger.info whenever
    more than 5 seconds have passed since the previous report.

    Raises ParserInputError if the input ends before the root item is
    complete.
    '''
    i = 0
    s_len = len(in_s)
    stk = [0] # start with 'top' position in stack
    objs = {0:{}} # start with blank object
    in_c = None # only still None at end of input if in_s is empty
    # upon reading a character it always belongs to the top object
    # if the char ends the top object, then a new empty top is created
    interval_check, get_progress = setup_progress(s_len)
    while i <= s_len:
        if logger is not None and interval_check() > 5:
            progress, remaining = get_progress(i)
            logger.info("progress: %0.2f%% : %10d of %d" % progress)
            logger.info("remaining: %02d:%02d:%02d" % remaining)
        if i < s_len:
            in_c = in_s[i]
        else:
            assert i == s_len
            _parse_finished(in_c, in_s, i, objs, stk)
            break
        parse_func = _parse_functions.get(in_c, _parse_any)
        i = parse_func(in_c, in_s, i, objs, stk)
    # leftover (or missing) stack entries mean unbalanced or empty input;
    # report that as an input error rather than through an assert, which
    # would be stripped under -O
    if len(stk) != 1:
        raise ParserInputError('Incomplete input string')
    assert stk[0] == 0
    assert 0 in objs
    result = objs[0]
    if isinstance(result, dict):
        raise ParserInputError('Incomplete input string')
    return result
274+
275+
class ParserInputError(Exception):
    '''Raised when the text handed to the parser is malformed'''
278+
class ParserError(Exception):
    '''Raised for faults inside the parser itself rather than in its input'''
281+
282+
def parser(input_str, disable_gc=False, logger=None):
    '''
    Entrypoint to optimized adt parser.
    Input: string (non-empty); anything that is not a str is decoded as UTF-8
    Output: Python object equivalent to eval(input_str) in the context bap.bir

    Options: disable_gc: if true, no garbage collection is done while parsing;
                 the collector is put back in the state it was in before the
                 call, even if parsing fails
             logger: optional object whose info() receives progress messages

    Raises: ParserInputError if input_str is empty or malformed

    Notes: Expects a well formatted (ie. balanced) string with caveats:
        Only contains string representations of tuples, lists, integers, and
        function calls with name such that hasattr(bap.bir, name) is true.
        Integers may start with '0x' for base 16, otherwise base 10 is assumed.
        Strings must start and end with double-quote and not contain a
        double-quote, not even an escaped one
    '''
    # _parser expects a str
    if not isinstance(input_str, str):
        input_str = input_str.decode('utf-8')
    if input_str == '':
        raise ParserInputError("ADT Parser called on empty string")
    # only re-enable afterwards if the collector was running beforehand
    restore_gc = disable_gc and gc.isenabled()
    if disable_gc:
        gc.disable() # disable for better timing consistency during testing
    try:
        result = _parser(input_str, logger=logger)
    finally:
        if restore_gc:
            gc.enable()
    gc.collect() # force garbage collection to reclaim memory before we leave
    return result
309+
310+
# Descriptor pairing the 'adt' format name with the eval-free parser entry point
EVALFREE_ADT_PARSER = dict(format='adt', load=parser)
314+

0 commit comments

Comments
 (0)