|
| 1 | +#! /usr/bin/env python3 |
| 2 | +''' |
| 3 | +Parser for ADT string from bap that does not use eval |
| 4 | +
|
| 5 | +The naive eval-based version runs into out-of-memory conditions on large files |
| 6 | +''' |
| 7 | +import gc |
| 8 | +import sys |
| 9 | +import time |
| 10 | + |
| 11 | +from . import bir |
| 12 | + |
| 13 | + |
def toint(string, start, end):
    '''
    Convert substring string[start:end] to integer/long without eval

    The substring may carry surrounding whitespace, a leading sign, a
    '0x'/'0X' prefix (parsed as base 16, anything else as base 10) and a
    trailing Python 2 long suffix 'L'.

    Raises ValueError when the substring is not a valid integer literal.
    '''
    # Strip both ends: with only lstrip() a literal such as '12L ' hid its
    # 'L' suffix behind the trailing blank and failed to convert.
    istr = string[start:end].strip()

    of_str = int
    if istr.endswith('L'):
        if sys.version_info > (3,):  # then longs don't exist
            istr = istr.rstrip('L')
        else:
            of_str = long  # pylint: disable=undefined-variable

    # Look past an optional sign so '-0x10' and '0X1f' are seen as hex too;
    # int()/long() accept the sign and the prefix themselves in base 16.
    unsigned = istr[1:] if istr[:1] in ('+', '-') else istr
    if unsigned[:2].lower() == '0x':
        return of_str(istr, 16)
    return of_str(istr)
| 35 | + |
def setup_progress(totalitems):
    '''
    Generate functions to help track execution progress

    Returns the pair (interval, progress):
      interval()          -> seconds elapsed since the last progress() call
                             (or since setup_progress() if never called)
      progress(itemsdone) -> ((percent, itemsdone, totalitems), (h, m, s))
                             where (h, m, s) is the estimated remaining time,
                             or (-1, -1, -1) when no estimate is possible
    '''
    # One-element lists act as mutable cells the closures can rebind
    # (keeps the code valid on Python 2, which has no 'nonlocal').
    last_itemsdone = [0]
    last_timedone = [time.time()]

    def s_to_hms(remain_s):
        '''
        Convert seconds to (hours, minutes, seconds)
        '''
        # divmod keeps everything integral; plain '/' is true division on
        # Python 3 and produced fractional hours with minutes stuck at 0.
        remain_m, remain_s = divmod(int(remain_s), 60)
        remain_h, remain_m = divmod(remain_m, 60)
        return remain_h, remain_m, remain_s

    def progress(itemsdone):
        '''
        Convert itemsdone of totalitems into tuple with elements:
        1. tuple describing progress in units: (percent, done, total)
        2. remaining time from s_to_hms()
        '''
        # Nothing to process counts as complete instead of dividing by zero.
        percent = 100.0 * itemsdone / totalitems if totalitems else 100.0
        itemprogress = (percent, itemsdone, totalitems)
        itemsleft = totalitems - itemsdone

        # Rate is measured over the window since the previous call only.
        idelta = itemsdone - last_itemsdone[0]
        last_itemsdone[0] = itemsdone
        timedone = time.time()
        tdelta = timedone - last_timedone[0]
        last_timedone[0] = timedone

        if idelta > 0:
            s_per = tdelta / idelta
            return itemprogress, s_to_hms(int(itemsleft * s_per))
        # No forward movement in this window: no rate, so no estimate.
        return itemprogress, (-1, -1, -1)

    def interval():
        '''
        Return time since last progress() call
        '''
        return time.time() - last_timedone[0]

    return interval, progress
| 76 | + |
| 77 | +def _try_update_parent(parent, objs, stk): |
| 78 | + k = stk.pop() # pop the just evaluated item |
| 79 | + del objs[k] # preemtively remove since this is the most likely case |
| 80 | + if stk: |
| 81 | + pparent = objs[stk[-1]] |
| 82 | + assert isinstance(pparent, dict) |
| 83 | + assert pparent, 'parent is empty' |
| 84 | + assert pparent['typ'] != 'int', 'parent wrong type: %r' % (pparent['typ']) |
| 85 | + assert 'children' in pparent |
| 86 | + pparent['children'].append(parent) |
| 87 | + else: # put things back (unlikely) |
| 88 | + stk.append(k) |
| 89 | + objs[k] = parent |
| 90 | + |
def _parse_str(in_c, in_s, i, objs, stk):
    '''
    Parse the double-quoted string that opens at in_s[i]

    The text up to the next double-quote is taken as the string body and
    its backslash escapes are decoded.  The result replaces the pending
    object on top of the stack and is passed to _try_update_parent(); a
    fresh empty object is then started right after the closing quote.

    Returns the index just past the closing quote.
    Raises ParserInputError if the closing quote is missing or if anything
    other than whitespace precedes the opening quote within the pending
    object.
    '''
    del in_c  # unused
    endpos = in_s.find('"', i+1)
    if endpos < 0:
        raise ParserInputError("mismatched double-quote")
    k = stk[-1]
    # Input validation, so raise instead of assert (asserts are stripped
    # under python -O and would let e.g. 'foo"bar"' through silently).
    if in_s[k:i].strip(' \t\n'):
        raise ParserInputError(
            'pre quote is not whitespace at [%d..%d)' % (k, i))
    if sys.version_info > (3,):
        # need to use unicode_escape of a bytes, but have a str
        parent = objs[k] = (in_s[i+1:endpos]).encode('utf-8').decode('unicode_escape')
    else:
        parent = objs[k] = in_s[i+1:endpos].decode('string_escape')
    # try adding the new item to its parent
    _try_update_parent(parent, objs, stk)
    # next obj
    i = endpos+1
    stk.append(i)
    objs[i] = {}
    return i
| 111 | + |
def _parse_finished(in_c, in_s, i, objs, stk):
    '''
    Wrap up parsing once the end of the input has been reached

    The pending object on top of the stack is removed.  An empty one is
    simply dropped; one marked as an integer ('typ' == 'd') is converted
    from in_s[pos:i] and pushed back as the value at its position.

    Raises ParserInputError for any other unfinished object or when the
    integer text does not convert.
    '''
    del in_c  # unused
    pos = stk.pop()
    pending = objs.pop(pos)  # remove from hash
    if not pending:
        # nothing was started after the last completed value
        return
    # anything non-empty left at end of input must be an int
    assert isinstance(pending, dict)
    if pending.get('typ', None) != 'd':
        raise ParserInputError('Incomplete input stream')
    try:
        objs[pos] = toint(in_s, pos, i)
    except ValueError:
        raise ParserInputError("Integer expected between [%d..%d)" % (pos, i))
    # push it back; this case is unlikely so the extra work lives here
    stk.append(pos)
| 129 | + |
def _parse_end(in_c, in_s, i, objs, stk):
    '''
    Handle a character that ends an item: ',' ')' or ']'

    The pending object on top of the stack is removed and, if non-empty,
    appended to the enclosing container's children (integers are converted
    from their source text first).  For ',' a new empty object is started.
    For a closing bracket the enclosing container is finalized into a list,
    a tuple, or the result of calling the bir attribute named by the
    application, and is then passed to _try_update_parent().

    Returns the index of the next character to read.
    Raises ParserInputError on unbalanced or mismatched input and on
    malformed integers.
    '''
    global bir, typedb  # pylint: disable=global-variable-undefined,invalid-name
    if 'typedb' not in globals():  # first time through this function
        # Need access to bap.bir namespace, but avoid circular import
        from .bap import bir
        # Creating typedb is what makes the guard above false afterwards, so
        # the import runs once instead of on every call; it also caches the
        # constructors looked up by name below.
        typedb = {}
    # pop last object
    k = stk.pop()
    top = objs[k]
    del objs[k]  # remove from hash
    # look at parent
    if not stk:
        raise ParserInputError('Mismatched input stream')
    j = stk[-1]
    parent = objs[j]
    # A finished value in the parent slot means the input carries on after a
    # complete top-level value (e.g. '"a",'): bad input, not a parser bug.
    if not isinstance(parent, dict):
        raise ParserInputError('Mismatched input stream')
    assert parent, 'parent is empty'
    assert parent['typ'] != 'int', 'parent wrong type: %r' % (parent['typ'])
    assert 'children' in parent
    if top:  # add to parent if non empty
        # NOTE(review): a non-empty top that is not an int (a bare name that
        # was never applied) is appended as its raw bookkeeping dict.
        # make real int before appending
        if top['typ'] == 'd':  # int
            try:
                top = toint(in_s, k, i)
            except ValueError:
                # report the start position k; formatting the dict 'top'
                # with %d raised TypeError instead of this error
                raise ParserInputError("Integer expected between [%d..%d)" % (k, i))
        parent['children'].append(top)
    if in_c == ',':  # add blank object and move on
        # next obj
        i = i+1
        stk.append(i)
        objs[i] = {}
        return i
    # we are ending a tuple/list/app do it
    ptyp = parent['typ']
    if ptyp == '[':
        if in_c != ']':
            raise ParserInputError('close %r and open %r mismatch' % (in_c, ptyp))
        parent = objs[j] = parent.get('children', [])  # pylint: disable=redefined-variable-type
    elif ptyp == '(':
        if in_c != ')':
            raise ParserInputError('close %r and open %r mismatch' % (in_c, ptyp))
        parent = objs[j] = tuple(parent.get('children', ()))  # pylint: disable=redefined-variable-type
    else:
        name = ptyp
        if name not in typedb:
            typedb[name] = getattr(bir, name)
        parent = objs[j] = typedb[name](*parent.get('children', ()))  # pylint: disable=redefined-variable-type
    # now add to parent if exists
    _try_update_parent(parent, objs, stk)
    # next obj
    i = i+1
    stk.append(i)
    objs[i] = {}
    return i
| 191 | + |
| 192 | +def _parse_start(in_c, in_s, i, objs, stk): |
| 193 | + k = stk[-1] |
| 194 | + top = objs[k] |
| 195 | + if top: # not empty means app |
| 196 | + name_start = top['start'] # avoids whitespace issue |
| 197 | + name = in_s[name_start:i] # could just strip? |
| 198 | + top['typ'] = name |
| 199 | + else: |
| 200 | + top['typ'] = in_c # list or tuple |
| 201 | + top['children'] = [] |
| 202 | + # next obj |
| 203 | + i = i+1 |
| 204 | + stk.append(i) |
| 205 | + objs[i] = {} |
| 206 | + return i |
| 207 | + |
| 208 | +def _parse_any(in_c, in_s, i, objs, stk): |
| 209 | + del in_s # unused |
| 210 | + # look at top to determine type |
| 211 | + top = objs[stk[-1]] |
| 212 | + if not top: # empty, so need to make type choice between int and app |
| 213 | + if in_c.isdigit(): |
| 214 | + top['typ'] = 'd' |
| 215 | + elif in_c in (' ', "\t", "\n"): # ignore whitespace |
| 216 | + pass # no setting, skipping whitespace |
| 217 | + else: |
| 218 | + top['typ'] = 'a' |
| 219 | + top['start'] = i # needed since whitespace might make the stack index off |
| 220 | + else: |
| 221 | + pass # type choice is already made and this char is not interesting |
| 222 | + i = i + 1 # keep going! |
| 223 | + return i |
| 224 | + |
# Dispatch table from a special input character to the function handling it;
# characters not listed here are handled by _parse_any.
_parse_functions = {'"': _parse_str}  # pylint: disable=invalid-name
_parse_functions.update(dict.fromkeys(')],', _parse_end))  # item/container ends
_parse_functions.update(dict.fromkeys('([', _parse_start))  # container starts
| 233 | + |
def _parser(in_s, logger=None):
    '''
    Main no-eval parser implementation

    in_s:   the text to parse
    logger: optional object with an info() method; when given, progress and
            an estimate of the remaining time are reported whenever more
            than 5 seconds have passed since the previous report

    Returns the parsed value.
    Raises ParserInputError when the input is empty, unbalanced or
    otherwise does not reduce to exactly one complete value.
    '''
    i = 0
    s_len = len(in_s)
    stk = [0]  # start with 'top' position in stack
    objs = {0: {}}  # start with blank object
    # upon reading a character it always belongs to the top object
    # if the char ends the top object, then a new empty top is created
    interval_check, get_progress = setup_progress(s_len)
    in_c = None  # stays None only when the input is empty
    while i < s_len:
        if logger is not None and interval_check() > 5:
            progress, remaining = get_progress(i)
            logger.info("progress: %0.2f%% : %10d of %d" % progress)
            logger.info("remaining: %02d:%02d:%02d" % remaining)
        in_c = in_s[i]
        parse_func = _parse_functions.get(in_c, _parse_any)
        i = parse_func(in_c, in_s, i, objs, stk)
    # end of input: close a trailing int or drop the empty pending object
    _parse_finished(in_c, in_s, i, objs, stk)
    # Exactly the outermost value must remain.  Anything else means the
    # input was empty or unbalanced; report it as bad input rather than via
    # assert, which would be stripped under python -O.
    if len(stk) != 1 or stk[0] != 0 or 0 not in objs:
        raise ParserInputError('Incomplete input string')
    result = objs[0]
    if isinstance(result, dict):
        # still bookkeeping, never turned into a real value
        raise ParserInputError('Incomplete input string')
    return result
| 274 | + |
class ParserInputError(Exception):
    '''Raised when the text handed to the parser is malformed'''
class ParserError(Exception):
    '''Raised for faults in the parser itself rather than in its input'''
| 281 | + |
def parser(input_str, disable_gc=False, logger=None):
    '''
    Entrypoint to optimized adt parser.
    Input: string (non-empty); anything that is not a str is decoded as UTF-8
    Output: Python object equivalent to eval(input_str) in the context bap.bir

    Options: disable_gc: if true, no garbage collection is done while parsing
             logger: optional logger handed to the parser for progress reports

    Raises: ParserInputError if input_str is empty or malformed

    Notes: Expects a well formatted (ie. balanced) string with caveats:
    Only contains string representations of tuples, lists, integers, and
    function calls with name such that hasattr(bap.bir, name) is true.
    Integers may start with '0x' for base 16, otherwise base 10 is assumed.
    Strings must start and end with double-quote and not contain a
    double-quote, not even an escaped one
    '''
    # _parser expects a str
    if not isinstance(input_str, str):
        input_str = input_str.decode('utf-8')
    if input_str == '':
        raise ParserInputError("ADT Parser called on empty string")
    if not disable_gc:
        return _parser(input_str, logger=logger)
    # Remember the collector state so a caller that already runs with gc
    # disabled does not get it switched on behind its back.
    gc_was_enabled = gc.isenabled()
    gc.disable()  # disable for better timing consistency during testing
    try:
        return _parser(input_str, logger=logger)
    finally:
        # runs on errors too, so a failed parse cannot leave gc turned off
        if gc_was_enabled:
            gc.enable()
        gc.collect()  # force garbage collection to reclaim memory before we leave
| 309 | + |
# Descriptor pairing the 'adt' format name with this module's parser()
# entry point as its loader.
EVALFREE_ADT_PARSER = dict(format='adt', load=parser)
| 314 | + |
0 commit comments