# parser_cfg.py
import argparse
import logging
import sys
from typing import List, Tuple
logger = logging.getLogger(__name__)
END_OF_ALTERNATE_MARKER = 0
END_OF_RULE_MARKER = 0
END_OF_GRAMMAR_MARKER = 0xFFFF
TO_BE_FILLED_MARKER = 0
REF_RULE_MARKER = 1
LITERAL_MARKER = 2
########################
# EBNF Grammar Parsing #
########################
class ParseState:
    def __init__(self):
        self.symbol_table = {}
        self.grammar_encoding = []  # old name: out_grammar

    def print(self, file=sys.stdout):
        print_grammar(file, self)


def get_symbol_id(state: ParseState, symbol_name: str) -> int:
    if symbol_name not in state.symbol_table:
        state.symbol_table[symbol_name] = len(state.symbol_table)
    return state.symbol_table[symbol_name]


def generate_symbol_id(state: ParseState, base_name: str) -> int:
    next_id = len(state.symbol_table)
    state.symbol_table[base_name + "_" + str(next_id)] = next_id
    return next_id
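
# Illustrative sketch (hypothetical helper, not part of the original module): it
# shows that symbol ids are handed out in first-seen order and that
# generate_symbol_id registers a fresh, suffixed name for synthesized sub-rules.
def _demo_symbol_ids() -> None:
    state = ParseState()
    assert get_symbol_id(state, "root") == 0  # first symbol gets id 0
    assert get_symbol_id(state, "root") == 0  # repeated lookups reuse the same id
    assert get_symbol_id(state, "value") == 1  # a new symbol gets the next id
    # generate_symbol_id always mints a fresh id, keyed as "<base_name>_<id>"
    assert generate_symbol_id(state, "value") == 2
    assert "value_2" in state.symbol_table
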
def is_word_char(c: str) -> bool:
    """
    Check if a char is a-z, A-Z, 0-9, -, _, i.e., a char allowed in rule names.
    Returns:
        bool: True if the char is allowed in a rule name, False otherwise.
    """
    return c.isalnum() or c == "-" or c == "_"


def hex_to_int(c: str) -> int:
    """
    Convert a hex char to an int; c should be in the range 0-9, a-f, A-F
    (case insensitive).
    Args:
        c: a hex char
    Returns:
        int: the int value of the hex char, or -1 if c is not a valid hex char
    """
    if c.isdigit():
        return int(c)
    elif "a" <= c.lower() <= "f":
        return ord(c.lower()) - ord("a") + 10
    return -1
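
# Illustrative sketch (hypothetical helper, not in the original file): hex_to_int
# backs the \xNN escapes handled in parse_char below; invalid chars map to -1.
def _demo_hex_to_int() -> None:
    assert hex_to_int("7") == 7
    assert hex_to_int("a") == 10
    assert hex_to_int("F") == 15
    assert hex_to_int("g") == -1  # not a hex digit
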
def remove_leading_white_space(src: str, rm_leading_newline: bool) -> str:
    """
    Skips over whitespace and comments in the input string.
    This function processes the input string, skipping over any spaces, tabs,
    and content following a '#' character, which denotes a comment. The parsing
    of a comment continues until the end of the line (denoted by newline characters
    '\r' or '\n'). If the 'rm_leading_newline' parameter is set to False, the function
    will stop processing and return the remaining string upon encountering a
    newline character, otherwise it will skip over newline characters as well.
    Parameters:
        src (str): The input string to be processed.
        rm_leading_newline (bool): A flag indicating whether encountering a newline character
            should stop the parsing (False) or if it should be skipped (True).
    Returns:
        str: The remaining portion of the input string after skipping whitespace and comments.
    """
    pos = 0
    while pos < len(src) and (src[pos].isspace() or src[pos] == "#"):
        if src[pos] == "#":
            while pos < len(src) and src[pos] not in ("\r", "\n"):
                pos += 1
        else:
            if not rm_leading_newline and src[pos] in ("\r", "\n"):
                break
            pos += 1
    return src[pos:]
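
# Illustrative sketch (hypothetical helper, not in the original file) of the two
# whitespace-skipping modes: rm_leading_newline=True also consumes newlines and
# comments, while rm_leading_newline=False stops at the first newline so the
# caller can detect the end of a rule.
def _demo_remove_leading_white_space() -> None:
    src = "  # a comment\n  root ::= x\n"
    assert remove_leading_white_space(src, True) == "root ::= x\n"
    assert remove_leading_white_space(src, False) == "\n  root ::= x\n"
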
def parse_name(src: str) -> Tuple[str, str]:
    """
    parse the leading name from the input string
    Args:
        src: the input grammar string
    Returns:
        name, remaining_src
    """
    pos = 0
    while pos < len(src) and is_word_char(src[pos]):
        pos += 1
    if pos == 0:
        raise RuntimeError("expecting name at " + src)
    return src[:pos], src[pos:]
def parse_char(src: str) -> Tuple[str, str]:
    """
    parse the leading char from the input string
    :param src: the input grammar string
    :return: char, remaining_src
    """
    # if we have a backslash, it's maybe an escape
    if src[0] == "\\":
        esc = src[1]
        if esc == "x":
            first = hex_to_int(src[2])
            if first > -1:
                second = hex_to_int(src[3])
                if second > -1:
                    # return the decoded char so callers can uniformly apply ord()
                    return chr((first << 4) + second), src[4:]
            raise RuntimeError("expecting \\xNN at " + src)
        elif esc in ('"', "[", "]"):
            return esc, src[2:]
        elif esc == "r":
            return "\r", src[2:]
        elif esc == "n":
            return "\n", src[2:]
        elif esc == "t":
            return "\t", src[2:]
        raise RuntimeError("unknown escape at " + src)
    elif src:
        return src[0], src[1:]
    raise RuntimeError("unexpected end of input")
def _parse_rhs_literal_string(src: str, outbuf: List[int]) -> str:
    assert src[0] == '"', f"rule should start with '\"', but got {src[0]}"
    remaining_src = src[1:]

    # advance until we get an end quote or run out of input
    while remaining_src and remaining_src[0] != '"':
        char, remaining_src = parse_char(remaining_src)
        outbuf.append(LITERAL_MARKER)
        outbuf.append(ord(char))
        outbuf.append(ord(char))

    # in case we ran out of input before finding the end quote
    if not remaining_src:
        raise RuntimeError(f"expecting an end quote at {src}, but not found")

    # remove the end quote and return the remaining string
    return remaining_src[1:]
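
# Illustrative sketch (hypothetical helper, not in the original file) of the flat
# encoding a literal produces: each character becomes LITERAL_MARKER followed by
# its code point twice, i.e. a degenerate range c-c.
def _demo_literal_encoding() -> None:
    outbuf: List[int] = []
    remaining = _parse_rhs_literal_string('"ab" rest', outbuf)
    assert remaining == " rest"
    assert outbuf == [
        LITERAL_MARKER, ord("a"), ord("a"),
        LITERAL_MARKER, ord("b"), ord("b"),
    ]
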
def _parse_rhs_char_ranges(src: str, outbuf: List[int]) -> str:
    assert src[0] == "[", f"rule should start with '[', but got {src[0]}"
    remaining_src = src[1:]
    start_idx = len(outbuf)
    # num chars in range - replaced at end of loop
    outbuf.append(TO_BE_FILLED_MARKER)
    while remaining_src and remaining_src[0] != "]":
        char, remaining_src = parse_char(remaining_src)
        outbuf.append(ord(char))
        if remaining_src[0] == "-" and remaining_src[1] != "]":
            endchar_pair, remaining_src = parse_char(remaining_src[1:])
            outbuf.append(ord(endchar_pair))
        else:
            # This is the case for an enumeration, e.g., [0123456789], [abcdef]
            # Each char is considered as a range of itself, i.e., c-c
            outbuf.append(ord(char))
    if not remaining_src:
        raise RuntimeError(
            f"expecting an ] at {src}, but not found; is the char range closed?"
        )
    # replace num chars with actual
    outbuf[start_idx] = len(outbuf) - start_idx - 1
    return remaining_src[1:]
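
# Illustrative sketch (hypothetical helper, not in the original file) of the char
# range encoding: [num_range_chars, lo1, hi1, lo2, hi2, ...], where a bare char
# such as "0" becomes the degenerate range 0-0.
def _demo_char_range_encoding() -> None:
    outbuf: List[int] = []
    remaining = _parse_rhs_char_ranges("[a-z0] rest", outbuf)
    assert remaining == " rest"
    assert outbuf == [4, ord("a"), ord("z"), ord("0"), ord("0")]
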
def _parse_rhs_symbol_reference(src: str, state: ParseState, outbuf: List[int]) -> str:
    assert is_word_char(src[0]), f"rule should start with a word char, but got {src[0]}"
    name, remaining_src = parse_name(src)
    ref_rule_id = get_symbol_id(state, name)
    outbuf.append(REF_RULE_MARKER)
    outbuf.append(ref_rule_id)
    return remaining_src


def _parse_rhs_grouping(
    remaining_src: str, state: ParseState, rule_name: str, outbuf: List[int]
) -> str:
    assert (
        remaining_src[0] == "("
    ), f"rule should start with '(', but got {remaining_src[0]}"
    remaining_src = remove_leading_white_space(remaining_src[1:], True)
    # parse nested alternates into synthesized rule
    synthetic_rule_id = generate_symbol_id(state, rule_name)
    remaining_src = parse_rhs(state, remaining_src, rule_name, synthetic_rule_id, True)
    # output reference to synthesized rule
    outbuf.append(REF_RULE_MARKER)
    outbuf.append(synthetic_rule_id)
    if not remaining_src or remaining_src[0] != ")":
        raise RuntimeError("expecting ')' at " + remaining_src)
    return remaining_src[1:]
def _parse_rhs_repetition_operators(
    remaining_src: str,
    state: ParseState,
    rule_name: str,
    last_sym_start: int,
    outbuf: List[int],
) -> str:
    assert remaining_src[0] in (
        "*",
        "+",
        "?",
    ), f"rule should start with '*', '+', or '?', but got {remaining_src[0]}"
    out_grammar = state.grammar_encoding

    # last_sym_start = len(outbuf)
    # apply transformation to previous symbol (last_sym_start -
    # end) according to rewrite rules:
    # S* --> S' ::= S S' |
    # S+ --> S' ::= S S' | S
    # S? --> S' ::= S |
    sub_rule_id = generate_symbol_id(state, rule_name)
    out_grammar.append(sub_rule_id)
    sub_rule_offset = len(out_grammar)
    # placeholder for size of 1st alternate
    out_grammar.append(TO_BE_FILLED_MARKER)
    # add preceding symbol to generated rule
    out_grammar.extend(outbuf[last_sym_start:])
    if remaining_src[0] in ("*", "+"):
        # cause generated rule to recurse
        out_grammar.append(REF_RULE_MARKER)
        out_grammar.append(sub_rule_id)
    # apply actual size
    out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset
    # mark end of 1st alternate
    out_grammar.append(END_OF_ALTERNATE_MARKER)

    sub_rule_offset = len(out_grammar)
    # placeholder for size of 2nd alternate
    out_grammar.append(TO_BE_FILLED_MARKER)
    if remaining_src[0] == "+":
        # add preceding symbol as alternate only for '+'
        out_grammar.extend(outbuf[last_sym_start:])
    # apply actual size of 2nd alternate
    out_grammar[sub_rule_offset] = len(out_grammar) - sub_rule_offset
    # mark end of 2nd alternate, then end of rule
    out_grammar.append(END_OF_ALTERNATE_MARKER)
    out_grammar.append(END_OF_RULE_MARKER)

    # in original rule, replace previous symbol with reference to generated rule
    outbuf[last_sym_start:] = [REF_RULE_MARKER, sub_rule_id]
    return remaining_src[1:]
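
# Illustrative sketch (hypothetical helper and rule names, not in the original
# file) of the S* rewrite described above: the preceding symbol in outbuf is
# replaced by a reference to a synthesized rule  root_2 ::= ws root_2 |  whose
# empty second alternate terminates the repetition.
def _demo_repetition_rewrite() -> None:
    state = ParseState()
    get_symbol_id(state, "root")  # id 0
    ws_id = get_symbol_id(state, "ws")  # id 1
    # outbuf as it would look after parsing the reference "ws"
    outbuf = [TO_BE_FILLED_MARKER, REF_RULE_MARKER, ws_id]
    remaining = _parse_rhs_repetition_operators("* \n", state, "root", 1, outbuf)
    assert remaining == " \n"
    sub_rule_id = state.symbol_table["root_2"]  # synthesized rule, id 2
    # the original sequence now references the synthesized rule instead of "ws"
    assert outbuf == [TO_BE_FILLED_MARKER, REF_RULE_MARKER, sub_rule_id]
    # and the synthesized rule  root_2 ::= ws root_2 |  was appended to the grammar
    assert state.grammar_encoding == [
        sub_rule_id,
        5, REF_RULE_MARKER, ws_id, REF_RULE_MARKER, sub_rule_id, END_OF_ALTERNATE_MARKER,
        1, END_OF_ALTERNATE_MARKER,
        END_OF_RULE_MARKER,
    ]
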
def parse_simple_rhs(state, rhs: str, rule_name: str, outbuf, is_nested):
    simple_rhs_offset = len(outbuf)

    # sequence size, will be replaced at end when known
    outbuf.append(TO_BE_FILLED_MARKER)

    last_sym_start = len(outbuf)
    remaining_rhs = rhs
    while remaining_rhs:
        if remaining_rhs[0] == '"':  # literal string
            # mark the start of the last symbol, for repetition operator
            last_sym_start = len(outbuf)
            remaining_rhs = _parse_rhs_literal_string(remaining_rhs, outbuf)
        elif remaining_rhs[0] == "[":  # char range(s)
            # mark the start of the last symbol, for repetition operator
            last_sym_start = len(outbuf)
            remaining_rhs = _parse_rhs_char_ranges(remaining_rhs, outbuf)
        elif is_word_char(remaining_rhs[0]):  # rule reference
            # mark the start of the last symbol, for repetition operator
            last_sym_start = len(outbuf)
            remaining_rhs = _parse_rhs_symbol_reference(remaining_rhs, state, outbuf)
        elif remaining_rhs[0] == "(":  # grouping
            # mark the start of the last symbol, for repetition operator
            last_sym_start = len(outbuf)
            remaining_rhs = _parse_rhs_grouping(remaining_rhs, state, rule_name, outbuf)
        elif remaining_rhs[0] in ("*", "+", "?"):  # repetition operator
            # No need to mark the start of the last symbol, because we already did it
            if len(outbuf) - simple_rhs_offset - 1 == 0:
                raise RuntimeError(
                    "expecting preceding item to */+/? at " + remaining_rhs
                )
            remaining_rhs = _parse_rhs_repetition_operators(
                remaining_rhs, state, rule_name, last_sym_start, outbuf
            )
        else:
            # case for newline, '|', or ')', i.e., end of this alternate
            assert remaining_rhs[0] in [
                "\n",
                "|",
                ")",
            ], f"rule should end with newline, '|', or ')', but got {remaining_rhs[0]}"
            # we break here so that we call parse_rule again to parse the next rule
            break
        # We deliberately do not remove the newline here so that we can tell the rhs has ended
        remaining_rhs = remove_leading_white_space(
            remaining_rhs, rm_leading_newline=is_nested
        )

    # apply actual size of this alternate sequence
    outbuf[simple_rhs_offset] = len(outbuf) - simple_rhs_offset
    # mark end of alternate
    outbuf.append(END_OF_ALTERNATE_MARKER)
    return remaining_rhs
def parse_rhs(state, rhs: str, rule_name, rule_id, is_nested):
    outbuf = []
    remaining_rhs = parse_simple_rhs(state, rhs, rule_name, outbuf, is_nested)
    while remaining_rhs and remaining_rhs[0] == "|":
        remaining_rhs = remove_leading_white_space(remaining_rhs[1:], True)
        remaining_rhs = parse_simple_rhs(
            state, remaining_rhs, rule_name, outbuf, is_nested
        )

    # Now we have finished parsing the rhs, we can add the rule to the grammar_encoding
    state.grammar_encoding.append(rule_id)
    state.grammar_encoding.extend(outbuf)
    state.grammar_encoding.append(END_OF_RULE_MARKER)
    return remaining_rhs
def parse_rule(state: ParseState, rule_text: str) -> str:
    name, remaining_rule_text = parse_name(rule_text)
    remaining_rule_text = remove_leading_white_space(remaining_rule_text, False)
    # get (or create) the rule's symbol id; TODO: what should happen if the rule is already defined?
    rule_id = get_symbol_id(state, name)
    if remaining_rule_text[:3] != "::=":
        raise RuntimeError("expecting ::= at " + remaining_rule_text)
    remaining_rule_text = remove_leading_white_space(remaining_rule_text[3:], True)

    remaining_rule_text = parse_rhs(state, remaining_rule_text, name, rule_id, False)

    if remaining_rule_text and remaining_rule_text[0] == "\r":
        remaining_rule_text = (
            remaining_rule_text[2:]
            if remaining_rule_text[1] == "\n"
            else remaining_rule_text[1:]
        )
    elif remaining_rule_text and remaining_rule_text[0] == "\n":
        remaining_rule_text = remaining_rule_text[1:]
    elif remaining_rule_text:
        raise RuntimeError("expecting newline or end at " + remaining_rule_text)
    return remove_leading_white_space(remaining_rule_text, True)
def parse_ebnf(grammar_text: str) -> ParseState:
    try:
        state = ParseState()
        remaining_grammar_text = remove_leading_white_space(grammar_text, True)
        last_grammar_repr = ""
        while remaining_grammar_text:
            if last_grammar_repr:
                last_parsed_rule_len = len(last_grammar_repr) - len(
                    remaining_grammar_text
                )
                logger.debug(
                    f"last_parsed_rule: {last_grammar_repr[:last_parsed_rule_len]}"
                )
            last_grammar_repr = remaining_grammar_text
            remaining_grammar_text = parse_rule(state, remaining_grammar_text)
        state.grammar_encoding.append(END_OF_GRAMMAR_MARKER)
        return state
    except RuntimeError as err:
        logger.warning("error parsing grammar: %s", err)
        return ParseState()
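
# Illustrative end-to-end sketch (hypothetical helper and grammar, not in the
# original file): parse a tiny two-rule grammar and inspect the result.
def _demo_parse_ebnf() -> None:
    state = parse_ebnf('root ::= "a" digit\ndigit ::= [0-9]\n')
    assert state.symbol_table == {"root": 0, "digit": 1}
    assert state.grammar_encoding[-1] == END_OF_GRAMMAR_MARKER
    state.print()  # human-readable dump of the encoded rules
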
###################################
# EBNF Grammar Parsing ends here #
###################################
def break_grammar_into_rules(grammar_encoding: List[int]) -> List[List[int]]:
    offset = 0
    # we loop until we reach the end of the grammar_encoding
    rule_encodings = []
    i = 0
    while i < len(grammar_encoding) - 2:
        if (
            grammar_encoding[i] == END_OF_ALTERNATE_MARKER
            and grammar_encoding[i + 1] == END_OF_RULE_MARKER
        ):
            rule_encodings.append(grammar_encoding[offset : i + 2])
            offset = i + 2
            # skip the END_OF_RULE_MARKER
            # This is mandatory because if we do not skip the END_OF_RULE_MARKER
            # we fail in the case where the next rule has rule_id 0
            i += 1
        i += 1
    return rule_encodings


def break_rule_into_elements(rule_encoding: List[int]) -> List[List[int]]:
    rule_id = rule_encoding.pop(0)
    end_of_rule_marker = rule_encoding.pop(-1)
    assert (
        end_of_rule_marker == END_OF_RULE_MARKER
    ), f"rule should end with {END_OF_RULE_MARKER}, but got {end_of_rule_marker}"

    offset = 0
    elements = []
    while offset < len(rule_encoding):
        element_size = rule_encoding[offset]
        assert (
            rule_encoding[offset + element_size] == END_OF_ALTERNATE_MARKER
        ), f"element should end with {END_OF_ALTERNATE_MARKER}, but got {rule_encoding[offset + element_size]}"
        elements.append(rule_encoding[offset : offset + element_size + 1])
        offset += element_size + 1
    return elements
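
# Illustrative sketch (hypothetical helper, not in the original file): slice the
# flat encoding back into rules, then into one element per alternate.
def _demo_break_encoding() -> None:
    state = parse_ebnf('root ::= "a" | "b"\n')
    rules = break_grammar_into_rules(state.grammar_encoding)
    assert len(rules) == 1
    elements = break_rule_into_elements(rules[0])
    assert len(elements) == 2  # one element per "|" alternate: "a" and "b"
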
def _print_annotated_grammar(file, grammar_encoding, symbol_id_names, index=0):
    rule_id = grammar_encoding[index]
    print(f"<{index}>{symbol_id_names[rule_id]} ::=", end=" ", file=file)
    pos = index + 1
    while grammar_encoding[pos]:
        if pos - 1 > index:
            print("|", end=" ", file=file)
        pos += 1  # sequence size, not needed here
        while grammar_encoding[pos]:
            if grammar_encoding[pos] == REF_RULE_MARKER:
                ref_rule_id = grammar_encoding[pos + 1]
                print(
                    f"<{pos}>{symbol_id_names[ref_rule_id]}",
                    end=" ",
                    file=file,
                )
                pos += 2
            else:
                print("<{}>[".format(pos), end="", file=file)
                num_chars = grammar_encoding[pos]
                pos += 1
                for i in range(0, num_chars, 2):
                    print(
                        "{}-".format(chr(grammar_encoding[pos + i])), end="", file=file
                    )
                    if i + 1 < num_chars:
                        print(
                            "{}".format(chr(grammar_encoding[pos + i + 1])),
                            end="",
                            file=file,
                        )
                print("]", end=" ", file=file)
                pos += num_chars
        pos += 1
    print(file=file)
    return pos + 1
def print_grammar(file, state):
    pos = 0
    symbol_id_names = {v: k for k, v in state.symbol_table.items()}
    print("Grammar Rules:", file=file)
    while (
        pos < len(state.grammar_encoding)
        and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER
    ):
        pos = _print_annotated_grammar(
            file, state.grammar_encoding, symbol_id_names, pos
        )
    if pos > len(state.grammar_encoding):
        raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}")

    pos = 0
    print("\nGrammar Hex representation:", file=file)
    while (
        pos < len(state.grammar_encoding)
        and state.grammar_encoding[pos] != END_OF_GRAMMAR_MARKER
    ):
        print(f"{state.grammar_encoding[pos]:04x}", end=" ", file=file)
        pos += 1
    if pos > len(state.grammar_encoding):
        raise Warning(f"grammar_encoding is not ended with {END_OF_GRAMMAR_MARKER}")
    else:
        print("ffff\n", file=file)

    print("Rules Decimal representation:", file=file)
    # we loop until we reach the end of the grammar_encoding
    rule_encodings = break_grammar_into_rules(state.grammar_encoding)
    for rule_encoding in rule_encodings:
        rule_id = rule_encoding[0]
        print(
            f"<{rule_id}> {break_rule_into_elements(rule_encoding)}",
            file=file,
        )
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Parse EBNF grammar files.")
    parser.add_argument(
        "-g",
        "--grammar-file",
        nargs="?",
        default="examples/grammars/json.ebnf",
        help="Path to the grammar file (default: examples/grammars/json.ebnf)",
    )
    args = parser.parse_args()

    # set logging level
    logging.basicConfig(level=logging.DEBUG)

    with open(args.grammar_file, "r") as file:
        input_text = file.read()
    parsed_grammar = parse_ebnf(input_text)
    parsed_grammar.print()
    print(f"symbol_ids: \n{parsed_grammar.symbol_table}")

    start_rule_id = parsed_grammar.symbol_table["root"]