# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
A tokenizer for traffic annotation definitions.
"""

import re
from typing import NamedTuple, Optional

# Regexen that match a token inside the annotation definition arguments. Stored
# as a list instead of a dict, to preserve order.
#
# Order matters because otherwise, 'symbol' could be parsed before
# 'string_literal' (i.e., R"(...)" would be misinterpreted as the symbol 'R',
# followed by a string with parentheses in it).
TOKEN_REGEXEN = [
    # Comma for separating args.
    ('comma', re.compile(r'(,)')),
    # String literal: "string" or R"(string)". In Java, this will incorrectly
    # accept R-strings, which aren't part of the language's syntax. But since
    # that wouldn't compile anyway, we can just ignore this issue.
    ('string_literal', re.compile(r'"((?:\\.|[^"])*?)"|R"\((.*?)\)"',
                                  re.DOTALL)),
    # The '+' operator, for string concatenation. Java doesn't have multi-line
    # string literals, so this is the only way to keep long strings readable.
    # It doesn't incur a runtime cost, since the Java compiler is smart enough
    # to concatenate the string literals at compile time. See "constant
    # expressions" in the JLS:
    # https://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.28
    ('plus', re.compile(r'(\+)')),
    # C++ or Java identifier.
    ('symbol', re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*)')),
    # Left parenthesis.
    ('left_paren', re.compile(r'(\()')),
    # Right parenthesis.
    ('right_paren', re.compile(r'(\))')),
]
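
# For example: matched against the body 'R"(a b)"', the 'string_literal' entry
# wins because it precedes 'symbol' in the list above; its second alternative
# captures 'a b' in group 2, so Tokenizer._get_token() below yields
# Token('string_literal', 'a b', 8). Matched against '"a \"b\""', the first
# alternative captures the escaped text, and _get_token() un-escapes it to
# produce the value 'a "b"'.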

# Number of characters to include in the context (for error reporting).
CONTEXT_LENGTH = 20


class Token(NamedTuple):
  type: str
  value: str
  pos: int


class SourceCodeParsingError(Exception):
  """An error during C++ or Java parsing/tokenizing."""

  def __init__(self, expected_type, body, pos, file_path, line_number):
    context = body[pos:pos + CONTEXT_LENGTH]
    msg = ("Expected {} in annotation definition at {}:{}.\n" +
           "near '{}'").format(expected_type, file_path, line_number, context)
    Exception.__init__(self, msg)


class Tokenizer:
  """Simple tokenizer with basic error reporting.

  Use advance() or maybe_advance() to take tokens from the string, one at a
  time.
  """

  def __init__(self, body, file_path, line_number):
    self.body = body
    self.pos = 0
    self.file_path = file_path
    self.line_number = line_number

  def _assert_token_type(self, token, expected_type):
    """Like assert(), but reports errors in a _somewhat_ useful way."""
    if token and token.type == expected_type:
      return
    # Skip whitespace to make the error message more useful.
    pos = self._skip_whitespace()
    raise SourceCodeParsingError(expected_type, self.body, pos, self.file_path,
                                 self.line_number)

  def _skip_whitespace(self):
    """Return the position of the first non-whitespace character from here."""
    whitespace_re = re.compile(r'\s*')
    return whitespace_re.match(self.body, self.pos).end()

  def _get_token(self):
    """Return the token here, or None on failure."""
    # Skip initial whitespace.
    pos = self._skip_whitespace()
    # Find the token here, if there's one.
    token = None
    for (token_type, regex) in TOKEN_REGEXEN:
      re_match = regex.match(self.body, pos)
      if re_match:
        raw_token = re_match.group(0)
        token_content = next(g for g in re_match.groups() if g is not None)
        if token_type == 'string_literal' and not raw_token.startswith('R"'):
          # Remove the extra backslash in backslash sequences, but only in
          # non-R strings. R-strings don't need escaping.
          backslash_regex = re.compile(r'\\(\\|")')
          token_content = backslash_regex.sub(r'\1', token_content)
        token = Token(token_type, token_content, re_match.end())
        break
    return token

  def maybe_advance(self, expected_type: str) -> Optional[str]:
    """Advance the tokenizer by one token if it has |expected_type|.

    Args:
      expected_type: expected |type| attribute of the token.

    Returns:
      The |value| attribute of the token if it has the right type, or None if
      it has another type.
    """
    token = self._get_token()
    if token and token.type == expected_type:
      self.pos = token.pos
      return token.value
    return None

  def advance(self, expected_type: str) -> str:
    """Advance the tokenizer by one token, asserting its type.

    Raises a SourceCodeParsingError if the token at point has the wrong type.

    Args:
      expected_type: expected |type| attribute of the token.

    Returns:
      The |value| attribute of the token at point.
    """
    token = self._get_token()
    self._assert_token_type(token, expected_type)
    self.pos = token.pos
    return token.value
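

# A minimal usage sketch, assuming a caller that knows the expected grammar;
# the annotation body below is made up for illustration. maybe_advance()
# probes for an optional token and returns None on a type mismatch, while
# advance() raises SourceCodeParsingError instead.
if __name__ == '__main__':
  tokenizer = Tokenizer('"id", "partial" + " string")', 'example.cc', 42)
  assert tokenizer.advance('string_literal') == 'id'
  tokenizer.advance('comma')
  first = tokenizer.advance('string_literal')  # 'partial'
  # The next token is '+', not ',', so this returns None without advancing.
  assert tokenizer.maybe_advance('comma') is None
  tokenizer.advance('plus')
  second = tokenizer.advance('string_literal')  # ' string'
  tokenizer.advance('right_paren')
  print(first + second)  # -> partial string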