Skip to content

Commit 3f367d5

Browse files
committed
Fix Lexer greediness
Some edge cases around numbers were not handled as expected. This commit adds test cases from the 2 RFCs clarifying the expected behaviour (graphql/graphql-spec#601, graphql/graphql-spec#599) and updates the Lexer to match. This is technically a breaking change but most cases were likely to lead to validation errors (e.g. "0xF1" being parsed as [0, xF1] when expecting a list of integers).
1 parent bb72c22 commit 3f367d5

File tree

3 files changed

+77
-19
lines changed

3 files changed

+77
-19
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ Unreleased
4747
- `SchemaVisitor.(on_field_definition|on_argument_definition|on_input_field_definition)` have become `SchemaVisitor.(on_field|on_argument|on_input_field)` to maintain consistency with other methods.
4848
- Do not expose `GraphQLExtension` on `py_gql`.
4949

50+
- Fix Lexer greediness. Some edge cases were not handled as expected. This commit adds test cases from the 2 RFCs clarifying the expected behaviour ([graphql/graphql-spec#601](https://github.com/graphql/graphql-spec/pull/601), [graphql/graphql-spec#599](https://github.com/graphql/graphql-spec/pull/599)) and updates the Lexer to match. This is _technically_ a breaking change but most cases were likely to lead to validation errors (e.g. "0xF1" being parsed as [0, xF1] when expecting a list of integers).
51+
5052
[0.4.0](https://github.com/lirsacc/py-gql/releases/tag/0.4.0) - 2019-10-10
5153
--------------------------------------------------------------------------
5254

py_gql/lang/lexer.py

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
from .._string_utils import ensure_unicode, parse_block_string
1717
from ..exc import (
18-
GraphQLSyntaxError,
1918
InvalidCharacter,
2019
InvalidEscapeSequence,
2120
NonTerminatedString,
@@ -47,12 +46,14 @@
4746
Token,
4847
)
4948

50-
EOL_CHARS = frozenset([0x000A, 0x000D]) # "\n" # "\r"
49+
EOL_CHARS = [0x000A, 0x000D] # "\n" # "\r"
5150

52-
IGNORED_CHARS = (
53-
frozenset([0xFEFF, 0x0009, 0x0020, 0x002C])
54-
| EOL_CHARS # BOM # \t # SPACE # ,
55-
)
51+
IGNORED_CHARS = [ # BOM # \t # SPACE # ,
52+
0xFEFF,
53+
0x0009,
54+
0x0020,
55+
0x002C,
56+
] + EOL_CHARS
5657

5758
SYMBOLS = {
5859
cls.value: cls
@@ -86,14 +87,27 @@
8687

8788

8889
def _unexpected(
89-
expected: str, char: str, position: int, source: str
90-
) -> GraphQLSyntaxError:
90+
char: Optional[str],
91+
position: int,
92+
source: str,
93+
expected: Optional[str] = None,
94+
) -> Union[UnexpectedEOF, UnexpectedCharacter]:
9195
if char is None:
9296
return UnexpectedEOF(position - 1, source)
93-
else:
97+
elif expected is not None:
9498
return UnexpectedCharacter(
9599
'Expected "%s" but found "%s"' % (expected, char), position, source
96100
)
101+
else:
102+
return UnexpectedCharacter(
103+
'Unexpected character "%s"' % char, position, source
104+
)
105+
106+
107+
def _is_name_start(code):
108+
return (
109+
code == 0x005F or 0x0041 <= code <= 0x005A or 0x0061 <= code <= 0x007A
110+
)
97111

98112

99113
class Lexer:
@@ -165,9 +179,7 @@ def _read_ellipsis(self) -> Ellip:
165179
char = self._peek()
166180
self._position += 1
167181
if char != ".":
168-
raise _unexpected(
169-
".", cast(str, char), self._position, self._source
170-
)
182+
raise _unexpected(char, self._position, self._source, ".")
171183
return Ellip(start, self._position)
172184

173185
def _read_string(self) -> String:
@@ -296,6 +308,17 @@ def _read_number(self) -> Union[Integer, Float]:
296308

297309
self._read_over_integer()
298310

311+
# Explicit lookahead restrictions.
312+
next_char = self._peek()
313+
if next_char is not None:
314+
next_code = ord(next_char)
315+
if _is_name_start(next_code):
316+
raise UnexpectedCharacter(
317+
'Unexpected character "%s"' % char,
318+
self._position,
319+
self._source,
320+
)
321+
299322
end = self._position
300323
value = self._source[start:end]
301324
return (
@@ -312,7 +335,7 @@ def _read_over_integer(self):
312335
if code == 0x0030: # "0"
313336
self._position += 1
314337
char = self._peek()
315-
if char is not None and ord(char) == 0x0030:
338+
if char is not None and (0x0030 <= ord(char) <= 0x0039):
316339
raise UnexpectedCharacter(
317340
'Unexpected character "%s"' % char,
318341
self._position,
@@ -406,11 +429,7 @@ def __next__(self) -> Token:
406429
return self._read_string()
407430
elif code == 0x002D or 0x0030 <= code <= 0x0039:
408431
return self._read_number()
409-
elif (
410-
code == 0x005F
411-
or 0x0041 <= code <= 0x005A
412-
or 0x0061 <= code <= 0x007A
413-
):
432+
elif _is_name_start(code):
414433
return self._read_name()
415434
else:
416435
raise UnexpectedCharacter(

tests/test_lang/test_lexer.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
def lex_one(source):
2020
lexer = Lexer(source)
2121
assert type(next(lexer)) == token.SOF
22-
return next(lexer)
22+
val = next(lexer)
23+
assert type(next(lexer)) == token.EOF
24+
return val
2325

2426

2527
def test_it_disallows_uncommon_control_characters():
@@ -101,6 +103,20 @@ def test_errors_respect_whitespace():
101103
)
102104

103105

106+
@pytest.mark.parametrize(
107+
"value,expected",
108+
[
109+
("abc", token.Name(0, 3, "abc")),
110+
("_abc", token.Name(0, 4, "_abc")),
111+
("abc_", token.Name(0, 4, "abc_")),
112+
("abc123", token.Name(0, 6, "abc123")),
113+
("abc_123", token.Name(0, 7, "abc_123")),
114+
],
115+
)
116+
def test_it_lexes_name(value, expected):
117+
assert lex_one(value) == expected
118+
119+
104120
@pytest.mark.parametrize(
105121
"value,expected",
106122
[
@@ -116,6 +132,7 @@ def test_errors_respect_whitespace():
116132
'"unicode \\u1234\\u5678\\u90AB\\uCDEF"',
117133
token.String(0, 34, "unicode \u1234\u5678\u90AB\uCDEF"),
118134
),
135+
('""', token.String(0, 2, "")),
119136
],
120137
)
121138
def test_it_lexes_strings(value, expected):
@@ -126,6 +143,8 @@ def test_it_lexes_strings(value, expected):
126143
"value, err_cls, expected_positon",
127144
[
128145
('"', NonTerminatedString, 1),
146+
('"""', NonTerminatedString, 3),
147+
('""""', NonTerminatedString, 4),
129148
('"no end quote', NonTerminatedString, 13),
130149
("'single quotes'", UnexpectedCharacter, 0),
131150
('"contains unescaped \u0007 control char"', InvalidCharacter, 20),
@@ -154,6 +173,7 @@ def test_it_lex_reports_useful_string_errors(value, err_cls, expected_positon):
154173
"value, expected",
155174
[
156175
('"""simple"""', token.BlockString(0, 12, "simple")),
176+
('""""""', token.BlockString(0, 6, "")),
157177
('""" white space """', token.BlockString(0, 19, " white space ")),
158178
(
159179
'"""contains " quote"""',
@@ -223,6 +243,7 @@ def test_it_lex_reports_useful_block_string_errors(
223243
("123E4", token.Float(0, 5, "123E4")),
224244
("123e-4", token.Float(0, 6, "123e-4")),
225245
("123e+4", token.Float(0, 6, "123e+4")),
246+
("1.2e3", token.Float(0, 5, "1.2e3")),
226247
("-123e4", token.Float(0, 6, "-123e4")),
227248
("-123E4", token.Float(0, 6, "-123E4")),
228249
("-123e-4", token.Float(0, 7, "-123e-4")),
@@ -238,6 +259,7 @@ def test_it_lexes_numbers(string, expected):
238259
"value, err_cls, expected_positon",
239260
[
240261
("00", UnexpectedCharacter, 1),
262+
("01", UnexpectedCharacter, 1),
241263
("+1", UnexpectedCharacter, 0),
242264
("1.", UnexpectedEOF, 2),
243265
("1.e1", UnexpectedCharacter, 2),
@@ -246,6 +268,21 @@ def test_it_lexes_numbers(string, expected):
246268
("-A", UnexpectedCharacter, 1),
247269
("1.0e", UnexpectedEOF, 4),
248270
("1.0eA", UnexpectedCharacter, 4),
271+
("123.", UnexpectedEOF, 4),
272+
("123e", UnexpectedEOF, 4),
273+
("123E", UnexpectedEOF, 4),
274+
("01.23", UnexpectedCharacter, 1),
275+
("1.2e3.4", UnexpectedCharacter, 7),
276+
("1.23.4", UnexpectedCharacter, 6),
277+
("1.2e3e", UnexpectedCharacter, 5),
278+
("0xF1", UnexpectedCharacter, 1),
279+
("0b10", UnexpectedCharacter, 1),
280+
("123abc", UnexpectedCharacter, 3),
281+
("1_234", UnexpectedCharacter, 1),
282+
("1ß", UnexpectedCharacter, 1),
283+
("1.23f", UnexpectedCharacter, 4),
284+
("1.234_5", UnexpectedCharacter, 5),
285+
("1.2ß", UnexpectedCharacter, 3),
249286
],
250287
)
251288
def test_it_lex_reports_useful_number_errors(value, err_cls, expected_positon):

0 commit comments

Comments
 (0)