
Commit f476ec3

trotterdylan authored and whitequark committed
Detect source encoding to properly interpret string literals
Buffers now always hold unicode source, whereas before they could hold bytes if that's what was passed to the constructor. This is possible because we determine the encoding and then use it to decode() the bytes. The side effect is that Buffer.__init__ can now raise UnicodeDecodeError if the input is badly encoded.

The Buffer's encoding is then used by the lexer to produce a strdata token of the correct type for string literals. For unicode literals, escaping happens much as before, via _replace_escape(). For bytes literals, a different code path calls encode() with the Buffer's encoding, followed by a special escaping function that ensures the value is not accidentally promoted to unicode.

The parser's behavior for multi-part string literals (e.g. "foo" "bar") also had to change: when any of the literals is unicode, the result is unicode; when all of the literals are bytes, the resulting value is also bytes.
1 parent cd2f7ac commit f476ec3
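To make the new contract concrete, here is a minimal sketch of Buffer's behavior after this commit, using the koi8-r fixture from the new tests (import path assumed to be the published pythonparser package):

    from pythonparser import source

    # Bytes input: the coding comment is detected and the source decoded.
    buf = source.Buffer(b"# coding: koi8-r\n'\xc3\xa7'")
    assert buf.encoding == "koi8-r"
    assert isinstance(buf.source, type(u""))  # buffers now always hold unicode

    # Badly encoded input now fails up front, in Buffer.__init__.
    try:
        source.Buffer(b"# coding: ascii\n'\xc3\xa7'")
    except UnicodeDecodeError:
        pass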

File tree

7 files changed: +250 -63 lines


pythonparser/lexer.py

Lines changed: 60 additions & 18 deletions
@@ -10,6 +10,9 @@
 
 if sys.version_info[0] == 3:
     unichr = chr
+    byte = lambda x: bytes([x])
+else:
+    byte = chr
 
 class Token:
     """
@@ -105,6 +108,7 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
         self.diagnostic_engine = diagnostic_engine
         self.interactive = interactive
         self.print_function = False
+        self.unicode_literals = self.version >= (3, 0)
 
         self.offset = 0
         self.new_line = True
@@ -184,24 +188,23 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
                 id_xid=id_xid), re.VERBOSE|re.UNICODE)
 
     # These are identical for all lexer instances.
-    _lex_escape_re = re.compile(r"""
+    _lex_escape_pattern = r"""
     \\(?:
         ([\n\\'"abfnrtv])    # 1 single-char
     |   ([0-7]{1,3})         # 2 oct
     |   x([0-9A-Fa-f]{2})    # 3 hex
     )
-    """, re.VERBOSE)
+    """
+    _lex_escape_re = re.compile(_lex_escape_pattern.encode(), re.VERBOSE)
 
-    _lex_escape_unicode_re = re.compile(_lex_escape_re.pattern + r"""
+    _lex_escape_unicode_re = re.compile(_lex_escape_pattern + r"""
     | \\(?:
         u([0-9A-Fa-f]{4})    # 4 unicode-16
     |   U([0-9A-Fa-f]{8})    # 5 unicode-32
     |   N\{(.+?)\}           # 6 unicode-name
     )
     """, re.VERBOSE)
 
-    _lex_check_byte_re = re.compile("[^\x00-\x7f]")
-
     def next(self, eof_token=False):
         """
         Returns token at ``offset`` as a :class:`Token` and advances ``offset``
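Compiling the shared pattern twice, once encode()d to bytes, matters because a str-compiled pattern cannot search bytes on Python 3. A quick illustration with the stdlib re module (the lexer itself uses the third-party regex package, but the behavior is the same):

    import re

    pattern = r"\\x([0-9A-Fa-f]{2})"
    str_re = re.compile(pattern)
    bytes_re = re.compile(pattern.encode())

    assert str_re.search("a\\x41") is not None
    assert bytes_re.search(b"a\\x41") is not None
    # Mixing them fails on Python 3:
    # str_re.search(b"a\\x41")  -> TypeError: cannot use a string pattern
    #                              on a bytes-like object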
@@ -419,27 +422,24 @@ def _string_literal(self, options, begin_span, data, data_span, end_span):
 
     def _replace_escape(self, range, mode, value):
         is_raw = ("r" in mode)
-        is_byte = ("b" in mode)
-        is_unicode = ("u" in mode)
+        is_unicode = "u" in mode or ("b" not in mode and self.unicode_literals)
+
+        if not is_unicode:
+            value = value.encode(self.source_buffer.encoding)
+            if is_raw:
+                return value
+            return self._replace_escape_bytes(value)
 
         if is_raw:
             return value
 
-        if is_byte and self._lex_check_byte_re.match(value):
-            error = diagnostic.Diagnostic(
-                "error", "non-7-bit character in a byte literal", {},
-                tok_range)
-            self.diagnostic_engine.process(error)
-
-        if is_unicode or self.version >= (3, 0):
-            re = self._lex_escape_unicode_re
-        else:
-            re = self._lex_escape_re
+        return self._replace_escape_unicode(range, value)
 
+    def _replace_escape_unicode(self, range, value):
         chunks = []
         offset = 0
         while offset < len(value):
-            match = re.search(value, offset)
+            match = self._lex_escape_unicode_re.search(value, offset)
             if match is None:
                 # Append the remaining of the string
                 chunks.append(value[offset:])
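The dispatch now hinges on only two bits of state: the literal's prefix and the unicode_literals flag. Distilled into a hypothetical standalone function (not the module's API, same decision table):

    def literal_kind(mode, unicode_literals):
        # Mirrors _replace_escape: which path handles this literal?
        is_unicode = "u" in mode or ("b" not in mode and unicode_literals)
        if not is_unicode:
            # encode() to the buffer's encoding, then byte-wise escaping
            return "bytes path"
        return "unicode path"

    assert literal_kind("b", unicode_literals=True) == "bytes path"
    assert literal_kind("", unicode_literals=False) == "bytes path"   # 2.x plain literal
    assert literal_kind("", unicode_literals=True) == "unicode path"  # 3.x plain literal
    assert literal_kind("u", unicode_literals=False) == "unicode path"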
@@ -499,6 +499,48 @@ def _replace_escape(self, range, mode, value):
 
         return "".join(chunks)
 
+    def _replace_escape_bytes(self, value):
+        chunks = []
+        offset = 0
+        while offset < len(value):
+            match = self._lex_escape_re.search(value, offset)
+            if match is None:
+                # Append the remaining of the string
+                chunks.append(value[offset:])
+                break
+
+            # Append the part of string before match
+            chunks.append(value[offset:match.start()])
+            offset = match.end()
+
+            # Process the escape
+            if match.group(1) is not None: # single-char
+                chr = match.group(1)
+                if chr == b"\n":
+                    pass
+                elif chr == b"\\" or chr == b"'" or chr == b"\"":
+                    chunks.append(chr)
+                elif chr == b"a":
+                    chunks.append(b"\a")
+                elif chr == b"b":
+                    chunks.append(b"\b")
+                elif chr == b"f":
+                    chunks.append(b"\f")
+                elif chr == b"n":
+                    chunks.append(b"\n")
+                elif chr == b"r":
+                    chunks.append(b"\r")
+                elif chr == b"t":
+                    chunks.append(b"\t")
+                elif chr == b"v":
+                    chunks.append(b"\v")
+            elif match.group(2) is not None: # oct
+                chunks.append(byte(int(match.group(2), 8)))
+            elif match.group(3) is not None: # hex
+                chunks.append(byte(int(match.group(3), 16)))
+
+        return b"".join(chunks)
+
     def _check_long_literal(self, range, literal):
         if literal[-1] in "lL" and self.version >= (3, 0):
             error = diagnostic.Diagnostic(
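End to end, the bytes path collapses escape sequences to single bytes without ever constructing a unicode object. Its invariants, checked standalone (byte is the module-level shim from the top of this file; note that int() accepts the bytes that match.group() yields here):

    import sys
    byte = (lambda x: bytes([x])) if sys.version_info[0] == 3 else chr

    assert byte(int(b"41", 16)) == b"A"       # the hex branch, from b"\x41"
    assert byte(int(b"101", 8)) == b"A"       # the octal branch, from b"\101"
    assert b"".join([b"a", b"\n"]) == b"a\n"  # chunks are joined as bytes, never str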

pythonparser/parser.py

Lines changed: 6 additions & 1 deletion
@@ -520,6 +520,8 @@ def _assignable(self, node, is_delete=False):
     def add_flags(self, flags):
         if "print_function" in flags:
             self.lexer.print_function = True
+        if "unicode_literals" in flags:
+            self.lexer.unicode_literals = True
 
     # Grammar
     @action(Expect(Alt(Newline(),
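For 2.x sources, a from __future__ import unicode_literals now changes how plain string literals lex. Distilled into a tiny hypothetical helper (the real flag lives on the Lexer, defaulted from the version tuple and overridden here by add_flags):

    def plain_literal_is_unicode(version, future_flags):
        # Mirrors Lexer.__init__ (version default) plus Parser.add_flags (override).
        unicode_literals = version >= (3, 0)
        if "unicode_literals" in future_flags:
            unicode_literals = True
        return unicode_literals

    assert not plain_literal_is_unicode((2, 7), set())             # 'x' is bytes
    assert plain_literal_is_unicode((2, 7), {"unicode_literals"})  # 'x' is unicode
    assert plain_literal_is_unicode((3, 0), set())                 # always unicode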
@@ -1522,7 +1524,10 @@ def atom_4(self, begin_tok, data_tok, end_tok):
 
     @action(Plus(atom_4))
     def atom_5(self, strings):
-        return ast.Str(s="".join([x.s for x in strings]),
+        joint = ""
+        if all(isinstance(x.s, bytes) for x in strings):
+            joint = b""
+        return ast.Str(s=joint.join([x.s for x in strings]),
                        begin_loc=strings[0].begin_loc, end_loc=strings[-1].end_loc,
                        loc=strings[0].loc.join(strings[-1].loc))

pythonparser/source.py

Lines changed: 29 additions & 1 deletion
@@ -7,6 +7,7 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 import bisect
+import regex as re
 
 class Buffer:
     """
@@ -18,7 +19,11 @@ class Buffer:
     :ivar line: (integer) first line of the input
     """
     def __init__(self, source, name="<input>", first_line=1):
-        self.source = source
+        self.encoding = self._extract_encoding(source)
+        if isinstance(source, bytes):
+            self.source = source.decode(self.encoding)
+        else:
+            self.source = source
         self.name = name
         self.first_line = first_line
         self._line_begins = None
@@ -65,6 +70,29 @@ def _extract_line_begins(self):
                 return self._line_begins
             self._line_begins.append(index)
 
+    _encoding_re = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+    _encoding_bytes_re = re.compile(_encoding_re.pattern.encode())
+
+    def _extract_encoding(self, source):
+        if isinstance(source, bytes):
+            re = self._encoding_bytes_re
+            nl = b"\n"
+        else:
+            re = self._encoding_re
+            nl = "\n"
+        match = re.match(source)
+        if not match:
+            index = source.find(nl)
+            if index != -1:
+                match = re.match(source[index + 1:])
+        if match:
+            encoding = match.group(1)
+            if isinstance(encoding, bytes):
+                return encoding.decode("ascii")
+            return encoding
+        return "ascii"
+
+
 class Range:
     """
     Location of an exclusive range of characters [*begin_pos*, *end_pos*)
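The detection implements the PEP 263 rule: a coding comment may appear on the first or second line, otherwise the source is treated as ASCII. A sketch of the same logic using the stdlib re module (the module itself imports regex as re, but this pattern needs nothing beyond the stdlib):

    import re

    _encoding_re = re.compile(r"^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")

    def extract_encoding(text):
        # PEP 263: only the first two lines may carry a coding comment.
        for line in text.split("\n")[:2]:
            match = _encoding_re.match(line)
            if match:
                return match.group(1)
        return "ascii"

    assert extract_encoding("# coding: koi8-r\nx = 1") == "koi8-r"
    assert extract_encoding("#!/usr/bin/env python\n# -*- coding: utf-8 -*-") == "utf-8"
    assert extract_encoding("x = 1") == "ascii"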

pythonparser/test/test_lexer.py

Lines changed: 71 additions & 41 deletions
@@ -1,9 +1,13 @@
 # coding:utf-8
 
 from __future__ import absolute_import, division, print_function, unicode_literals
+from . import test_utils
 from .. import source, lexer, diagnostic
 import unittest
 
+BytesOnly = test_utils.BytesOnly
+UnicodeOnly = test_utils.UnicodeOnly
+
 class LexerTestCase(unittest.TestCase):
 
     def assertLexesVersions(self, input, versions, *expected_tokens, **kwargs):
@@ -152,42 +156,67 @@ def test_integer_py3(self):
                                  "int", 123)
 
     def test_string_literal(self):
-        self.assertLexes("''",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-        self.assertLexes("''''''",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-        self.assertLexes("\"\"",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-        self.assertLexes("\"\"\"\"\"\"",
-                         "strbegin", "",
-                         "strdata", "",
-                         "strend", None)
-
-        self.assertLexes("'x'",
-                         "strbegin", "",
-                         "strdata", "x",
-                         "strend", None)
-
-        self.assertLexes("'''\n'''",
-                         "strbegin", "",
-                         "strdata", "\n",
-                         "strend", None)
-
-        self.assertLexes("'''\n'''",
-                         "strbegin", "",
-                         "strdata", "\n",
-                         "strend", None)
-
-        self.assertLexes(r"'\0 \10 \010'",
-                         "strbegin", "",
-                         "strdata", "\x00 \x08 \x08",
-                         "strend", None)
+        for version in self.VERSIONS:
+            if version < (3,):
+                str_type = BytesOnly
+            else:
+                str_type = UnicodeOnly
+            self.assertLexesVersions("''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+            self.assertLexesVersions("''''''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+            self.assertLexesVersions("\"\"", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+            self.assertLexesVersions("\"\"\"\"\"\"", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type(""),
+                                     "strend", None)
+
+            self.assertLexesVersions("'x'", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("x"),
+                                     "strend", None)
+
+            self.assertLexesVersions("'''\n'''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("\n"),
+                                     "strend", None)
+
+            self.assertLexesVersions("'''\n'''", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("\n"),
+                                     "strend", None)
+
+            self.assertLexesVersions(r"'\0 \10 \010'", [version],
+                                     "strbegin", "",
+                                     "strdata", str_type("\x00 \x08 \x08"),
+                                     "strend", None)
+
+        self.assertLexesVersions(r"b'\xc3\xa7'", [(2,7), (3,0), (3,1)],
+                                 "strbegin", "b",
+                                 "strdata", BytesOnly(b"\xc3\xa7"),
+                                 "strend", None)
+
+        self.assertLexesVersions(b"# coding: koi8-r\nb'\xc3\xa7'", [(2,7), (3,0), (3,1)],
+                                 "strbegin", "b",
+                                 "strdata", BytesOnly(b"\xc3\xa7"),
+                                 "strend", None)
+
+        self.assertLexesVersions(b"# coding: koi8-r\n'\xc3\xa7'", [(3,0), (3,1)],
+                                 "strbegin", "",
+                                 "strdata", UnicodeOnly("\u0446\u2556"),
+                                 "strend", None)
+
+        self.assertLexesVersions(b"# coding: koi8-r\nu'\xc3\xa7'", [(2,7)],
+                                 "strbegin", "u",
+                                 "strdata", UnicodeOnly("\u0446\u2556"),
+                                 "strend", None)
 
         self.assertDiagnoses(
             "'",
@@ -211,12 +240,13 @@ def test_escape_clike(self):
                 (r"\a", "\a"), (r"\b", "\b"), (r"\f", "\f"), (r"\n", "\n"),
                 (r"\r", "\r"), (r"\t", "\t"), (r"\v", "\v"),
                 (r"\x53", "S"), (r"\123", "S")]:
-            for mode in [ "", "u", "b" ]:
-                self.assertLexesEscape(mode, chr, val)
-            for mode in [ "r", "br" ]:
-                self.assertLexesEscape(mode, chr, chr)
+            self.assertLexesEscape("b", chr, BytesOnly(val))
+            self.assertLexesEscape("u", chr, UnicodeOnly(val))
+            self.assertLexesEscape("", chr, UnicodeOnly(val))
+            self.assertLexesEscape("r", chr, UnicodeOnly(chr))
+            self.assertLexesEscape("br", chr, BytesOnly(chr))
 
-        self.assertLexesEscape("r", "\\\"", "\\\"")
+        self.assertLexesEscape("r", "\\\"", UnicodeOnly("\\\""))
 
     def test_escape_unicode(self):
         self.assertLexesEscape("u", "\\u044b", "ы")
