Skip to content

Commit c96d154

Browse files
bpo-45461: Fix IncrementalDecoder and StreamReader in the "unicode-escape" codec (GH-28939)
They now support splitting escape sequences between input chunks. Add a third parameter, "final", to codecs.unicode_escape_decode(). It is True by default to match the former behavior.
1 parent e71662c commit c96d154

File tree

8 files changed

+121
-32
lines changed

8 files changed

+121
-32
lines changed

Include/cpython/unicodeobject.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -777,12 +777,20 @@ PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
777777

778778
/* --- Unicode-Escape Codecs ---------------------------------------------- */
779779

780+
/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
781+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
782+
const char *string, /* Unicode-Escape encoded string */
783+
Py_ssize_t length, /* size of string */
784+
const char *errors, /* error handling */
785+
Py_ssize_t *consumed /* bytes consumed */
786+
);
780787
/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
781788
chars. */
782-
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
789+
PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
783790
const char *string, /* Unicode-Escape encoded string */
784791
Py_ssize_t length, /* size of string */
785792
const char *errors, /* error handling */
793+
Py_ssize_t *consumed, /* bytes consumed */
786794
const char **first_invalid_escape /* on return, points to first
787795
invalid escaped char in
788796
string. */

Lib/encodings/unicode_escape.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
2121
def encode(self, input, final=False):
2222
return codecs.unicode_escape_encode(input, self.errors)[0]
2323

24-
class IncrementalDecoder(codecs.IncrementalDecoder):
25-
def decode(self, input, final=False):
26-
return codecs.unicode_escape_decode(input, self.errors)[0]
24+
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
25+
def _buffer_decode(self, input, errors, final):
26+
return codecs.unicode_escape_decode(input, errors, final)
2727

2828
class StreamWriter(Codec,codecs.StreamWriter):
2929
pass
3030

3131
class StreamReader(Codec,codecs.StreamReader):
32-
pass
32+
def decode(self, input, errors='strict'):
33+
return codecs.unicode_escape_decode(input, errors, False)
3334

3435
### encodings module API
3536

Lib/test/test_codecs.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def check_partial(self, input, partialresults):
114114
q = Queue(b"")
115115
r = codecs.getreader(self.encoding)(q)
116116
result = ""
117-
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
117+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
118118
q.write(bytes([c]))
119119
result += r.read()
120120
self.assertEqual(result, partialresult)
@@ -125,7 +125,7 @@ def check_partial(self, input, partialresults):
125125
# do the check again, this time using an incremental decoder
126126
d = codecs.getincrementaldecoder(self.encoding)()
127127
result = ""
128-
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
128+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
129129
result += d.decode(bytes([c]))
130130
self.assertEqual(result, partialresult)
131131
# check that there's nothing left in the buffers
@@ -135,7 +135,7 @@ def check_partial(self, input, partialresults):
135135
# Check whether the reset method works properly
136136
d.reset()
137137
result = ""
138-
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
138+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
139139
result += d.decode(bytes([c]))
140140
self.assertEqual(result, partialresult)
141141
# check that there's nothing left in the buffers
@@ -2353,7 +2353,11 @@ def test_unicode_escape(self):
23532353
(r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
23542354

23552355

2356-
class UnicodeEscapeTest(unittest.TestCase):
2356+
class UnicodeEscapeTest(ReadTest, unittest.TestCase):
2357+
encoding = "unicode-escape"
2358+
2359+
test_lone_surrogates = None
2360+
23572361
def test_empty(self):
23582362
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
23592363
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))
@@ -2440,6 +2444,44 @@ def test_decode_errors(self):
24402444
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
24412445
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
24422446

2447+
def test_partial(self):
2448+
self.check_partial(
2449+
"\x00\t\n\r\\\xff\uffff\U00010000",
2450+
[
2451+
'',
2452+
'',
2453+
'',
2454+
'\x00',
2455+
'\x00',
2456+
'\x00\t',
2457+
'\x00\t',
2458+
'\x00\t\n',
2459+
'\x00\t\n',
2460+
'\x00\t\n\r',
2461+
'\x00\t\n\r',
2462+
'\x00\t\n\r\\',
2463+
'\x00\t\n\r\\',
2464+
'\x00\t\n\r\\',
2465+
'\x00\t\n\r\\',
2466+
'\x00\t\n\r\\\xff',
2467+
'\x00\t\n\r\\\xff',
2468+
'\x00\t\n\r\\\xff',
2469+
'\x00\t\n\r\\\xff',
2470+
'\x00\t\n\r\\\xff',
2471+
'\x00\t\n\r\\\xff',
2472+
'\x00\t\n\r\\\xff\uffff',
2473+
'\x00\t\n\r\\\xff\uffff',
2474+
'\x00\t\n\r\\\xff\uffff',
2475+
'\x00\t\n\r\\\xff\uffff',
2476+
'\x00\t\n\r\\\xff\uffff',
2477+
'\x00\t\n\r\\\xff\uffff',
2478+
'\x00\t\n\r\\\xff\uffff',
2479+
'\x00\t\n\r\\\xff\uffff',
2480+
'\x00\t\n\r\\\xff\uffff',
2481+
'\x00\t\n\r\\\xff\uffff',
2482+
'\x00\t\n\r\\\xff\uffff\U00010000',
2483+
]
2484+
)
24432485

24442486
class RawUnicodeEscapeTest(unittest.TestCase):
24452487
def test_empty(self):
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix incremental decoder and stream reader in the "unicode-escape" codec.
2+
Previously they failed if an escape sequence was split between input chunks.

Modules/_codecsmodule.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -489,17 +489,20 @@ _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
489489
_codecs.unicode_escape_decode
490490
data: Py_buffer(accept={str, buffer})
491491
errors: str(accept={str, NoneType}) = None
492+
final: bool(accept={int}) = True
492493
/
493494
[clinic start generated code]*/
494495

495496
static PyObject *
496497
_codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
497-
const char *errors)
498-
/*[clinic end generated code: output=3ca3c917176b82ab input=8328081a3a569bd6]*/
498+
const char *errors, int final)
499+
/*[clinic end generated code: output=b284f97b12c635ee input=6154f039a9f7c639]*/
499500
{
500-
PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
501-
errors);
502-
return codec_tuple(decoded, data->len);
501+
Py_ssize_t consumed = data->len;
502+
PyObject *decoded = _PyUnicode_DecodeUnicodeEscapeStateful(data->buf, data->len,
503+
errors,
504+
final ? NULL : &consumed);
505+
return codec_tuple(decoded, consumed);
503506
}
504507

505508
/*[clinic input]

Modules/clinic/_codecsmodule.c.h

Lines changed: 13 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Objects/unicodeobject.c

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6342,9 +6342,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
63426342
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
63436343

63446344
PyObject *
6345-
_PyUnicode_DecodeUnicodeEscape(const char *s,
6345+
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
63466346
Py_ssize_t size,
63476347
const char *errors,
6348+
Py_ssize_t *consumed,
63486349
const char **first_invalid_escape)
63496350
{
63506351
const char *starts = s;
@@ -6357,6 +6358,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
63576358
*first_invalid_escape = NULL;
63586359

63596360
if (size == 0) {
6361+
if (consumed) {
6362+
*consumed = 0;
6363+
}
63606364
_Py_RETURN_UNICODE_EMPTY();
63616365
}
63626366
/* Escaped strings will always be longer than the resulting
@@ -6407,7 +6411,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
64076411
/* \ - Escapes */
64086412
if (s >= end) {
64096413
message = "\\ at end of string";
6410-
goto error;
6414+
goto incomplete;
64116415
}
64126416
c = (unsigned char) *s++;
64136417

@@ -6461,7 +6465,10 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
64616465
count = 8;
64626466
message = "truncated \\UXXXXXXXX escape";
64636467
hexescape:
6464-
for (ch = 0; count && s < end; ++s, --count) {
6468+
for (ch = 0; count; ++s, --count) {
6469+
if (s >= end) {
6470+
goto incomplete;
6471+
}
64656472
c = (unsigned char)*s;
64666473
ch <<= 4;
64676474
if (c >= '0' && c <= '9') {
@@ -6474,12 +6481,9 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
64746481
ch += c - ('A' - 10);
64756482
}
64766483
else {
6477-
break;
6484+
goto error;
64786485
}
64796486
}
6480-
if (count) {
6481-
goto error;
6482-
}
64836487

64846488
/* when we get here, ch is a 32-bit unicode character */
64856489
if (ch > MAX_UNICODE) {
@@ -6506,14 +6510,20 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
65066510
}
65076511

65086512
message = "malformed \\N character escape";
6509-
if (s < end && *s == '{') {
6513+
if (s >= end) {
6514+
goto incomplete;
6515+
}
6516+
if (*s == '{') {
65106517
const char *start = ++s;
65116518
size_t namelen;
65126519
/* look for the closing brace */
65136520
while (s < end && *s != '}')
65146521
s++;
6522+
if (s >= end) {
6523+
goto incomplete;
6524+
}
65156525
namelen = s - start;
6516-
if (namelen && s < end) {
6526+
if (namelen) {
65176527
/* found a name. look it up in the unicode database */
65186528
s++;
65196529
ch = 0xffffffff; /* in case 'getcode' messes up */
@@ -6539,6 +6549,11 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
65396549
continue;
65406550
}
65416551

6552+
incomplete:
6553+
if (consumed) {
6554+
*consumed = startinpos;
6555+
break;
6556+
}
65426557
error:
65436558
endinpos = s-starts;
65446559
writer.min_length = end - s + writer.pos;
@@ -6567,12 +6582,14 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
65676582
}
65686583

65696584
PyObject *
6570-
PyUnicode_DecodeUnicodeEscape(const char *s,
6585+
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
65716586
Py_ssize_t size,
6572-
const char *errors)
6587+
const char *errors,
6588+
Py_ssize_t *consumed)
65736589
{
65746590
const char *first_invalid_escape;
6575-
PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6591+
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6592+
consumed,
65766593
&first_invalid_escape);
65776594
if (result == NULL)
65786595
return NULL;
@@ -6587,6 +6604,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
65876604
return result;
65886605
}
65896606

6607+
PyObject *
6608+
PyUnicode_DecodeUnicodeEscape(const char *s,
6609+
Py_ssize_t size,
6610+
const char *errors)
6611+
{
6612+
return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6613+
}
6614+
65906615
/* Return a Unicode-Escape string version of the Unicode object. */
65916616

65926617
PyObject *

Parser/string_parser.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
115115
s = buf;
116116

117117
const char *first_invalid_escape;
118-
v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
118+
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119119

120120
if (v != NULL && first_invalid_escape != NULL) {
121121
if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {

0 commit comments

Comments
 (0)