Skip to content

Commit 70dcc84

Browse files
authored
gh-126004: Fix positions handling in codecs.xmlcharrefreplace_errors (#127675)
This fixes how `PyCodec_XMLCharRefReplaceErrors` handles the `start` and `end` attributes of `UnicodeError` objects: the values are now retrieved through the `_PyUnicodeError_GetParams` helper.
1 parent a10f993 commit 70dcc84

File tree

3 files changed

+108
-91
lines changed

3 files changed

+108
-91
lines changed

Lib/test/test_capi/test_codecs.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,8 @@ def test_codec_replace_errors_handler(self):
843843

844844
def test_codec_xmlcharrefreplace_errors_handler(self):
845845
handler = _testcapi.codec_xmlcharrefreplace_errors
846-
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
846+
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors,
847+
safe=True)
847848

848849
def test_codec_backslashreplace_errors_handler(self):
849850
handler = _testcapi.codec_backslashreplace_errors
@@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self):
853854
handler = _testlimitedcapi.codec_namereplace_errors
854855
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
855856

856-
def do_test_codec_errors_handler(self, handler, exceptions):
857+
def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
857858
at_least_one = False
858859
for exc in exceptions:
859860
# See https://github.com/python/cpython/issues/123378 and related
860861
# discussion and issues for details.
861-
if self._exception_may_crash(exc):
862+
if not safe and self._exception_may_crash(exc):
862863
continue
863864

864865
at_least_one = True
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
2+
values in the :func:`codecs.xmlcharrefreplace_errors` error handler.
3+
Patch by Bénédikt Tran.

Python/codecs.c

Lines changed: 101 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -755,100 +755,113 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
755755

756756
PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
757757
{
758-
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
759-
PyObject *restuple;
760-
PyObject *object;
761-
Py_ssize_t i;
762-
Py_ssize_t start;
763-
Py_ssize_t end;
764-
PyObject *res;
765-
Py_UCS1 *outp;
766-
Py_ssize_t ressize;
767-
Py_UCS4 ch;
768-
if (PyUnicodeEncodeError_GetStart(exc, &start))
769-
return NULL;
770-
if (PyUnicodeEncodeError_GetEnd(exc, &end))
771-
return NULL;
772-
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
773-
return NULL;
774-
if (end - start > PY_SSIZE_T_MAX / (2+7+1))
775-
end = start + PY_SSIZE_T_MAX / (2+7+1);
776-
for (i = start, ressize = 0; i < end; ++i) {
777-
/* object is guaranteed to be "ready" */
778-
ch = PyUnicode_READ_CHAR(object, i);
779-
if (ch<10)
780-
ressize += 2+1+1;
781-
else if (ch<100)
782-
ressize += 2+2+1;
783-
else if (ch<1000)
784-
ressize += 2+3+1;
785-
else if (ch<10000)
786-
ressize += 2+4+1;
787-
else if (ch<100000)
788-
ressize += 2+5+1;
789-
else if (ch<1000000)
790-
ressize += 2+6+1;
791-
else
792-
ressize += 2+7+1;
758+
if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
759+
wrong_exception_type(exc);
760+
return NULL;
761+
}
762+
763+
PyObject *obj;
764+
Py_ssize_t objlen, start, end, slen;
765+
if (_PyUnicodeError_GetParams(exc,
766+
&obj, &objlen,
767+
&start, &end, &slen, false) < 0)
768+
{
769+
return NULL;
770+
}
771+
772+
// Each character 'ch' is formatted as "&#" + DIGITS + ";" and thus
773+
// contributes 2 + k + 1 characters to the result, where
774+
// k = min{t >= 1 | 10^t > ch} is the number of decimal digits of 'ch'.
775+
// Since the Unicode range is below 10^7, each "block" requires at most
776+
// 2 + 7 + 1 characters.
777+
if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
778+
end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
779+
end = Py_MIN(end, objlen);
780+
slen = Py_MAX(0, end - start);
781+
}
782+
783+
Py_ssize_t ressize = 0;
784+
for (Py_ssize_t i = start; i < end; ++i) {
785+
/* object is guaranteed to be "ready" */
786+
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
787+
if (ch < 10) {
788+
ressize += 2 + 1 + 1;
793789
}
794-
/* allocate replacement */
795-
res = PyUnicode_New(ressize, 127);
796-
if (res == NULL) {
797-
Py_DECREF(object);
798-
return NULL;
790+
else if (ch < 100) {
791+
ressize += 2 + 2 + 1;
799792
}
800-
outp = PyUnicode_1BYTE_DATA(res);
801-
/* generate replacement */
802-
for (i = start; i < end; ++i) {
803-
int digits;
804-
int base;
805-
ch = PyUnicode_READ_CHAR(object, i);
806-
*outp++ = '&';
807-
*outp++ = '#';
808-
if (ch<10) {
809-
digits = 1;
810-
base = 1;
811-
}
812-
else if (ch<100) {
813-
digits = 2;
814-
base = 10;
815-
}
816-
else if (ch<1000) {
817-
digits = 3;
818-
base = 100;
819-
}
820-
else if (ch<10000) {
821-
digits = 4;
822-
base = 1000;
823-
}
824-
else if (ch<100000) {
825-
digits = 5;
826-
base = 10000;
827-
}
828-
else if (ch<1000000) {
829-
digits = 6;
830-
base = 100000;
831-
}
832-
else {
833-
digits = 7;
834-
base = 1000000;
835-
}
836-
while (digits-->0) {
837-
*outp++ = '0' + ch/base;
838-
ch %= base;
839-
base /= 10;
840-
}
841-
*outp++ = ';';
793+
else if (ch < 1000) {
794+
ressize += 2 + 3 + 1;
795+
}
796+
else if (ch < 10000) {
797+
ressize += 2 + 4 + 1;
798+
}
799+
else if (ch < 100000) {
800+
ressize += 2 + 5 + 1;
801+
}
802+
else if (ch < 1000000) {
803+
ressize += 2 + 6 + 1;
804+
}
805+
else {
806+
assert(ch < 10000000);
807+
ressize += 2 + 7 + 1;
842808
}
843-
assert(_PyUnicode_CheckConsistency(res, 1));
844-
restuple = Py_BuildValue("(Nn)", res, end);
845-
Py_DECREF(object);
846-
return restuple;
847809
}
848-
else {
849-
wrong_exception_type(exc);
810+
811+
/* allocate replacement */
812+
PyObject *res = PyUnicode_New(ressize, 127);
813+
if (res == NULL) {
814+
Py_DECREF(obj);
850815
return NULL;
851816
}
817+
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
818+
/* generate replacement */
819+
for (Py_ssize_t i = start; i < end; ++i) {
820+
int digits, base;
821+
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
822+
if (ch < 10) {
823+
digits = 1;
824+
base = 1;
825+
}
826+
else if (ch < 100) {
827+
digits = 2;
828+
base = 10;
829+
}
830+
else if (ch < 1000) {
831+
digits = 3;
832+
base = 100;
833+
}
834+
else if (ch < 10000) {
835+
digits = 4;
836+
base = 1000;
837+
}
838+
else if (ch < 100000) {
839+
digits = 5;
840+
base = 10000;
841+
}
842+
else if (ch < 1000000) {
843+
digits = 6;
844+
base = 100000;
845+
}
846+
else {
847+
assert(ch < 10000000);
848+
digits = 7;
849+
base = 1000000;
850+
}
851+
*outp++ = '&';
852+
*outp++ = '#';
853+
while (digits-- > 0) {
854+
assert(base >= 1);
855+
*outp++ = '0' + ch / base;
856+
ch %= base;
857+
base /= 10;
858+
}
859+
*outp++ = ';';
860+
}
861+
assert(_PyUnicode_CheckConsistency(res, 1));
862+
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
863+
Py_DECREF(obj);
864+
return restuple;
852865
}
853866

854867
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)

0 commit comments

Comments
 (0)