Skip to content

gh-126004: fix positions handling in codecs.xmlcharrefreplace_errors #127675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions Lib/test/test_capi/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,8 @@ def test_codec_replace_errors_handler(self):

def test_codec_xmlcharrefreplace_errors_handler(self):
handler = _testcapi.codec_xmlcharrefreplace_errors
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors,
safe=True)

def test_codec_backslashreplace_errors_handler(self):
handler = _testcapi.codec_backslashreplace_errors
Expand All @@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self):
handler = _testlimitedcapi.codec_namereplace_errors
self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)

def do_test_codec_errors_handler(self, handler, exceptions):
def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
at_least_one = False
for exc in exceptions:
# See https://github.com/python/cpython/issues/123378 and related
# discussion and issues for details.
if self._exception_may_crash(exc):
if not safe and self._exception_may_crash(exc):
continue

at_least_one = True
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
values in the :func:`codecs.xmlcharrefreplace_errors` error handler.
Patch by Bénédikt Tran.
189 changes: 101 additions & 88 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -755,100 +755,113 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)

PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
Py_UCS1 *outp;
Py_ssize_t ressize;
Py_UCS4 ch;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
if (end - start > PY_SSIZE_T_MAX / (2+7+1))
end = start + PY_SSIZE_T_MAX / (2+7+1);
for (i = start, ressize = 0; i < end; ++i) {
/* object is guaranteed to be "ready" */
ch = PyUnicode_READ_CHAR(object, i);
if (ch<10)
ressize += 2+1+1;
else if (ch<100)
ressize += 2+2+1;
else if (ch<1000)
ressize += 2+3+1;
else if (ch<10000)
ressize += 2+4+1;
else if (ch<100000)
ressize += 2+5+1;
else if (ch<1000000)
ressize += 2+6+1;
else
ressize += 2+7+1;
if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
wrong_exception_type(exc);
return NULL;
}

PyObject *obj;
Py_ssize_t objlen, start, end, slen;
if (_PyUnicodeError_GetParams(exc,
&obj, &objlen,
&start, &end, &slen, false) < 0)
{
return NULL;
}

// The number of characters that each character 'ch' contributes
// in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
// and will be formatted as "&#" + DIGITS + ";". Since the Unicode
// range is below 10^7, each "block" requires at most 2 + 7 + 1
// characters.
if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
end = Py_MIN(end, objlen);
slen = Py_MAX(0, end - start);
}

Py_ssize_t ressize = 0;
for (Py_ssize_t i = start; i < end; ++i) {
/* object is guaranteed to be "ready" */
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (ch < 10) {
ressize += 2 + 1 + 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An alternative to end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); would be to set digits = 1; and after the serie of ifs, check that ressize += (2 + digits + 1); doesn't overflow. If it does overflow, return MemoryError. Something like:

if (resize > PY_SSIZE_T_MAX - (2 + digits + 1)) {
    return PyErr_MemoryError();
}

Copy link
Member Author

@picnixz picnixz Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's what PyCodec_NameReplaceErrors does but I don't know how performances would be affected. Hitting PY_SSIZE_T_MAX / (2 + 7 + 1) means that we're handling something that is quite large. So doing the check on resize at each loop iteration might slow down the handler a bit.

Now, it took me a while to convince myself that it won't be slowing down the handlers by much. Namely, I don't think it would dramatically slow it down because if our characters are > 10^3, we would do < 10, < 100, < 1000 and < 10000 checks (this last one is needed to know that it's > 10^3 but not > 10^4) already. So instead of 4 we have 5 checks which is not that annoying.

Why can we assume that we will have at least 2 checks and not less? Well... everything < 100 is actually ASCII, so and unless someone is using a special codec for which those characters are not supported or for artificially created exceptions that indicate their start/end positions incorrectly, we're likely to have at least 2 checks in the loop (namely < 10 and < 100) because the bad characters are likely do be something outside the ASCII range.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would look like:

    Py_ssize_t ressize = 0;
    for (Py_ssize_t i = start; i < end; ++i) {
        // The number of characters that each character 'ch' contributes
        // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
        // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
        // range is below 10^7, each "block" requires at most 2 + 7 + 1
        // characters.
        int replsize;
        /* object is guaranteed to be "ready" */
        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
        if (ch < 10) {
            replsize = 2 + 1 + 1;
        }
        else if (ch < 100) {
            replsize = 2 + 2 + 1;
        }
        else if (ch < 1000) {
            replsize = 2 + 3 + 1;
        }
        else if (ch < 10000) {
            replsize = 2 + 4 + 1;
        }
        else if (ch < 100000) {
            replsize = 2 + 5 + 1;
        }
        else if (ch < 1000000) {
            replsize = 2 + 6 + 1;
        }
        else {
            assert(ch < 10000000);
            replsize = 2 + 7 + 1;
        }
        if (ressize > PY_SSIZE_T_MAX - replsize) {
            Py_DECREF(obj);
            return PyErr_NoMemory();
        }
        ressize += replsize;

But I'm not very fond of this. I think it's still nicer to have the check outside the loop.

Copy link
Member Author

@picnixz picnixz Jan 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we decide in a follow-up PR whether those special checks (for instance, we have similar checks for PyCodec_NameReplaceErrors and PyCodec_BackslashReplaceErrors) need to be kept. For the 'namereplace' handler, we purely break actually:

        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }

and just don't care anymore :') (the reason is that cannot determine in advance how much it would take unless we call getname before...)

}
/* allocate replacement */
res = PyUnicode_New(ressize, 127);
if (res == NULL) {
Py_DECREF(object);
return NULL;
else if (ch < 100) {
ressize += 2 + 2 + 1;
}
outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */
for (i = start; i < end; ++i) {
int digits;
int base;
ch = PyUnicode_READ_CHAR(object, i);
*outp++ = '&';
*outp++ = '#';
if (ch<10) {
digits = 1;
base = 1;
}
else if (ch<100) {
digits = 2;
base = 10;
}
else if (ch<1000) {
digits = 3;
base = 100;
}
else if (ch<10000) {
digits = 4;
base = 1000;
}
else if (ch<100000) {
digits = 5;
base = 10000;
}
else if (ch<1000000) {
digits = 6;
base = 100000;
}
else {
digits = 7;
base = 1000000;
}
while (digits-->0) {
*outp++ = '0' + ch/base;
ch %= base;
base /= 10;
}
*outp++ = ';';
else if (ch < 1000) {
ressize += 2 + 3 + 1;
}
else if (ch < 10000) {
ressize += 2 + 4 + 1;
}
else if (ch < 100000) {
ressize += 2 + 5 + 1;
}
else if (ch < 1000000) {
ressize += 2 + 6 + 1;
}
else {
assert(ch < 10000000);
ressize += 2 + 7 + 1;
}
assert(_PyUnicode_CheckConsistency(res, 1));
restuple = Py_BuildValue("(Nn)", res, end);
Py_DECREF(object);
return restuple;
}
else {
wrong_exception_type(exc);

/* allocate replacement */
PyObject *res = PyUnicode_New(ressize, 127);
if (res == NULL) {
Py_DECREF(obj);
return NULL;
}
Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */
for (Py_ssize_t i = start; i < end; ++i) {
int digits, base;
Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
if (ch < 10) {
digits = 1;
base = 1;
}
else if (ch < 100) {
digits = 2;
base = 10;
}
else if (ch < 1000) {
digits = 3;
base = 100;
}
else if (ch < 10000) {
digits = 4;
base = 1000;
}
else if (ch < 100000) {
digits = 5;
base = 10000;
}
else if (ch < 1000000) {
digits = 6;
base = 100000;
}
else {
assert(ch < 10000000);
digits = 7;
base = 1000000;
}
*outp++ = '&';
*outp++ = '#';
while (digits-- > 0) {
assert(base >= 1);
*outp++ = '0' + ch / base;
ch %= base;
base /= 10;
}
*outp++ = ';';
}
assert(_PyUnicode_CheckConsistency(res, 1));
PyObject *restuple = Py_BuildValue("(Nn)", res, end);
Py_DECREF(obj);
return restuple;
}

PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
Expand Down
Loading