Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-31979: Simplify transforming decimals to ASCII #4336

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(

#endif /* MS_WINDOWS */

#ifndef Py_LIMITED_API
/* --- Decimal Encoder ---------------------------------------------------- */

/* Takes a Unicode string holding a decimal value and writes it into
Expand All @@ -1747,34 +1748,31 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(

*/

#ifndef Py_LIMITED_API
PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
char *output, /* Output buffer; must have size >= length */
const char *errors /* error handling */
) /* Py_DEPRECATED(3.3) */;
#endif

/* Transforms code points that have decimal digit property to the
corresponding ASCII digit code points.

Returns a new Unicode string on success, NULL on failure.
*/

#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
) /* Py_DEPRECATED(3.3) */;
#endif

/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
as argument instead of a raw buffer and length. This function additionally
transforms spaces to ASCII because this is what the callers in longobject,
floatobject, and complexobject did anyways. */
/* Coverts a Unicode object holding a decimal value to an ASCII string
for using in int, float and complex parsers.
Transforms code points that have decimal digit property to the
corresponding ASCII digit code points. Transforms spaces to ASCII.
Transforms code points starting from the first non-ASCII code point that
is neither a decimal digit nor a space to the end into '?'. */

#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
PyObject *unicode /* Unicode object */
);
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_float.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_float(self):
self.assertRaises(TypeError, float, {})
self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
# Lone surrogate
self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
self.assertRaises(ValueError, float, '\uD8F0')
# check that we don't accept alternate exponent markers
self.assertRaises(ValueError, float, "-1.7d29")
self.assertRaises(ValueError, float, "3D-14")
Expand Down
13 changes: 8 additions & 5 deletions Lib/test/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2068,11 +2068,14 @@ def test_codecs_errors(self):
# Error handling (wrong arguments)
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)

# Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
self.assertRaises(UnicodeError, float, "\ud800")
self.assertRaises(UnicodeError, float, "\udf00")
self.assertRaises(UnicodeError, complex, "\ud800")
self.assertRaises(UnicodeError, complex, "\udf00")
# Error handling (lone surrogate in
# _PyUnicode_TransformDecimalAndSpaceToASCII())
self.assertRaises(ValueError, int, "\ud800")
self.assertRaises(ValueError, int, "\udf00")
self.assertRaises(ValueError, float, "\ud800")
self.assertRaises(ValueError, float, "\udf00")
self.assertRaises(ValueError, complex, "\ud800")
self.assertRaises(ValueError, complex, "\udf00")

def test_codecs(self):
# Encoding
Expand Down
7 changes: 3 additions & 4 deletions Objects/complexobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
if (s_buffer == NULL) {
return NULL;
}
assert(PyUnicode_IS_ASCII(s_buffer));
/* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
if (s == NULL) {
goto exit;
}
assert(s != NULL);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please add a short comment explaining that PyUnicode_AsUTF8AndSize() cannot fail on an ASCII string, since it simply returns a pointer to existing ASCII characters?

The assertion surprised me, I had to check how Unicode strings are implemented :-)

Same comment for the same assertion below.

}
else {
PyErr_Format(PyExc_TypeError,
Expand All @@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)

result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
complex_from_string_inner);
exit:
Py_DECREF(s_buffer);
return result;
}
Expand Down
7 changes: 3 additions & 4 deletions Objects/floatobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
if (s_buffer == NULL)
return NULL;
assert(PyUnicode_IS_ASCII(s_buffer));
/* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
if (s == NULL) {
Py_DECREF(s_buffer);
return NULL;
}
assert(s != NULL);
}
else if (PyBytes_Check(v)) {
s = PyBytes_AS_STRING(v);
Expand Down
21 changes: 9 additions & 12 deletions Objects/longobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
if (asciidig == NULL)
return NULL;
assert(PyUnicode_IS_ASCII(asciidig));
/* Simply get a pointer to existing ASCII characters. */
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
if (buffer == NULL) {
Py_DECREF(asciidig);
if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
return NULL;
}
else {
result = PyLong_FromString(buffer, &end, base);
if (end == NULL || (result != NULL && end == buffer + buflen)) {
Py_DECREF(asciidig);
return result;
}
assert(buffer != NULL);

result = PyLong_FromString(buffer, &end, base);
if (end == NULL || (result != NULL && end == buffer + buflen)) {
Py_DECREF(asciidig);
Py_XDECREF(result);
return result;
}
Py_DECREF(asciidig);
Py_XDECREF(result);
PyErr_Format(PyExc_ValueError,
"invalid literal for int() with base %d: %.200R",
base, u);
Expand Down
136 changes: 32 additions & 104 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)

/* --- Unicode Object ----------------------------------------------------- */

static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));

static inline Py_ssize_t
findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
Expand Down Expand Up @@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
return _PyUnicode_TranslateCharmap(str, mapping, errors);
}

static Py_UCS4
fix_decimal_and_space_to_ascii(PyObject *self)
{
/* No need to call PyUnicode_READY(self) because this function is only
called as a callback from fixup() which does it already. */
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
const int kind = PyUnicode_KIND(self);
void *data = PyUnicode_DATA(self);
Py_UCS4 maxchar = 127, ch, fixed;
int modified = 0;
Py_ssize_t i;

for (i = 0; i < len; ++i) {
ch = PyUnicode_READ(kind, data, i);
fixed = 0;
if (ch > 127) {
if (Py_UNICODE_ISSPACE(ch))
fixed = ' ';
else {
const int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0)
fixed = '0' + decimal;
}
if (fixed != 0) {
modified = 1;
maxchar = Py_MAX(maxchar, fixed);
PyUnicode_WRITE(kind, data, i, fixed);
}
else
maxchar = Py_MAX(maxchar, ch);
}
}

return (modified) ? maxchar : 0;
}

PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
Expand All @@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
if (PyUnicode_IS_ASCII(unicode)) {
/* If the string is already ASCII, just return the same string */
Py_INCREF(unicode);
return unicode;
}
return fixup(unicode, fix_decimal_and_space_to_ascii);

Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
PyObject *result = PyUnicode_New(len, 127);
if (result == NULL) {
return NULL;
}

Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
Py_ssize_t i;
for (i = 0; i < len; ++i) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch < 127) {
out[i] = ch;
}
else if (Py_UNICODE_ISSPACE(ch)) {
out[i] = ' ';
}
else {
int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal < 0) {
out[i] = '?';
_PyUnicode_LENGTH(result) = i + 1;
break;
}
out[i] = '0' + decimal;
}
}

return result;
}

PyObject *
Expand Down Expand Up @@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
return tailmatch(str, substr, start, end, direction);
}

/* Apply fixfct filter to the Unicode object self and return a
reference to the modified object */

static PyObject *
fixup(PyObject *self,
Py_UCS4 (*fixfct)(PyObject *s))
{
PyObject *u;
Py_UCS4 maxchar_old, maxchar_new = 0;
PyObject *v;

u = _PyUnicode_Copy(self);
if (u == NULL)
return NULL;
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);

/* fix functions return the new maximum character in a string,
if the kind of the resulting unicode object does not change,
everything is fine. Otherwise we need to change the string kind
and re-run the fix function. */
maxchar_new = fixfct(u);

if (maxchar_new == 0) {
/* no changes */;
if (PyUnicode_CheckExact(self)) {
Py_DECREF(u);
Py_INCREF(self);
return self;
}
else
return u;
}

maxchar_new = align_maxchar(maxchar_new);

if (maxchar_new == maxchar_old)
return u;

/* In case the maximum character changed, we need to
convert the string to the new category. */
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
if (v == NULL) {
Py_DECREF(u);
return NULL;
}
if (maxchar_new > maxchar_old) {
/* If the maxchar increased so that the kind changed, not all
characters are representable anymore and we need to fix the
string again. This only happens in very few cases. */
_PyUnicode_FastCopyCharacters(v, 0,
self, 0, PyUnicode_GET_LENGTH(self));
maxchar_old = fixfct(v);
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
}
else {
_PyUnicode_FastCopyCharacters(v, 0,
u, 0, PyUnicode_GET_LENGTH(self));
}
Py_DECREF(u);
assert(_PyUnicode_CheckConsistency(v, 1));
return v;
}

static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
Expand Down