-
-
Notifications
You must be signed in to change notification settings - Fork 32.2k
gh-111089: Add cache to PyUnicode_AsUTF8() for embedded NUL #111587
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8004312
a7e93c9
4ccd7d9
3c4844f
e224751
65c6671
30bb725
07975be
e3c6fa5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Add ``PyASCIIObject.state.embed_null`` member to Python str objects. It is | ||
used as a cache by :c:func:`PyUnicode_AsUTF8` to only check once if a string | ||
contains a null character. :c:func:`PyUnicode_FromString` initializes | ||
*embed_null* to 0 since the string it creates cannot contain a null character. | ||
Patch by Victor Stinner. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -189,6 +189,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently | |
# define OVERALLOCATE_FACTOR 4 | ||
#endif | ||
|
||
#define EMBED_NULL_UNKNOWN 2 | ||
|
||
/* Forward declaration */ | ||
static inline int | ||
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch); | ||
|
@@ -205,6 +207,10 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, | |
static inline int unicode_is_finalizing(void); | ||
static int unicode_is_singleton(PyObject *unicode); | ||
#endif | ||
static inline Py_ssize_t | ||
findchar(const void *s, int kind, | ||
Py_ssize_t size, Py_UCS4 ch, | ||
int direction); | ||
|
||
|
||
// Return a reference to the immortal empty string singleton. | ||
|
@@ -623,6 +629,15 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) | |
} | ||
CHECK(PyUnicode_READ(kind, data, ascii->length) == 0); | ||
} | ||
|
||
if (_PyUnicode_STATE(ascii).embed_null != EMBED_NULL_UNKNOWN) { | ||
Py_ssize_t pos = findchar(PyUnicode_DATA(ascii), | ||
PyUnicode_KIND(ascii), | ||
PyUnicode_GET_LENGTH(ascii), | ||
0, 1); | ||
CHECK(_PyUnicode_STATE(ascii).embed_null == (pos >= 0)); | ||
} | ||
|
||
return 1; | ||
|
||
#undef CHECK | ||
|
@@ -1000,6 +1015,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length) | |
_PyUnicode_UTF8(unicode) = NULL; | ||
_PyUnicode_UTF8_LENGTH(unicode) = 0; | ||
} | ||
_PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN; | ||
#ifdef Py_TRACE_REFS | ||
_Py_ForgetReference(unicode); | ||
#endif | ||
|
@@ -1053,6 +1069,7 @@ resize_inplace(PyObject *unicode, Py_ssize_t length) | |
_PyUnicode_UTF8(unicode) = NULL; | ||
_PyUnicode_UTF8_LENGTH(unicode) = 0; | ||
} | ||
_PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN; | ||
|
||
data = (PyObject *)PyObject_Realloc(data, new_size); | ||
if (data == NULL) { | ||
|
@@ -1253,6 +1270,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) | |
_PyUnicode_STATE(unicode).compact = 1; | ||
_PyUnicode_STATE(unicode).ascii = is_ascii; | ||
_PyUnicode_STATE(unicode).statically_allocated = 0; | ||
_PyUnicode_STATE(unicode).embed_null = EMBED_NULL_UNKNOWN; | ||
if (is_ascii) { | ||
((char*)data)[size] = 0; | ||
} | ||
|
@@ -1777,6 +1795,8 @@ unicode_char(Py_UCS4 ch) | |
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); | ||
PyUnicode_4BYTE_DATA(unicode)[0] = ch; | ||
} | ||
// ch >= 256 and so cannot be 0 | ||
_PyUnicode_STATE(unicode).embed_null = 0; | ||
assert(_PyUnicode_CheckConsistency(unicode, 1)); | ||
return unicode; | ||
} | ||
|
@@ -1793,8 +1813,13 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) | |
return NULL; | ||
} | ||
|
||
unsigned int embed_null; | ||
if (size == -1) { | ||
size = wcslen(u); | ||
embed_null = 0; | ||
} | ||
else { | ||
embed_null = EMBED_NULL_UNKNOWN; | ||
} | ||
|
||
/* If the Unicode data is known at construction time, we can apply | ||
|
@@ -1859,6 +1884,7 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size) | |
default: | ||
Py_UNREACHABLE(); | ||
} | ||
_PyUnicode_STATE(unicode).embed_null = embed_null; | ||
|
||
return unicode_result(unicode); | ||
} | ||
|
@@ -1890,7 +1916,16 @@ PyUnicode_FromString(const char *u) | |
PyErr_SetString(PyExc_OverflowError, "input too long"); | ||
return NULL; | ||
} | ||
return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); | ||
PyObject *unicode; | ||
unicode = PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL); | ||
if (unicode != NULL) { | ||
// PyUnicode_DecodeUTF8Stateful(u, strlen(u)) cannot create NUL | ||
// characters: the UTF-8 decoder with the strict error handler only | ||
// creates a NUL character if the input string contains a NUL byte | ||
// which cannot be the case here. | ||
_PyUnicode_STATE(unicode).embed_null = 0; | ||
} | ||
return unicode; | ||
} | ||
|
||
|
||
|
@@ -1932,6 +1967,7 @@ _PyUnicode_FromId(_Py_Identifier *id) | |
if (!obj) { | ||
return NULL; | ||
} | ||
_PyUnicode_STATE(obj).embed_null = 0; | ||
PyUnicode_InternInPlace(&obj); | ||
|
||
if (index >= ids->size) { | ||
|
@@ -2204,6 +2240,7 @@ _PyUnicode_Copy(PyObject *unicode) | |
|
||
memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode), | ||
length * PyUnicode_KIND(unicode)); | ||
_PyUnicode_STATE(copy).embed_null = _PyUnicode_STATE(unicode).embed_null; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, technically, any string can be modified at any time through the C API. People "should not do that", but since it's possible, I'm not sure it's safe to assume that people will not mutate a string long after its creation, that is, after the cache has been initialized. |
||
assert(_PyUnicode_CheckConsistency(copy, 1)); | ||
return copy; | ||
} | ||
|
@@ -3846,10 +3883,29 @@ PyUnicode_AsUTF8(PyObject *unicode) | |
{ | ||
Py_ssize_t size; | ||
const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size); | ||
if (utf8 != NULL && strlen(utf8) != (size_t)size) { | ||
PyErr_SetString(PyExc_ValueError, "embedded null character"); | ||
if (utf8 == NULL) { | ||
return NULL; | ||
} | ||
|
||
// Cache to avoid calling O(n) strlen() operation at every | ||
// PyUnicode_AsUTF8() call on the same object. | ||
if (_PyUnicode_STATE(unicode).embed_null != 0) { | ||
if (_PyUnicode_STATE(unicode).embed_null == EMBED_NULL_UNKNOWN) { | ||
if (strlen(utf8) != (size_t)size) { | ||
_PyUnicode_STATE(unicode).embed_null = 1; | ||
} | ||
else { | ||
_PyUnicode_STATE(unicode).embed_null = 0; | ||
} | ||
} | ||
|
||
if (_PyUnicode_STATE(unicode).embed_null == 1) { | ||
PyErr_SetString(PyExc_ValueError, | ||
"embedded null character"); | ||
return NULL; | ||
} | ||
} | ||
|
||
return utf8; | ||
} | ||
|
||
|
@@ -11039,6 +11095,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) | |
Py_DECREF(left); | ||
*p_left = res; | ||
} | ||
assert(_PyUnicode_STATE(*p_left).embed_null == EMBED_NULL_UNKNOWN); | ||
assert(_PyUnicode_CheckConsistency(*p_left, 1)); | ||
return; | ||
|
||
|
@@ -14627,6 +14684,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode) | |
_PyUnicode_STATE(self).compact = 0; | ||
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; | ||
_PyUnicode_STATE(self).statically_allocated = 0; | ||
_PyUnicode_STATE(self).embed_null = EMBED_NULL_UNKNOWN; | ||
_PyUnicode_UTF8_LENGTH(self) = 0; | ||
_PyUnicode_UTF8(self) = NULL; | ||
_PyUnicode_DATA_ANY(self) = NULL; | ||
|
Uh oh!
There was an error while loading. Please reload this page.