python · vstinner · Nov 2, 2023 · serhiy-storchaka · Nov 3, 2023 · serhiy-storchaka
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -971,6 +971,12 @@ These are the UTF-8 codec APIs:
    returned buffer always has an extra null byte appended (not included in
    *size*), regardless of whether there are any other null code points.
 
+   If *size* is NULL and the *unicode* string contains embedded null
+   characters, raise an exception. To accept embedded null characters and
+   truncate on purpose at the first null byte, :c:func:`PyUnicode_AsUTF8Unsafe`
+   and :c:func:`PyUnicode_AsUTF8AndSize(unicode, &size)
+   <PyUnicode_AsUTF8AndSize>` can be used instead.
+
    On error, set an exception, set *size* to ``-1`` (if it's not NULL) and
    return ``NULL``.
 
@@ -987,15 +993,21 @@ These are the UTF-8 codec APIs:
    .. versionchanged:: 3.10
       This function is a part of the :ref:`limited API <limited-c-api>`.
 
+   .. versionchanged:: 3.13
+      Raise an exception if *size* is NULL and the string contains embedded
+      null characters.
+
 
 .. c:function:: const char* PyUnicode_AsUTF8(PyObject *unicode)
 
-   As :c:func:`PyUnicode_AsUTF8AndSize`, but does not store the size.
+   Equivalent to :c:func:`PyUnicode_AsUTF8AndSize(unicode, NULL)
+   <PyUnicode_AsUTF8AndSize>`: the size is not stored.
 
    Raise an exception if the *unicode* string contains embedded null
-   characters. To accept embedded null characters and truncate on purpose
-   at the first null byte, ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be
-   used instead.
+   characters. To accept embedded null characters and truncate on purpose at
+   the first null byte, :c:func:`PyUnicode_AsUTF8Unsafe` and
+   :c:func:`PyUnicode_AsUTF8AndSize(unicode, &size) <PyUnicode_AsUTF8AndSize>`
+   can be used instead.
 
    .. versionadded:: 3.3
 
@@ -1005,6 +1017,16 @@ These are the UTF-8 codec APIs:
    .. versionchanged:: 3.13
       Raise an exception if the string contains embedded null characters.
 
+.. c:function:: const char* PyUnicode_AsUTF8Unsafe(PyObject *unicode)
+
+   Similar to :c:func:`PyUnicode_AsUTF8`, but does not raise an exception if the
+   string contains embedded null characters.
+
+   This function can be used to truncate a string on purpose at the first null
+   character.
+
+   .. versionadded:: 3.13
+
 
 UTF-32 Codecs
 """""""""""""

diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
@@ -1128,6 +1128,11 @@ New Features
 * Add :c:func:`PyUnicode_AsUTF8` function to the limited C API.
   (Contributed by Victor Stinner in :gh:`111089`.)
 
+* Add :c:func:`PyUnicode_AsUTF8Unsafe` function: similar to
+  :c:func:`PyUnicode_AsUTF8`, but does not raise an exception if the string
+  contains embedded null characters.
+  (Contributed by Victor Stinner in :gh:`111089`.)
+
 
 Porting to Python 3.13
 ----------------------
@@ -1198,10 +1203,13 @@ Porting to Python 3.13
   Note that ``Py_TRASHCAN_BEGIN`` has a second argument which
   should be the deallocation function it is in.
 
-* The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the string
-  contains embedded null characters. To accept embedded null characters and
-  truncate on purpose at the first null byte,
-  ``PyUnicode_AsUTF8AndSize(unicode, NULL)`` can be used instead.
+* The :c:func:`PyUnicode_AsUTF8` and
+  :c:func:`PyUnicode_AsUTF8AndSize(unicode, NULL) <PyUnicode_AsUTF8AndSize>`
+  functions now raise an exception if the string contains embedded null
+  characters. To accept embedded null characters and truncate on purpose at the
+  first null byte, :c:func:`PyUnicode_AsUTF8Unsafe` and
+  :c:func:`PyUnicode_AsUTF8AndSize(unicode, &size) <PyUnicode_AsUTF8AndSize>`
+  can be used instead.
   (Contributed by Victor Stinner in :gh:`111089`.)
 
 * On Windows, ``Python.h`` no longer includes the ``<stddef.h>`` standard

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -451,7 +451,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
 // This function caches the UTF-8 encoded string in the Unicode object
 // and subsequent calls will return the same string. The memory is released
 // when the Unicode object is deallocated.
-PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
+PyAPI_FUNC(const char*) PyUnicode_AsUTF8(PyObject *unicode);
+
+// Similar to PyUnicode_AsUTF8(), but does not raise an exception if the string
+// contains embedded null characters.
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
+PyAPI_FUNC(const char*) PyUnicode_AsUTF8Unsafe(PyObject *unicode);
+#endif
 
 // Returns a pointer to the UTF-8 encoding of the
 // Unicode object unicode and the size of the encoded representation

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -875,24 +875,37 @@ def test_fromordinal(self):
         self.assertRaises(ValueError, fromordinal, 0x110000)
         self.assertRaises(ValueError, fromordinal, -1)
 
+    def check_asutf8(self, unicode_asutf8):  # shared by test_asutf8 and test_asutf8unsafe
+        self.assertEqual(unicode_asutf8('abc', 4), b'abc\0')
+        self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0')
+        self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0')
+
+        self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0)  # lone surrogate
+        self.assertRaises(TypeError, unicode_asutf8, b'abc', 0)  # bytes, not str
+        self.assertRaises(TypeError, unicode_asutf8, [], 0)  # list, not str
+        # CRASHES unicode_asutf8(NULL, 0)
+
     @support.cpython_only
     @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     def test_asutf8(self):
         """Test PyUnicode_AsUTF8()"""
         from _testcapi import unicode_asutf8
-
-        self.assertEqual(unicode_asutf8('abc', 4), b'abc\0')
-        self.assertEqual(unicode_asutf8('абв', 7), b'\xd0\xb0\xd0\xb1\xd0\xb2\0')
-        self.assertEqual(unicode_asutf8('\U0001f600', 5), b'\xf0\x9f\x98\x80\0')
+        self.check_asutf8(unicode_asutf8)
 
         # disallow embedded null characters
         self.assertRaises(ValueError, unicode_asutf8, 'abc\0', 0)
         self.assertRaises(ValueError, unicode_asutf8, 'abc\0def', 0)
 
-        self.assertRaises(UnicodeEncodeError, unicode_asutf8, '\ud8ff', 0)
-        self.assertRaises(TypeError, unicode_asutf8, b'abc', 0)
-        self.assertRaises(TypeError, unicode_asutf8, [], 0)
-        # CRASHES unicode_asutf8(NULL, 0)
+    @support.cpython_only
+    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
+    def test_asutf8unsafe(self):
+        """Test PyUnicode_AsUTF8Unsafe()"""
+        from _testcapi import unicode_asutf8unsafe
+        self.check_asutf8(unicode_asutf8unsafe)
+
+        # embedded null characters are accepted; the returned buffer is not truncated
+        self.assertEqual(unicode_asutf8unsafe('abc\0', 4), b'abc\0')
+        self.assertEqual(unicode_asutf8unsafe('abc\0def', 8), b'abc\0def\0')
 
     @support.cpython_only
     @unittest.skipIf(_testcapi is None, 'need _testcapi module')
@@ -904,9 +917,11 @@ def test_asutf8andsize(self):
         self.assertEqual(unicode_asutf8andsize('abc', 4), (b'abc\0', 3))
         self.assertEqual(unicode_asutf8andsize('абв', 7), (b'\xd0\xb0\xd0\xb1\xd0\xb2\0', 6))
         self.assertEqual(unicode_asutf8andsize('\U0001f600', 5), (b'\xf0\x9f\x98\x80\0', 4))
+
         self.assertEqual(unicode_asutf8andsize('abc\0def', 8), (b'abc\0def\0', 7))
         self.assertEqual(unicode_asutf8andsize_null('abc', 4), b'abc\0')
-        self.assertEqual(unicode_asutf8andsize_null('abc\0def', 8), b'abc\0def\0')
+        # PyUnicode_AsUTF8AndSize(str, NULL) disallows embedded null characters
+        self.assertRaises(ValueError, unicode_asutf8andsize_null, 'abc\0def', 8)
 
         self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, '\ud8ff', 0)
         self.assertRaises(TypeError, unicode_asutf8andsize, b'abc', 0)

diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py
diff --git a/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst b/Misc/NEWS.d/next/C API/2023-10-20-01-42-43.gh-issue-111089.VIrd5q.rst
@@ -1,2 +1,4 @@
-The :c:func:`PyUnicode_AsUTF8` function now raises an exception if the
-string contains embedded null characters. Patch by Victor Stinner.
+The :c:func:`PyUnicode_AsUTF8` and
+:c:func:`PyUnicode_AsUTF8AndSize(unicode, NULL) <PyUnicode_AsUTF8AndSize>`
+functions now raise an exception if the string contains embedded null
+characters. Patch by Victor Stinner.
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml
@@ -2480,3 +2480,5 @@
     added = '3.13'
 [function.PyUnicode_AsUTF8]
     added = '3.13'
+[function.PyUnicode_AsUTF8Unsafe]
+    added = '3.13'
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c
@@ -619,6 +619,25 @@ unicode_asutf8(PyObject *self, PyObject *args)
     return PyBytes_FromStringAndSize(s, buflen);
 }
 
+/* Test PyUnicode_AsUTF8Unsafe(): args are (unicode, buflen); returns bytes */
+static PyObject *
+unicode_asutf8unsafe(PyObject *self, PyObject *args)
+{
+    PyObject *unicode;
+    Py_ssize_t buflen;
+    const char *s;
+
+    if (!PyArg_ParseTuple(args, "On", &unicode, &buflen))
+        return NULL;
+
+    NULLABLE(unicode);
+    s = PyUnicode_AsUTF8Unsafe(unicode);
+    if (s == NULL)
+        return NULL;
+
+    return PyBytes_FromStringAndSize(s, buflen);  // caller-chosen length: bytes past an embedded null stay visible
+}
+
 /* Test PyUnicode_AsUTF8AndSize() */
 static PyObject *
 unicode_asutf8andsize(PyObject *self, PyObject *args)
@@ -2031,6 +2050,7 @@ static PyMethodDef TestMethods[] = {
     {"unicode_asucs4copy",       unicode_asucs4copy,             METH_VARARGS},
     {"unicode_fromordinal",      unicode_fromordinal,            METH_VARARGS},
     {"unicode_asutf8",           unicode_asutf8,                 METH_VARARGS},
+    {"unicode_asutf8unsafe",     unicode_asutf8unsafe,           METH_VARARGS},
     {"unicode_asutf8andsize",    unicode_asutf8andsize,          METH_VARARGS},
     {"unicode_asutf8andsize_null",unicode_asutf8andsize_null,    METH_VARARGS},
     {"unicode_getdefaultencoding",unicode_getdefaultencoding,    METH_NOARGS},

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -205,6 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static inline int unicode_is_finalizing(void);
 static int unicode_is_singleton(PyObject *unicode);
 #endif
+static int unicode_fill_utf8(PyObject *unicode);
 
 
 // Return a reference to the immortal empty string singleton.
@@ -3813,10 +3814,8 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 }
 
 
-static int unicode_fill_utf8(PyObject *unicode);
-
-const char *
-PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
+static const char *
+unicode_as_utf8(PyObject *unicode, Py_ssize_t *psize, int check_embed_null)
 {
     if (!PyUnicode_Check(unicode)) {
         PyErr_BadArgument();
@@ -3826,31 +3825,47 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
         return NULL;
     }
 
-    if (PyUnicode_UTF8(unicode) == NULL) {
+    const char *utf8 = PyUnicode_UTF8(unicode);
+    if (utf8 == NULL) {
         if (unicode_fill_utf8(unicode) == -1) {
             if (psize) {
                 *psize = -1;
             }
             return NULL;
         }
+        utf8 = PyUnicode_UTF8(unicode);
     }
 
     if (psize) {
         *psize = PyUnicode_UTF8_LENGTH(unicode);
     }
+
+    if (check_embed_null) {
+        if (strlen(utf8) != (size_t)PyUnicode_UTF8_LENGTH(unicode)) {
+            PyErr_SetString(PyExc_ValueError, "embedded null character");
+            return NULL;
+        }
+    }
+
     return PyUnicode_UTF8(unicode);
 }
 
-const char *
+const char*
+PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
+{
+    return unicode_as_utf8(unicode, psize, psize == NULL);  // NULL psize: embedded nulls are rejected
+}
+
+const char*
 PyUnicode_AsUTF8(PyObject *unicode)
 {
-    Py_ssize_t size;
-    const char *utf8 = PyUnicode_AsUTF8AndSize(unicode, &size);
-    if (utf8 != NULL && strlen(utf8) != (size_t)size) {
-        PyErr_SetString(PyExc_ValueError, "embedded null character");
-        return NULL;
-    }
-    return utf8;
+    return unicode_as_utf8(unicode, NULL, 1);  // 1: raise ValueError on embedded null characters
+}
+
+const char*
+PyUnicode_AsUTF8Unsafe(PyObject *unicode)
+{
+    return unicode_as_utf8(unicode, NULL, 0);  // 0: skip the embedded null character check
 }
 
 /*