Skip to content

Commit e215d96

Browse files
committed
Issue #16147: Rewrite PyUnicode_FromFormatV() to use _PyUnicodeWriter API
* Simplify the code: replace 4 steps with a single step using the _PyUnicodeWriter API. PyUnicode_Format() has the same design. This avoids storing intermediate results, which required allocating an array of pointers on the heap.
* Use the _PyUnicodeWriter API for speed (and its convenient API): overallocate the buffer to reduce the number of "realloc()" calls.
* Implement "width" and "precision" in Python instead of relying on sprintf(). This avoids the need for a temporary buffer allocated on the heap: only a small buffer allocated on the stack is used.
* Add _PyUnicodeWriter_WriteCstr() function.
* Split PyUnicode_FromFormatV() into two functions: add unicode_fromformat_arg().
* Inline parse_format_flags(): the format of an argument is now parsed only once, so a separate subfunction is no longer needed.
* Optimize PyUnicode_FromFormatV() for characters between two "%" arguments: search for the next "%" and copy the substring in one chunk, instead of copying character by character.
1 parent 2a09b6e commit e215d96

3 files changed

Lines changed: 364 additions & 484 deletions

File tree

Include/unicodeobject.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -933,12 +933,28 @@ PyAPI_FUNC(int)
933933
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
934934
Py_ssize_t length, Py_UCS4 maxchar);
935935

936+
/* Append a Unicode string.
937+
Return 0 on success, raise an exception and return -1 on error. */
936938
PyAPI_FUNC(int)
937-
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str);
939+
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
940+
PyObject *str /* Unicode string */
941+
);
938942

943+
/* Append a latin1-encoded byte string.
944+
Return 0 on success, raise an exception and return -1 on error. */
945+
PyAPI_FUNC(int)
946+
_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer,
947+
const char *str, /* latin1-encoded byte string */
948+
Py_ssize_t len /* length in bytes */
949+
);
950+
951+
/* Get the value of the writer as a Unicode string. Clear the
952+
buffer of the writer. Raise an exception and return NULL
953+
on error. */
939954
PyAPI_FUNC(PyObject *)
940955
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
941956

957+
/* Deallocate memory of a writer (clear its internal buffer). */
942958
PyAPI_FUNC(void)
943959
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
944960
#endif

Lib/test/test_unicode.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1769,6 +1769,22 @@ def PyUnicode_FromFormat(format, *args):
17691769
self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
17701770
self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')
17711771

1772+
# test padding (width and/or precision)
1773+
self.assertEqual(PyUnicode_FromFormat(b'%010i', c_int(123)), '123'.rjust(10, '0'))
1774+
self.assertEqual(PyUnicode_FromFormat(b'%100i', c_int(123)), '123'.rjust(100))
1775+
self.assertEqual(PyUnicode_FromFormat(b'%.100i', c_int(123)), '123'.rjust(100, '0'))
1776+
self.assertEqual(PyUnicode_FromFormat(b'%100.80i', c_int(123)), '123'.rjust(80, '0').rjust(100))
1777+
1778+
self.assertEqual(PyUnicode_FromFormat(b'%010u', c_uint(123)), '123'.rjust(10, '0'))
1779+
self.assertEqual(PyUnicode_FromFormat(b'%100u', c_uint(123)), '123'.rjust(100))
1780+
self.assertEqual(PyUnicode_FromFormat(b'%.100u', c_uint(123)), '123'.rjust(100, '0'))
1781+
self.assertEqual(PyUnicode_FromFormat(b'%100.80u', c_uint(123)), '123'.rjust(80, '0').rjust(100))
1782+
1783+
self.assertEqual(PyUnicode_FromFormat(b'%010x', c_int(0x123)), '123'.rjust(10, '0'))
1784+
self.assertEqual(PyUnicode_FromFormat(b'%100x', c_int(0x123)), '123'.rjust(100))
1785+
self.assertEqual(PyUnicode_FromFormat(b'%.100x', c_int(0x123)), '123'.rjust(100, '0'))
1786+
self.assertEqual(PyUnicode_FromFormat(b'%100.80x', c_int(0x123)), '123'.rjust(80, '0').rjust(100))
1787+
17721788
# test %A
17731789
text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
17741790
self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")

0 commit comments

Comments
 (0)