Skip to content

Commit 232b303

Browse files
[3.12] gh-52551: Fix encoding issues in strftime() (GH-125193) (GH-125657) (GH-125661)
Fix time.strftime(), the strftime() method and formatting of the datetime classes datetime, date and time. * Characters not encodable in the current locale are now acceptable in the format string. * Surrogate pairs and sequences of surrogateescape-encoded bytes are no longer recombined. * An embedded null character no longer terminates the format string. This also fixes gh-78662 and gh-124531. (cherry picked from commit 08ccbb9) (cherry picked from commit ad3eac1) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 5e62d9b commit 232b303

File tree

5 files changed

+293
-211
lines changed

5 files changed

+293
-211
lines changed

Lib/test/datetimetester.py

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2816,11 +2816,32 @@ def test_more_strftime(self):
28162816
self.assertEqual(t.strftime("%z"), "-0200" + z)
28172817
self.assertEqual(t.strftime("%:z"), "-02:00:" + z)
28182818

2819-
# bpo-34482: Check that surrogates don't cause a crash.
2820-
try:
2821-
t.strftime('%y\ud800%m %H\ud800%M')
2822-
except UnicodeEncodeError:
2823-
pass
2819+
def test_strftime_special(self):
2820+
t = self.theclass(2004, 12, 31, 6, 22, 33, 47)
2821+
s1 = t.strftime('%c')
2822+
s2 = t.strftime('%B')
2823+
# gh-52551, gh-78662: Unicode strings should pass through strftime,
2824+
# independently from locale.
2825+
self.assertEqual(t.strftime('\U0001f40d'), '\U0001f40d')
2826+
self.assertEqual(t.strftime('\U0001f4bb%c\U0001f40d%B'), f'\U0001f4bb{s1}\U0001f40d{s2}')
2827+
self.assertEqual(t.strftime('%c\U0001f4bb%B\U0001f40d'), f'{s1}\U0001f4bb{s2}\U0001f40d')
2828+
# Lone surrogates should pass through.
2829+
self.assertEqual(t.strftime('\ud83d'), '\ud83d')
2830+
self.assertEqual(t.strftime('\udc0d'), '\udc0d')
2831+
self.assertEqual(t.strftime('\ud83d%c\udc0d%B'), f'\ud83d{s1}\udc0d{s2}')
2832+
self.assertEqual(t.strftime('%c\ud83d%B\udc0d'), f'{s1}\ud83d{s2}\udc0d')
2833+
self.assertEqual(t.strftime('%c\udc0d%B\ud83d'), f'{s1}\udc0d{s2}\ud83d')
2834+
# Surrogate pairs should not recombine.
2835+
self.assertEqual(t.strftime('\ud83d\udc0d'), '\ud83d\udc0d')
2836+
self.assertEqual(t.strftime('%c\ud83d\udc0d%B'), f'{s1}\ud83d\udc0d{s2}')
2837+
# Surrogate-escaped bytes should not recombine.
2838+
self.assertEqual(t.strftime('\udcf0\udc9f\udc90\udc8d'), '\udcf0\udc9f\udc90\udc8d')
2839+
self.assertEqual(t.strftime('%c\udcf0\udc9f\udc90\udc8d%B'), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
2840+
# gh-124531: The null character should not terminate the format string.
2841+
self.assertEqual(t.strftime('\0'), '\0')
2842+
self.assertEqual(t.strftime('\0'*1000), '\0'*1000)
2843+
self.assertEqual(t.strftime('\0%c\0%B'), f'\0{s1}\0{s2}')
2844+
self.assertEqual(t.strftime('%c\0%B\0'), f'{s1}\0{s2}\0')
28242845

28252846
def test_extract(self):
28262847
dt = self.theclass(2002, 3, 4, 18, 45, 3, 1234)
@@ -3573,6 +3594,33 @@ def test_strftime(self):
35733594
# gh-85432: The parameter was named "fmt" in the pure-Python impl.
35743595
t.strftime(format="%f")
35753596

3597+
def test_strftime_special(self):
3598+
t = self.theclass(1, 2, 3, 4)
3599+
s1 = t.strftime('%I%p%Z')
3600+
s2 = t.strftime('%X')
3601+
# gh-52551, gh-78662: Unicode strings should pass through strftime,
3602+
# independently from locale.
3603+
self.assertEqual(t.strftime('\U0001f40d'), '\U0001f40d')
3604+
self.assertEqual(t.strftime('\U0001f4bb%I%p%Z\U0001f40d%X'), f'\U0001f4bb{s1}\U0001f40d{s2}')
3605+
self.assertEqual(t.strftime('%I%p%Z\U0001f4bb%X\U0001f40d'), f'{s1}\U0001f4bb{s2}\U0001f40d')
3606+
# Lone surrogates should pass through.
3607+
self.assertEqual(t.strftime('\ud83d'), '\ud83d')
3608+
self.assertEqual(t.strftime('\udc0d'), '\udc0d')
3609+
self.assertEqual(t.strftime('\ud83d%I%p%Z\udc0d%X'), f'\ud83d{s1}\udc0d{s2}')
3610+
self.assertEqual(t.strftime('%I%p%Z\ud83d%X\udc0d'), f'{s1}\ud83d{s2}\udc0d')
3611+
self.assertEqual(t.strftime('%I%p%Z\udc0d%X\ud83d'), f'{s1}\udc0d{s2}\ud83d')
3612+
# Surrogate pairs should not recombine.
3613+
self.assertEqual(t.strftime('\ud83d\udc0d'), '\ud83d\udc0d')
3614+
self.assertEqual(t.strftime('%I%p%Z\ud83d\udc0d%X'), f'{s1}\ud83d\udc0d{s2}')
3615+
# Surrogate-escaped bytes should not recombine.
3616+
self.assertEqual(t.strftime('\udcf0\udc9f\udc90\udc8d'), '\udcf0\udc9f\udc90\udc8d')
3617+
self.assertEqual(t.strftime('%I%p%Z\udcf0\udc9f\udc90\udc8d%X'), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
3618+
# gh-124531: The null character should not terminate the format string.
3619+
self.assertEqual(t.strftime('\0'), '\0')
3620+
self.assertEqual(t.strftime('\0'*1000), '\0'*1000)
3621+
self.assertEqual(t.strftime('\0%I%p%Z\0%X'), f'\0{s1}\0{s2}')
3622+
self.assertEqual(t.strftime('%I%p%Z\0%X\0'), f'{s1}\0{s2}\0')
3623+
35763624
def test_format(self):
35773625
t = self.theclass(1, 2, 3, 4)
35783626
self.assertEqual(t.__format__(''), str(t))
@@ -4002,9 +4050,8 @@ def tzname(self, dt): return self.tz
40024050
self.assertRaises(TypeError, t.strftime, "%Z")
40034051

40044052
# Issue #6697:
4005-
if '_Fast' in self.__class__.__name__:
4006-
Badtzname.tz = '\ud800'
4007-
self.assertRaises(ValueError, t.strftime, "%Z")
4053+
Badtzname.tz = '\ud800'
4054+
self.assertEqual(t.strftime("%Z"), '\ud800')
40084055

40094056
def test_hash_edge_cases(self):
40104057
# Offsets that overflow a basic time.

Lib/test/test_time.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,33 @@ def test_strftime(self):
177177
self.fail('conversion specifier: %r failed.' % format)
178178

179179
self.assertRaises(TypeError, time.strftime, b'%S', tt)
180-
# embedded null character
181-
self.assertRaises(ValueError, time.strftime, '%S\0', tt)
180+
181+
def test_strftime_special(self):
182+
tt = time.gmtime(self.t)
183+
s1 = time.strftime('%c', tt)
184+
s2 = time.strftime('%B', tt)
185+
# gh-52551, gh-78662: Unicode strings should pass through strftime,
186+
# independently from locale.
187+
self.assertEqual(time.strftime('\U0001f40d', tt), '\U0001f40d')
188+
self.assertEqual(time.strftime('\U0001f4bb%c\U0001f40d%B', tt), f'\U0001f4bb{s1}\U0001f40d{s2}')
189+
self.assertEqual(time.strftime('%c\U0001f4bb%B\U0001f40d', tt), f'{s1}\U0001f4bb{s2}\U0001f40d')
190+
# Lone surrogates should pass through.
191+
self.assertEqual(time.strftime('\ud83d', tt), '\ud83d')
192+
self.assertEqual(time.strftime('\udc0d', tt), '\udc0d')
193+
self.assertEqual(time.strftime('\ud83d%c\udc0d%B', tt), f'\ud83d{s1}\udc0d{s2}')
194+
self.assertEqual(time.strftime('%c\ud83d%B\udc0d', tt), f'{s1}\ud83d{s2}\udc0d')
195+
self.assertEqual(time.strftime('%c\udc0d%B\ud83d', tt), f'{s1}\udc0d{s2}\ud83d')
196+
# Surrogate pairs should not recombine.
197+
self.assertEqual(time.strftime('\ud83d\udc0d', tt), '\ud83d\udc0d')
198+
self.assertEqual(time.strftime('%c\ud83d\udc0d%B', tt), f'{s1}\ud83d\udc0d{s2}')
199+
# Surrogate-escaped bytes should not recombine.
200+
self.assertEqual(time.strftime('\udcf0\udc9f\udc90\udc8d', tt), '\udcf0\udc9f\udc90\udc8d')
201+
self.assertEqual(time.strftime('%c\udcf0\udc9f\udc90\udc8d%B', tt), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
202+
# gh-124531: The null character should not terminate the format string.
203+
self.assertEqual(time.strftime('\0', tt), '\0')
204+
self.assertEqual(time.strftime('\0'*1000, tt), '\0'*1000)
205+
self.assertEqual(time.strftime('\0%c\0%B', tt), f'\0{s1}\0{s2}')
206+
self.assertEqual(time.strftime('%c\0%B\0', tt), f'{s1}\0{s2}\0')
182207

183208
def _bounds_checking(self, func):
184209
# Make sure that strftime() checks the bounds of the various parts
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Fix encoding issues in :func:`time.strftime`, the
2+
:meth:`~datetime.datetime.strftime` method of the :mod:`datetime` classes
3+
:class:`~datetime.datetime`, :class:`~datetime.date` and
4+
:class:`~datetime.time` and formatting of these classes. Characters not
5+
encodable in the current locale are now acceptable in the format string.
6+
Surrogate pairs and sequences of surrogateescape-encoded bytes are no longer
7+
recombined. An embedded null character no longer terminates the format
8+
string.

Modules/_datetimemodule.c

Lines changed: 67 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,7 +1501,7 @@ make_somezreplacement(PyObject *object, char *sep, PyObject *tzinfoarg)
15011501
PyObject *tzinfo = get_tzinfo_member(object);
15021502

15031503
if (tzinfo == Py_None || tzinfo == NULL) {
1504-
return PyBytes_FromStringAndSize(NULL, 0);
1504+
return PyUnicode_FromStringAndSize(NULL, 0);
15051505
}
15061506

15071507
assert(tzinfoarg != NULL);
@@ -1512,7 +1512,7 @@ make_somezreplacement(PyObject *object, char *sep, PyObject *tzinfoarg)
15121512
tzinfoarg) < 0)
15131513
return NULL;
15141514

1515-
return PyBytes_FromStringAndSize(buf, strlen(buf));
1515+
return PyUnicode_FromString(buf);
15161516
}
15171517

15181518
static PyObject *
@@ -1569,7 +1569,7 @@ make_freplacement(PyObject *object)
15691569
else
15701570
sprintf(freplacement, "%06d", 0);
15711571

1572-
return PyBytes_FromStringAndSize(freplacement, strlen(freplacement));
1572+
return PyUnicode_FromString(freplacement);
15731573
}
15741574

15751575
/* I sure don't want to reproduce the strftime code from the time module,
@@ -1590,159 +1590,124 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple,
15901590
PyObject *Zreplacement = NULL; /* py string, replacement for %Z */
15911591
PyObject *freplacement = NULL; /* py string, replacement for %f */
15921592

1593-
const char *pin; /* pointer to next char in input format */
1594-
Py_ssize_t flen; /* length of input format */
1595-
char ch; /* next char in input format */
1596-
1597-
PyObject *newfmt = NULL; /* py string, the output format */
1598-
char *pnew; /* pointer to available byte in output format */
1599-
size_t totalnew; /* number bytes total in output format buffer,
1600-
exclusive of trailing \0 */
1601-
size_t usednew; /* number bytes used so far in output format buffer */
1602-
1603-
const char *ptoappend; /* ptr to string to append to output buffer */
1604-
Py_ssize_t ntoappend; /* # of bytes to append to output buffer */
1605-
16061593
assert(object && format && timetuple);
16071594
assert(PyUnicode_Check(format));
1608-
/* Convert the input format to a C string and size */
1609-
pin = PyUnicode_AsUTF8AndSize(format, &flen);
1610-
if (!pin)
1595+
1596+
PyObject *strftime = _PyImport_GetModuleAttrString("time", "strftime");
1597+
if (strftime == NULL) {
16111598
return NULL;
1599+
}
16121600

16131601
/* Scan the input format, looking for %z/%Z/%f escapes, building
16141602
* a new format. Since computing the replacements for those codes
16151603
* is expensive, don't unless they're actually used.
16161604
*/
1617-
if (flen > INT_MAX - 1) {
1618-
PyErr_NoMemory();
1619-
goto Done;
1620-
}
16211605

1622-
totalnew = flen + 1; /* realistic if no %z/%Z */
1623-
newfmt = PyBytes_FromStringAndSize(NULL, totalnew);
1624-
if (newfmt == NULL) goto Done;
1625-
pnew = PyBytes_AsString(newfmt);
1626-
usednew = 0;
1627-
1628-
while ((ch = *pin++) != '\0') {
1629-
if (ch != '%') {
1630-
ptoappend = pin - 1;
1631-
ntoappend = 1;
1606+
_PyUnicodeWriter writer;
1607+
_PyUnicodeWriter_Init(&writer);
1608+
writer.overallocate = 1;
1609+
1610+
Py_ssize_t flen = PyUnicode_GET_LENGTH(format);
1611+
Py_ssize_t i = 0;
1612+
Py_ssize_t start = 0;
1613+
Py_ssize_t end = 0;
1614+
while (i != flen) {
1615+
i = PyUnicode_FindChar(format, '%', i, flen, 1);
1616+
if (i < 0) {
1617+
assert(!PyErr_Occurred());
1618+
break;
16321619
}
1633-
else if ((ch = *pin++) == '\0') {
1634-
/* Null byte follows %, copy only '%'.
1635-
*
1636-
* Back the pin up one char so that we catch the null check
1637-
* the next time through the loop.*/
1638-
pin--;
1639-
ptoappend = pin - 1;
1640-
ntoappend = 1;
1620+
end = i;
1621+
i++;
1622+
if (i == flen) {
1623+
break;
16411624
}
1625+
Py_UCS4 ch = PyUnicode_READ_CHAR(format, i);
1626+
i++;
16421627
/* A % has been seen and ch is the character after it. */
1643-
else if (ch == 'z') {
1628+
PyObject *replacement = NULL;
1629+
if (ch == 'z') {
16441630
/* %z -> +HHMM */
16451631
if (zreplacement == NULL) {
16461632
zreplacement = make_somezreplacement(object, "", tzinfoarg);
16471633
if (zreplacement == NULL)
1648-
goto Done;
1634+
goto Error;
16491635
}
1650-
assert(zreplacement != NULL);
1651-
assert(PyBytes_Check(zreplacement));
1652-
ptoappend = PyBytes_AS_STRING(zreplacement);
1653-
ntoappend = PyBytes_GET_SIZE(zreplacement);
1636+
replacement = zreplacement;
16541637
}
1655-
else if (ch == ':' && *pin == 'z' && pin++) {
1638+
else if (ch == ':' && i < flen && PyUnicode_READ_CHAR(format, i) == 'z') {
16561639
/* %:z -> +HH:MM */
1640+
i++;
16571641
if (colonzreplacement == NULL) {
16581642
colonzreplacement = make_somezreplacement(object, ":", tzinfoarg);
16591643
if (colonzreplacement == NULL)
1660-
goto Done;
1644+
goto Error;
16611645
}
1662-
assert(colonzreplacement != NULL);
1663-
assert(PyBytes_Check(colonzreplacement));
1664-
ptoappend = PyBytes_AS_STRING(colonzreplacement);
1665-
ntoappend = PyBytes_GET_SIZE(colonzreplacement);
1646+
replacement = colonzreplacement;
16661647
}
16671648
else if (ch == 'Z') {
16681649
/* format tzname */
16691650
if (Zreplacement == NULL) {
16701651
Zreplacement = make_Zreplacement(object,
16711652
tzinfoarg);
16721653
if (Zreplacement == NULL)
1673-
goto Done;
1654+
goto Error;
16741655
}
1675-
assert(Zreplacement != NULL);
1676-
assert(PyUnicode_Check(Zreplacement));
1677-
ptoappend = PyUnicode_AsUTF8AndSize(Zreplacement,
1678-
&ntoappend);
1679-
if (ptoappend == NULL)
1680-
goto Done;
1656+
replacement = Zreplacement;
16811657
}
16821658
else if (ch == 'f') {
16831659
/* format microseconds */
16841660
if (freplacement == NULL) {
16851661
freplacement = make_freplacement(object);
16861662
if (freplacement == NULL)
1687-
goto Done;
1663+
goto Error;
16881664
}
1689-
assert(freplacement != NULL);
1690-
assert(PyBytes_Check(freplacement));
1691-
ptoappend = PyBytes_AS_STRING(freplacement);
1692-
ntoappend = PyBytes_GET_SIZE(freplacement);
1665+
replacement = freplacement;
16931666
}
16941667
else {
16951668
/* percent followed by something else */
1696-
ptoappend = pin - 2;
1697-
ntoappend = 2;
1698-
}
1699-
1700-
/* Append the ntoappend chars starting at ptoappend to
1701-
* the new format.
1702-
*/
1703-
if (ntoappend == 0)
17041669
continue;
1705-
assert(ptoappend != NULL);
1706-
assert(ntoappend > 0);
1707-
while (usednew + ntoappend > totalnew) {
1708-
if (totalnew > (PY_SSIZE_T_MAX >> 1)) { /* overflow */
1709-
PyErr_NoMemory();
1710-
goto Done;
1711-
}
1712-
totalnew <<= 1;
1713-
if (_PyBytes_Resize(&newfmt, totalnew) < 0)
1714-
goto Done;
1715-
pnew = PyBytes_AsString(newfmt) + usednew;
17161670
}
1717-
memcpy(pnew, ptoappend, ntoappend);
1718-
pnew += ntoappend;
1719-
usednew += ntoappend;
1720-
assert(usednew <= totalnew);
1671+
assert(replacement != NULL);
1672+
assert(PyUnicode_Check(replacement));
1673+
if (_PyUnicodeWriter_WriteSubstring(&writer, format, start, end) < 0) {
1674+
goto Error;
1675+
}
1676+
start = i;
1677+
if (_PyUnicodeWriter_WriteStr(&writer, replacement) < 0) {
1678+
goto Error;
1679+
}
17211680
} /* end while() */
17221681

1723-
if (_PyBytes_Resize(&newfmt, usednew) < 0)
1724-
goto Done;
1725-
{
1726-
PyObject *format;
1727-
PyObject *strftime = _PyImport_GetModuleAttrString("time", "strftime");
1728-
1729-
if (strftime == NULL)
1682+
PyObject *newformat;
1683+
if (start == 0) {
1684+
_PyUnicodeWriter_Dealloc(&writer);
1685+
newformat = Py_NewRef(format);
1686+
}
1687+
else {
1688+
if (_PyUnicodeWriter_WriteSubstring(&writer, format, start, flen) < 0) {
1689+
goto Error;
1690+
}
1691+
newformat = _PyUnicodeWriter_Finish(&writer);
1692+
if (newformat == NULL) {
17301693
goto Done;
1731-
format = PyUnicode_FromString(PyBytes_AS_STRING(newfmt));
1732-
if (format != NULL) {
1733-
result = PyObject_CallFunctionObjArgs(strftime,
1734-
format, timetuple, NULL);
1735-
Py_DECREF(format);
17361694
}
1737-
Py_DECREF(strftime);
17381695
}
1696+
result = PyObject_CallFunctionObjArgs(strftime,
1697+
newformat, timetuple, NULL);
1698+
Py_DECREF(newformat);
1699+
17391700
Done:
17401701
Py_XDECREF(freplacement);
17411702
Py_XDECREF(zreplacement);
17421703
Py_XDECREF(colonzreplacement);
17431704
Py_XDECREF(Zreplacement);
1744-
Py_XDECREF(newfmt);
1705+
Py_XDECREF(strftime);
17451706
return result;
1707+
1708+
Error:
1709+
_PyUnicodeWriter_Dealloc(&writer);
1710+
goto Done;
17461711
}
17471712

17481713
/* ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)