-
-
Notifications
You must be signed in to change notification settings - Fork 32.2k
bpo-34454: Fix issue with non-UTF8 separator strings #8862
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2f48e53
85a99ca
dd82aa0
c24388e
a0246a0
b89d4f5
160e779
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Fix the .fromisoformat() methods of datetime types crashing when given | ||
unicode with non-UTF-8-encodable code points. Specifically, | ||
datetime.fromisoformat() now accepts surrogate unicode code points used as | ||
the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { | |
Py_ssize_t len; | ||
|
||
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); | ||
if (dt_ptr == NULL) { | ||
goto invalid_string_error; | ||
} | ||
|
||
int year = 0, month = 0, day = 0; | ||
|
||
|
@@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { | |
} | ||
|
||
if (rv < 0) { | ||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", | ||
dt_ptr); | ||
return NULL; | ||
goto invalid_string_error; | ||
} | ||
|
||
return new_date_subclass_ex(year, month, day, cls); | ||
|
||
invalid_string_error: | ||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", | ||
dtstr); | ||
return NULL; | ||
} | ||
|
||
|
||
|
@@ -4258,15 +4264,18 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { | |
Py_ssize_t len; | ||
const char *p = PyUnicode_AsUTF8AndSize(tstr, &len); | ||
|
||
if (p == NULL) { | ||
goto invalid_string_error; | ||
} | ||
|
||
int hour = 0, minute = 0, second = 0, microsecond = 0; | ||
int tzoffset, tzimicrosecond = 0; | ||
int rv = parse_isoformat_time(p, len, | ||
&hour, &minute, &second, µsecond, | ||
&tzoffset, &tzimicrosecond); | ||
|
||
if (rv < 0) { | ||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p); | ||
return NULL; | ||
goto invalid_string_error; | ||
} | ||
|
||
PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, | ||
|
@@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { | |
|
||
Py_DECREF(tzinfo); | ||
return t; | ||
|
||
invalid_string_error: | ||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr); | ||
return NULL; | ||
} | ||
|
||
|
||
|
@@ -4839,6 +4852,33 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) | |
return result; | ||
} | ||
|
||
static PyObject * | ||
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { | ||
// `fromisoformat` allows surrogate characters in exactly one position, | ||
// the separator; to allow datetime_fromisoformat to make the simplifying | ||
// assumption that all valid strings can be encoded in UTF-8, this function | ||
// replaces any surrogate character separators with `T`. | ||
Py_ssize_t len = PyUnicode_GetLength(dtstr); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, good call. For some reason I had the impression that |
||
*needs_decref = 0; | ||
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { | ||
return dtstr; | ||
} | ||
|
||
PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
But I'm not sure that arbitrary character (including lone surrogates) should be accepted as a date-time separator. For example 2018-08-24016:10:00 looks pretty confusing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When we developed datetime.fromisoformat(dt.isoformat(*args, **kwargs)) == dt for all values of It may be worth bringing up the appropriate contract in the |
||
if (str_out == NULL) { | ||
return NULL; | ||
} | ||
|
||
if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 || | ||
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { | ||
Py_DECREF(str_out); | ||
return NULL; | ||
} | ||
|
||
*needs_decref = 1; | ||
return str_out; | ||
} | ||
|
||
static PyObject * | ||
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { | ||
assert(dtstr != NULL); | ||
|
@@ -4848,9 +4888,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { | |
return NULL; | ||
} | ||
|
||
int needs_decref = 0; | ||
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref); | ||
if (dtstr == NULL) { | ||
goto error; | ||
} | ||
|
||
Py_ssize_t len; | ||
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); | ||
const char * p = dt_ptr; | ||
|
||
if (dt_ptr == NULL) { | ||
goto invalid_string_error; | ||
} | ||
|
||
const char *p = dt_ptr; | ||
|
||
int year = 0, month = 0, day = 0; | ||
int hour = 0, minute = 0, second = 0, microsecond = 0; | ||
|
@@ -4883,20 +4934,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { | |
&tzoffset, &tzusec); | ||
} | ||
if (rv < 0) { | ||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr); | ||
return NULL; | ||
goto invalid_string_error; | ||
} | ||
|
||
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); | ||
if (tzinfo == NULL) { | ||
return NULL; | ||
goto error; | ||
} | ||
|
||
PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute, | ||
second, microsecond, tzinfo, cls); | ||
|
||
Py_DECREF(tzinfo); | ||
if (needs_decref) { | ||
Py_DECREF(dtstr); | ||
} | ||
return dt; | ||
|
||
invalid_string_error: | ||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This error message can contain not original string. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh good point. As part of the fixup I will change how this works so that |
||
|
||
error: | ||
if (needs_decref) { | ||
Py_DECREF(dtstr); | ||
} | ||
|
||
return NULL; | ||
} | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{
should be on new line.