Skip to content

bpo-34454: Fix issue with non-UTF8 separator strings #8862

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion Lib/test/datetimetester.py
Original file line number Diff line number Diff line change
Expand Up @@ -1667,6 +1667,7 @@ def test_fromisoformat_fails(self):
# Test that fromisoformat() fails on invalid values
bad_strs = [
'', # Empty string
'\ud800', # bpo-34454: Surrogate code point
'009-03-04', # Not 10 characters
'123456789', # Not a date
'200a-12-04', # Invalid character in year
Expand All @@ -1675,6 +1676,7 @@ def test_fromisoformat_fails(self):
'2009-01-32', # Invalid day
'2009-02-29', # Invalid leap day
'20090228', # Valid ISO8601 output not from isoformat()
'2009\ud80002\ud80028', # Separators are surrogate codepoints
]

for bad_str in bad_strs:
Expand Down Expand Up @@ -2587,7 +2589,8 @@ def test_fromisoformat_separators(self):
' ', 'T', '\u007f', # 1-bit widths
'\u0080', 'ʁ', # 2-bit widths
'ᛇ', '時', # 3-bit widths
'🐍' # 4-bit widths
'🐍', # 4-bit widths
'\ud800', # bpo-34454: Surrogate code point
]

for sep in separators:
Expand Down Expand Up @@ -2639,6 +2642,7 @@ def test_fromisoformat_fails_datetime(self):
# Test that fromisoformat() fails on invalid values
bad_strs = [
'', # Empty string
'\ud800', # bpo-34454: Surrogate code point
'2009.04-19T03', # Wrong first separator
'2009-04.19T03', # Wrong second separator
'2009-04-19T0a', # Invalid hours
Expand All @@ -2652,6 +2656,8 @@ def test_fromisoformat_fails_datetime(self):
'2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset
'2009-04-19T03:15:45.123456-24:30', # Invalid negative offset
'2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators
'2009-04\ud80010T12:15', # Surrogate char in date
'2009-04-10T12\ud80015', # Surrogate char in time
'2009-04-19T1', # Incomplete hours
'2009-04-19T12:3', # Incomplete minutes
'2009-04-19T12:30:4', # Incomplete seconds
Expand Down Expand Up @@ -3521,6 +3527,7 @@ def test_fromisoformat_timespecs(self):
def test_fromisoformat_fails(self):
bad_strs = [
'', # Empty string
'12\ud80000', # Invalid separator - surrogate char
'12:', # Ends on a separator
'12:30:', # Ends on a separator
'12:30:15.', # Ends on a separator
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix the .fromisoformat() methods of datetime types crashing when given
unicode with non-UTF-8-encodable code points. Specifically,
datetime.fromisoformat() now accepts surrogate unicode code points used as
the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.
81 changes: 72 additions & 9 deletions Modules/_datetimemodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
Py_ssize_t len;

const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
if (dt_ptr == NULL) {
goto invalid_string_error;
}

int year = 0, month = 0, day = 0;

Expand All @@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
}

if (rv < 0) {
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
dt_ptr);
return NULL;
goto invalid_string_error;
}

return new_date_subclass_ex(year, month, day, cls);

invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
dtstr);
return NULL;
}


Expand Down Expand Up @@ -4258,15 +4264,18 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
Py_ssize_t len;
const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);

if (p == NULL) {
goto invalid_string_error;
}

int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset, tzimicrosecond = 0;
int rv = parse_isoformat_time(p, len,
&hour, &minute, &second, &microsecond,
&tzoffset, &tzimicrosecond);

if (rv < 0) {
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
return NULL;
goto invalid_string_error;
}

PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
Expand All @@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {

Py_DECREF(tzinfo);
return t;

invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
return NULL;
}


Expand Down Expand Up @@ -4839,6 +4852,33 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
return result;
}

static PyObject *
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

{ should be on new line.

// `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`.
Py_ssize_t len = PyUnicode_GetLength(dtstr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PyUnicode_GetLength() can set an exception and return -1.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, good call. For some reason I had the impression that Py_ssize_t was an unsigned integer.

*needs_decref = 0;
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
return dtstr;
}

PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_PyUnicode_Copy() could be used.

But I'm not sure that arbitrary character (including lone surrogates) should be accepted as a date-time separator. For example 2018-08-24016:10:00 looks pretty confusing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we developed datetime.fromisoformat, the contract of the function was that it should act as the inverse of datetime.isoformat, meaning that it should satisfy

datetime.fromisoformat(dt.isoformat(*args, **kwargs)) == dt

for all values of dt, args and kwargs. Since dt.isoformat(sep='\ud800') and dt.isoformat(sep='0') are valid, we need to accept any character in the separator position in order to comply with the contract of the function.

It may be worth bringing up the appropriate contract in the datetime-SIG mailing list, but it was decided to use a very simple contract so that it's very simple to define what is and is not the correct behavior for this function (which could otherwise grow unwieldy).

if (str_out == NULL) {
return NULL;
}

if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
Py_DECREF(str_out);
return NULL;
}

*needs_decref = 1;
return str_out;
}

static PyObject *
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
assert(dtstr != NULL);
Expand All @@ -4848,9 +4888,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
return NULL;
}

int needs_decref = 0;
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
if (dtstr == NULL) {
goto error;
}

Py_ssize_t len;
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
const char * p = dt_ptr;

if (dt_ptr == NULL) {
goto invalid_string_error;
}

const char *p = dt_ptr;

int year = 0, month = 0, day = 0;
int hour = 0, minute = 0, second = 0, microsecond = 0;
Expand Down Expand Up @@ -4883,20 +4934,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
&tzoffset, &tzusec);
}
if (rv < 0) {
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
return NULL;
goto invalid_string_error;
}

PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
if (tzinfo == NULL) {
return NULL;
goto error;
}

PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
second, microsecond, tzinfo, cls);

Py_DECREF(tzinfo);
if (needs_decref) {
Py_DECREF(dtstr);
}
return dt;

invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This error message can contain not original string.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh good point. As part of the fixup I will change how this works so that dtstr is not re-assigned.


error:
if (needs_decref) {
Py_DECREF(dtstr);
}

return NULL;
}


Expand Down