Skip to content

Commit

Permalink
BUG: to_json not serializing non-nanosecond numpy dt64 correctly (#53757)

Browse files Browse the repository at this point in the history

* BUG: to_json not serializing non-nanosecond numpy dt64 correctly

* fix tests

* change extraction mech

* fix object array case

* pre-commit

* address comments
  • Loading branch information
lithomas1 authored Jul 25, 2023
1 parent d2f05c2 commit ee5cf2c
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 19 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ I/O
- Bug in :func:`read_sql` when reading multiple timezone aware columns with the same column name (:issue:`44421`)
- Bug in :func:`read_xml` stripping whitespace in string data (:issue:`53811`)
- Bug in :meth:`DataFrame.to_html` where ``colspace`` was incorrectly applied in case of multi index columns (:issue:`53885`)
- Bug in :meth:`DataFrame.to_json` where :class:`DatetimeArray`/:class:`DatetimeIndex` with non-nanosecond precision could not be serialized correctly (:issue:`53686`)
- Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
- Bug where ``bz2`` was treated as a hard requirement (:issue:`53857`)

Expand Down
5 changes: 4 additions & 1 deletion pandas/_libs/include/pandas/datetime/date_conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// len is mutated to save the length of the returned string
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
char *int64ToIso(int64_t value,
NPY_DATETIMEUNIT valueUnit,
NPY_DATETIMEUNIT base,
size_t *len);

// TODO(username): this function doesn't do a lot; should augment or
// replace with scaleNanosecToUnit
Expand Down
6 changes: 3 additions & 3 deletions pandas/_libs/include/pandas/datetime/pd_datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ typedef struct {
npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT,
const npy_datetimestruct *);
int (*scaleNanosecToUnit)(npy_int64 *, NPY_DATETIMEUNIT);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, size_t *);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
npy_datetime (*NpyDateTimeToEpoch)(npy_datetime, NPY_DATETIMEUNIT);
char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
Expand Down Expand Up @@ -73,8 +73,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL;
(npy_datetimestruct))
#define scaleNanosecToUnit(value, unit) \
PandasDateTimeAPI->scaleNanosecToUnit((value), (unit))
#define int64ToIso(value, base, len) \
PandasDateTimeAPI->int64ToIso((value), (base), (len))
#define int64ToIso(value, valueUnit, base, len) \
PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len))
#define NpyDateTimeToEpoch(dt, base) \
PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base))
#define PyDateTimeToIso(obj, base, len) \
Expand Down
7 changes: 5 additions & 2 deletions pandas/_libs/src/datetime/date_conversions.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@ int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
}

/* Converts the int64_t representation of a datetime to ISO; mutates len */
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
char *int64ToIso(int64_t value,
NPY_DATETIMEUNIT valueUnit,
NPY_DATETIMEUNIT base,
size_t *len) {
npy_datetimestruct dts;
int ret_code;

pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
pandas_datetime_to_datetimestruct(value, valueUnit, &dts);

*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);
Expand Down
50 changes: 37 additions & 13 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ typedef struct __PyObjectEncoder {

int datetimeIso;
NPY_DATETIMEUNIT datetimeUnit;
NPY_DATETIMEUNIT valueUnit;

// output format style for pandas data types
int outputFormat;
Expand Down Expand Up @@ -350,7 +351,8 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
/* JSON encoder callback: render the i8 datetime stored in
 * GET_TC(tc)->longValue as an ISO-8601 string.
 *
 * valueUnit is the resolution of the stored i8 value (populated from the
 * source ndarray's dtype metadata by the encoder); base is the requested
 * output precision (the encoder's date_unit option).  len is mutated to
 * the length of the returned string.  Passing valueUnit separately is what
 * makes non-nanosecond datetime64 values serialize correctly (GH 53686). */
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
/* Result is cached on the type context; presumably freed with the
 * context by the encoder — TODO(review): confirm ownership. */
GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
return GET_TC(tc)->cStr;
}

Expand All @@ -364,8 +366,9 @@ static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
/* JSON callback */
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
size_t *len) {
if (!PyDate_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Expected date object");
if (!PyDate_Check(obj) && !PyDateTime_Check(obj)) {
PyErr_SetString(PyExc_TypeError, "Expected date or datetime object");
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
return NULL;
}

Expand Down Expand Up @@ -502,6 +505,10 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
GET_TC(tc)->itemValue = obj;
Py_INCREF(obj);
((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
// Also write the resolution (unit) of the ndarray
PyArray_Descr *dtype = PyArray_DESCR(npyarr->array);
((PyObjectEncoder *)tc->encoder)->valueUnit =
get_datetime_metadata_from_dtype(dtype).base;
((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
} else {
Expand Down Expand Up @@ -1255,6 +1262,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
char **ret;
char *dataptr, *cLabel;
int type_num;
PyArray_Descr *dtype;
NPY_DATETIMEUNIT base = enc->datetimeUnit;

if (!labels) {
Expand Down Expand Up @@ -1283,6 +1291,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
stride = PyArray_STRIDE(labels, 0);
dataptr = PyArray_DATA(labels);
type_num = PyArray_TYPE(labels);
dtype = PyArray_DESCR(labels);

for (i = 0; i < num; i++) {
item = PyArray_GETITEM(labels, dataptr);
Expand All @@ -1293,7 +1302,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
}

int is_datetimelike = 0;
npy_int64 nanosecVal;
npy_int64 i8date;
NPY_DATETIMEUNIT dateUnit = NPY_FR_ns;
if (PyTypeNum_ISDATETIME(type_num)) {
is_datetimelike = 1;
PyArray_VectorUnaryFunc *castfunc =
Expand All @@ -1303,35 +1313,37 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
"Cannot cast numpy dtype %d to long",
enc->npyType);
}
castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
castfunc(dataptr, &i8date, 1, NULL, NULL);
dateUnit = get_datetime_metadata_from_dtype(dtype).base;
} else if (PyDate_Check(item) || PyDelta_Check(item)) {
is_datetimelike = 1;
if (PyObject_HasAttrString(item, "_value")) {
// see test_date_index_and_values for case with non-nano
nanosecVal = get_long_attr(item, "_value");
i8date = get_long_attr(item, "_value");
} else {
if (PyDelta_Check(item)) {
nanosecVal = total_seconds(item) *
i8date = total_seconds(item) *
1000000000LL; // nanoseconds per second
} else {
// datetime.* objects don't follow above rules
nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
i8date = PyDateTimeToEpoch(item, NPY_FR_ns);
}
}
}

if (is_datetimelike) {
if (nanosecVal == get_nat()) {
if (i8date == get_nat()) {
len = 4;
cLabel = PyObject_Malloc(len + 1);
strncpy(cLabel, "null", len + 1);
} else {
if (enc->datetimeIso) {
if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
cLabel = int64ToIsoDuration(nanosecVal, &len);
// TODO(username): non-nano timedelta support?
cLabel = int64ToIsoDuration(i8date, &len);
} else {
if (type_num == NPY_DATETIME) {
cLabel = int64ToIso(nanosecVal, base, &len);
cLabel = int64ToIso(i8date, dateUnit, base, &len);
} else {
cLabel = PyDateTimeToIso(item, base, &len);
}
Expand All @@ -1346,7 +1358,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
int size_of_cLabel = 21; // 21 chars for int 64
cLabel = PyObject_Malloc(size_of_cLabel);
snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT,
NpyDateTimeToEpoch(nanosecVal, base));
NpyDateTimeToEpoch(i8date, base));
len = strlen(cLabel);
}
}
Expand Down Expand Up @@ -1538,13 +1550,25 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
tc->type = JT_UTF8;
return;
} else if (PyArray_IsScalar(obj, Datetime)) {
npy_int64 longVal;
if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
tc->type = JT_NULL;
return;
}
PyArray_Descr *dtype = PyArray_DescrFromScalar(obj);
if (!PyTypeNum_ISDATETIME(dtype->type_num)) {
PyErr_Format(PyExc_ValueError, "Could not get resolution of datetime");
return;
}

PyArray_Descr *outcode = PyArray_DescrFromType(NPY_INT64);
PyArray_CastScalarToCtype(obj, &longVal, outcode);
Py_DECREF(outcode);

if (enc->datetimeIso) {
pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
GET_TC(tc)->longValue = longVal;
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
enc->valueUnit = get_datetime_metadata_from_dtype(dtype).base;
tc->type = JT_UTF8;
} else {
NPY_DATETIMEUNIT base =
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,45 @@ def test_date_unit(self, unit, datetime_frame):
result = read_json(StringIO(json), date_unit=None)
tm.assert_frame_equal(result, df)

@pytest.mark.parametrize("unit", ["s", "ms", "us"])
def test_iso_non_nano_datetimes(self, unit):
# Test that numpy datetimes
# in an Index or a column with non-nano resolution can be serialized
# correctly
# GH53686
# "date" exercises the native datetime64[unit] column path;
# "date_obj" exercises the object-dtype path (np.datetime64 scalars
# boxed in an object array); the index covers the label-encoding path.
index = DatetimeIndex(
[np.datetime64("2023-01-01T11:22:33.123456", unit)],
dtype=f"datetime64[{unit}]",
)
df = DataFrame(
{
"date": Series(
[np.datetime64("2022-01-01T11:22:33.123456", unit)],
dtype=f"datetime64[{unit}]",
index=index,
),
"date_obj": Series(
[np.datetime64("2023-01-01T11:22:33.123456", unit)],
dtype=object,
index=index,
),
},
)

# Serialize with ISO dates at the array's own resolution, then read back.
buf = StringIO()
df.to_json(buf, date_format="iso", date_unit=unit)
buf.seek(0)

# read_json always reads datetimes in nanosecond resolution
# TODO: check_dtype/check_index_type should be removable
# once read_json gets non-nano support
tm.assert_frame_equal(
read_json(buf, convert_dates=["date", "date_obj"]),
df,
check_index_type=False,
check_dtype=False,
)

def test_weird_nested_json(self):
# this used to core dump the parser
s = r"""{
Expand Down

0 comments on commit ee5cf2c

Please sign in to comment.