Skip to content

Commit

Permalink
Preserve nanosecond resolution when encoding/decoding times (#7827)
Browse files Browse the repository at this point in the history
* preserve nanosecond resolution when encoding/decoding times.

* Apply suggestions from code review

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* use emit_user_level_warning

* move time alignment for nc3 to encode_nc3_variable

* fix test for encode_cf_timedelta

* fix CFMaskCoder for time-like (also allow timedelta64), add first tests

* rename to _unpack_time_units_and_ref_date as suggested in review

* refactor delta -> time_units as suggested in review

* refactor out function _time_units_to_timedelta64, reorder flow and remove unneeded checks, apply filterwarnings, adapt tests

* import _is_time_like from coding.variables

* adapt tests, add _numpy_to_netcdf_timeunit-conversion function

* adapt tests, add _numpy_to_netcdf_timeunit-conversion function

* adapt test as per review, remove arm_xfail for backend test

* add whats-new.rst entry

* Update doc/whats-new.rst

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* Update doc/whats-new.rst

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* fix whats-new.rst

---------

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>
  • Loading branch information
kmuehlbauer and spencerkclark authored Sep 17, 2023
1 parent ecdb8c1 commit b6d4bf7
Show file tree
Hide file tree
Showing 6 changed files with 315 additions and 45 deletions.
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ Bug fixes
- Calling plot with kwargs ``col``, ``row`` or ``hue`` no longer squeezes dimensions passed via these arguments
(:issue:`7552`, :pull:`8174`).
By `Wiktor Kraśnicki <https://github.com/wkrasnicki>`_.
- Fixed a bug where casting from ``float`` to ``int64`` (undefined for ``NaN``) led to varying
issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`,
:issue:`1064`, :pull:`7827`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
Expand All @@ -86,6 +90,8 @@ Internal Changes

- Many error messages related to invalid dimensions or coordinates now always show the list of valid dims/coords (:pull:`8079`).
By `András Gunyhó <https://github.com/mgunyho>`_.
- Refactor of encoding and decoding times/timedeltas to preserve nanosecond resolution in arrays that contain missing values (:pull:`7827`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

.. _whats-new.2023.08.0:

Expand Down
19 changes: 18 additions & 1 deletion xarray/backends/netcdf3.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,30 @@ def encode_nc3_attrs(attrs):
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}


def _maybe_prepare_times(var):
# checks for integer-based time-like and
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
# this keeps backwards compatibility

data = var.data
if data.dtype.kind in "iu":
units = var.attrs.get("units", None)
if units is not None:
if coding.variables._is_time_like(units):
mask = data == np.iinfo(np.int64).min
if mask.any():
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
return data


def encode_nc3_variable(var):
for coder in [
coding.strings.EncodedStringCoder(allows_unicode=False),
coding.strings.CharacterArrayCoder(),
]:
var = coder.encode(var)
data = coerce_nc3_dtype(var.data)
data = _maybe_prepare_times(var)
data = coerce_nc3_dtype(data)
attrs = encode_nc3_attrs(var.attrs)
return Variable(var.dims, data, attrs, var.encoding)

Expand Down
139 changes: 104 additions & 35 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pycompat import is_duck_dask_array
from xarray.core.utils import emit_user_level_warning
from xarray.core.variable import Variable

try:
Expand Down Expand Up @@ -122,6 +123,18 @@ def _netcdf_to_numpy_timeunit(units: str) -> str:
}[units]


def _numpy_to_netcdf_timeunit(units: str) -> str:
return {
"ns": "nanoseconds",
"us": "microseconds",
"ms": "milliseconds",
"s": "seconds",
"m": "minutes",
"h": "hours",
"D": "days",
}[units]


def _ensure_padded_year(ref_date: str) -> str:
# Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
# are ambiguous (is it YMD or DMY?). This can lead to some very odd
Expand Down Expand Up @@ -171,6 +184,20 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
# same us _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
time_units, _ref_date = _unpack_netcdf_time_units(units)
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(_ref_date)
# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if ref_date.tz is not None:
ref_date = ref_date.tz_convert(None)
return time_units, ref_date


def _decode_cf_datetime_dtype(
data, units: str, calendar: str, use_cftime: bool | None
) -> np.dtype:
Expand Down Expand Up @@ -222,8 +249,8 @@ def _decode_datetime_with_pandas(
"pandas."
)

delta, ref_date = _unpack_netcdf_time_units(units)
delta = _netcdf_to_numpy_timeunit(delta)
time_units, ref_date = _unpack_netcdf_time_units(units)
time_units = _netcdf_to_numpy_timeunit(time_units)
try:
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
Expand All @@ -237,8 +264,8 @@ def _decode_datetime_with_pandas(
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
if flat_num_dates.size > 0:
# avoid size 0 datetimes GH1329
pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date

# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
Expand All @@ -251,9 +278,12 @@ def _decode_datetime_with_pandas(

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype(
np.int64
)
# properly handle NaN/NaT to prevent casting NaN to int
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
Expand Down Expand Up @@ -364,6 +394,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
return "seconds"


def _time_units_to_timedelta64(units: str) -> np.timedelta64:
return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]")


def infer_calendar_name(dates) -> CFCalendar:
"""Given an array of datetimes, infer the CF calendar name"""
if is_np_datetime_like(dates.dtype):
Expand Down Expand Up @@ -572,9 +606,12 @@ def _should_cftime_be_used(


def _cleanup_netcdf_time_units(units: str) -> str:
delta, ref_date = _unpack_netcdf_time_units(units)
time_units, ref_date = _unpack_netcdf_time_units(units)
time_units = time_units.lower()
if not time_units.endswith("s"):
time_units = f"{time_units}s"
try:
units = f"{delta} since {format_timestamp(ref_date)}"
units = f"{time_units} since {format_timestamp(ref_date)}"
except (OutOfBoundsDatetime, ValueError):
# don't worry about reifying the units if they're out of bounds or
# formatted badly
Expand Down Expand Up @@ -633,62 +670,93 @@ def encode_cf_datetime(
"""
dates = np.asarray(dates)

data_units = infer_datetime_units(dates)

if units is None:
units = infer_datetime_units(dates)
units = data_units
else:
units = _cleanup_netcdf_time_units(units)

if calendar is None:
calendar = infer_calendar_name(dates)

delta, _ref_date = _unpack_netcdf_time_units(units)
try:
if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
# parse with cftime instead
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

delta_units = _netcdf_to_numpy_timeunit(delta)
time_delta = np.timedelta64(1, delta_units).astype("timedelta64[ns]")

# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(_ref_date)
time_units, ref_date = _unpack_time_units_and_ref_date(units)
time_delta = _time_units_to_timedelta64(time_units)

# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if ref_date.tz is not None:
ref_date = ref_date.tz_convert(None)
# retrieve needed units to faithfully encode to int64
needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
if data_units != units:
# this accounts for differences in the reference times
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
if ref_delta > np.timedelta64(0, "ns"):
needed_units = _infer_time_units_from_diff(ref_delta)

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
# dates to be encoded (GH 2272).
dates_as_index = pd.DatetimeIndex(dates.ravel())
time_deltas = dates_as_index - ref_date

# Use floor division if time_delta evenly divides all differences
# to preserve integer dtype if possible (GH 4045).
if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
num = time_deltas // time_delta
# needed time delta to encode faithfully to int64
needed_time_delta = _time_units_to_timedelta64(needed_units)
if time_delta <= needed_time_delta:
# calculate int64 floor division
# to preserve integer dtype if possible (GH 4045, GH7817).
num = time_deltas // time_delta.astype(np.int64)
num = num.astype(np.int64, copy=False)
else:
emit_user_level_warning(
f"Times can't be serialized faithfully with requested units {units!r}. "
f"Resolution of {needed_units!r} needed. "
f"Serializing timeseries to floating point."
)
num = time_deltas / time_delta
num = num.values.reshape(dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
num = _encode_datetime_with_cftime(dates, units, calendar)
# do it now only for cftime-based flow
# we already covered for this in pandas-based flow
num = cast_to_int_if_safe(num)

num = cast_to_int_if_safe(num)
return (num, units, calendar)


def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarray, str]:
if units is None:
units = infer_timedelta_units(timedeltas)
data_units = infer_timedelta_units(timedeltas)

np_unit = _netcdf_to_numpy_timeunit(units)
num = 1.0 * timedeltas / np.timedelta64(1, np_unit)
num = np.where(pd.isnull(timedeltas), np.nan, num)
num = cast_to_int_if_safe(num)
if units is None:
units = data_units

time_delta = _time_units_to_timedelta64(units)
time_deltas = pd.TimedeltaIndex(timedeltas.ravel())

# retrieve needed units to faithfully encode to int64
needed_units = data_units
if data_units != units:
needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna()))

# needed time delta to encode faithfully to int64
needed_time_delta = _time_units_to_timedelta64(needed_units)
if time_delta <= needed_time_delta:
# calculate int64 floor division
# to preserve integer dtype if possible
num = time_deltas // time_delta.astype(np.int64)
num = num.astype(np.int64, copy=False)
else:
emit_user_level_warning(
f"Timedeltas can't be serialized faithfully with requested units {units!r}. "
f"Resolution of {needed_units!r} needed. "
f"Serializing timedeltas to floating point."
)
num = time_deltas / time_delta
num = num.values.reshape(timedeltas.shape)
return (num, units)


Expand All @@ -702,9 +770,10 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
) or contains_cftime_datetimes(variable):
dims, data, attrs, encoding = unpack_for_encoding(variable)

(data, units, calendar) = encode_cf_datetime(
data, encoding.pop("units", None), encoding.pop("calendar", None)
)
units = encoding.pop("units", None)
calendar = encoding.pop("calendar", None)
(data, units, calendar) = encode_cf_datetime(data, units, calendar)

safe_setitem(attrs, "units", units, name=name)
safe_setitem(attrs, "calendar", calendar, name=name)

Expand Down
38 changes: 35 additions & 3 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,21 @@ def _apply_mask(
return np.where(condition, decoded_fill_value, data)


def _is_time_like(units):
# test for time-like
time_strings = [
"since",
"days",
"hours",
"minutes",
"seconds",
"milliseconds",
"microseconds",
"nanoseconds",
]
return any(tstr in str(units) for tstr in time_strings)


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

Expand All @@ -236,19 +251,32 @@ def encode(self, variable: Variable, name: T_Name = None):
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

# special case DateTime to properly handle NaT
is_time_like = _is_time_like(attrs.get("units"))

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and not fv_exists:
data = duck_array_ops.fillna(data, fill_value)
if is_time_like and data.dtype.kind in "iu":
data = duck_array_ops.where(
data != np.iinfo(np.int64).min, data, fill_value
)
else:
data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)

Expand All @@ -275,7 +303,11 @@ def decode(self, variable: Variable, name: T_Name = None):
stacklevel=3,
)

dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)

if encoded_fill_values:
transform = partial(
Expand Down
2 changes: 0 additions & 2 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@
from xarray.core.options import set_options
from xarray.core.pycompat import array_type
from xarray.tests import (
arm_xfail,
assert_allclose,
assert_array_equal,
assert_equal,
Expand Down Expand Up @@ -526,7 +525,6 @@ def test_roundtrip_string_encoded_characters(self) -> None:
assert_identical(expected, actual)
assert actual["x"].encoding["_Encoding"] == "ascii"

@arm_xfail
def test_roundtrip_numpy_datetime_data(self) -> None:
times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
expected = Dataset({"t": ("t", times), "t0": times[0]})
Expand Down
Loading

0 comments on commit b6d4bf7

Please sign in to comment.