From cc74d3ae226594db23730279711d4379debcf9a8 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 10 Sep 2024 11:12:08 -0400 Subject: [PATCH] Add days_in_year and decimal_year to dt accessor (#9105) * Add days_in_year and decimal_year to dt accessor * Upd whats new - add gregorian calendar - rename to decimal_year * Add to api.rst and pr number * Add requires cftime decorators where needed * Rewrite functions using suggestions from review * cleaner custom date field - docstrings - remove bad merge * add new fields to dask access test * Revert to rollback method * Revert "Revert to rollback method" This reverts commit 3f429c9cf81ed0f78f59fdb45044ac4ad558f65f. * explicit float cast? * Revert back to rollback method * Fix dask compatibility issues * Approach that passes tests under NumPy 1.26.4 * Adapt decimal_year test to be more comprehensive * Use proper sphinx roles for cross-referencing. --------- Co-authored-by: Spencer Clark --- doc/api.rst | 2 + doc/whats-new.rst | 7 ++ xarray/coding/calendar_ops.py | 115 +++++++++++++++++-------------- xarray/coding/cftimeindex.py | 5 ++ xarray/core/accessor_dt.py | 29 ++++++++ xarray/tests/test_accessor_dt.py | 55 +++++++++++++++ 6 files changed, 161 insertions(+), 52 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 6ed8d513934..d79c0612a98 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -529,9 +529,11 @@ Datetimelike properties DataArray.dt.quarter DataArray.dt.days_in_month DataArray.dt.daysinmonth + DataArray.dt.days_in_year DataArray.dt.season DataArray.dt.time DataArray.dt.date + DataArray.dt.decimal_year DataArray.dt.calendar DataArray.dt.is_month_start DataArray.dt.is_month_end diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 82cea1b3d7d..b5e3ced3be8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,13 @@ v2024.07.1 (unreleased) New Features ~~~~~~~~~~~~ + +- Add :py:attr:`~core.accessor_dt.DatetimeAccessor.days_in_year` and :py:attr:`~core.accessor_dt.DatetimeAccessor.decimal_year` to the Datetime accessor on DataArrays. (:pull:`9105`). + By `Pascal Bourgault `_. + +Performance +~~~~~~~~~~~ + - Make chunk manager an option in ``set_options`` (:pull:`9362`). By `Tom White `_. - Support for :ref:`grouping by multiple variables `. diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py index 1b2875b26f1..52a487ca46d 100644 --- a/xarray/coding/calendar_ops.py +++ b/xarray/coding/calendar_ops.py @@ -9,7 +9,12 @@ _should_cftime_be_used, convert_times, ) -from xarray.core.common import _contains_datetime_like_objects, is_np_datetime_like +from xarray.core.common import ( + _contains_datetime_like_objects, + full_like, + is_np_datetime_like, +) +from xarray.core.computation import apply_ufunc try: import cftime @@ -25,16 +30,6 @@ ] -def _days_in_year(year, calendar, use_cftime=True): - """Return the number of days in the input year according to the input calendar.""" - date_type = get_date_type(calendar, use_cftime=use_cftime) - if year == -1 and calendar in _CALENDARS_WITHOUT_YEAR_ZERO: - difference = date_type(year + 2, 1, 1) - date_type(year, 1, 1) - else: - difference = date_type(year + 1, 1, 1) - date_type(year, 1, 1) - return difference.days - - def convert_calendar( obj, calendar, @@ -191,11 +186,7 @@ def convert_calendar( # Special case for conversion involving 360_day calendar if align_on == "year": # Instead of translating dates directly, this tries to keep the position within a year similar. - new_doy = time.groupby(f"{dim}.year").map( - _interpolate_day_of_year, - target_calendar=calendar, - use_cftime=use_cftime, - ) + new_doy = _interpolate_day_of_year(time, target_calendar=calendar) elif align_on == "random": # The 5 days to remove are randomly chosen, one for each of the five 72-days periods of the year. new_doy = time.groupby(f"{dim}.year").map( @@ -242,16 +233,25 @@ def convert_calendar( return out -def _interpolate_day_of_year(time, target_calendar, use_cftime): - """Returns the nearest day in the target calendar of the corresponding - "decimal year" in the source calendar. - """ - year = int(time.dt.year[0]) - source_calendar = time.dt.calendar +def _is_leap_year(years, calendar): + func = np.vectorize(cftime.is_leap_year) + return func(years, calendar=calendar) + + +def _days_in_year(years, calendar): + """The number of days in the year according to given calendar.""" + if calendar == "360_day": + return full_like(years, 360) + return _is_leap_year(years, calendar).astype(int) + 365 + + +def _interpolate_day_of_year(times, target_calendar): + """Returns the nearest day in the target calendar of the corresponding "decimal year" in the source calendar.""" + source_calendar = times.dt.calendar return np.round( - _days_in_year(year, target_calendar, use_cftime) - * time.dt.dayofyear - / _days_in_year(year, source_calendar, use_cftime) + _days_in_year(times.dt.year, target_calendar) + * times.dt.dayofyear + / _days_in_year(times.dt.year, source_calendar) ).astype(int) @@ -260,18 +260,18 @@ def _random_day_of_year(time, target_calendar, use_cftime): Removes Feb 29th and five other days chosen randomly within five sections of 72 days. """ - year = int(time.dt.year[0]) + year = time.dt.year[0] source_calendar = time.dt.calendar new_doy = np.arange(360) + 1 rm_idx = np.random.default_rng().integers(0, 72, 5) + 72 * np.arange(5) if source_calendar == "360_day": for idx in rm_idx: new_doy[idx + 1 :] = new_doy[idx + 1 :] + 1 - if _days_in_year(year, target_calendar, use_cftime) == 366: + if _days_in_year(year, target_calendar) == 366: new_doy[new_doy >= 60] = new_doy[new_doy >= 60] + 1 elif target_calendar == "360_day": new_doy = np.insert(new_doy, rm_idx - np.arange(5), -1) - if _days_in_year(year, source_calendar, use_cftime) == 366: + if _days_in_year(year, source_calendar) == 366: new_doy = np.insert(new_doy, 60, -1) return new_doy[time.dt.dayofyear - 1] @@ -304,32 +304,45 @@ def _convert_to_new_calendar_with_new_day_of_year( return np.nan -def _datetime_to_decimal_year(times, dim="time", calendar=None): - """Convert a datetime DataArray to decimal years according to its calendar or the given one. +def _decimal_year_cftime(time, year, days_in_year, *, date_class): + year_start = date_class(year, 1, 1) + delta = np.timedelta64(time - year_start, "ns") + days_in_year = np.timedelta64(days_in_year, "D") + return year + delta / days_in_year + + +def _decimal_year_numpy(time, year, days_in_year, *, dtype): + time = np.asarray(time).astype(dtype) + year_start = np.datetime64(int(year) - 1970, "Y").astype(dtype) + delta = time - year_start + days_in_year = np.timedelta64(days_in_year, "D") + return year + delta / days_in_year + + +def _decimal_year(times): + """Convert a datetime DataArray to decimal years according to its calendar. The decimal year of a timestamp is its year plus its sub-year component converted to the fraction of its year. Ex: '2000-03-01 12:00' is 2000.1653 in a standard calendar, 2000.16301 in a "noleap" or 2000.16806 in a "360_day". """ - from xarray.core.dataarray import DataArray - - calendar = calendar or times.dt.calendar - - if is_np_datetime_like(times.dtype): - times = times.copy(data=convert_times(times.values, get_date_type("standard"))) - - def _make_index(time): - year = int(time.dt.year[0]) - doys = cftime.date2num(time, f"days since {year:04d}-01-01", calendar=calendar) - return DataArray( - year + doys / _days_in_year(year, calendar), - dims=(dim,), - coords=time.coords, - name=dim, - ) - - return times.groupby(f"{dim}.year").map(_make_index) + if times.dtype == "O": + function = _decimal_year_cftime + kwargs = {"date_class": get_date_type(times.dt.calendar, True)} + else: + function = _decimal_year_numpy + kwargs = {"dtype": times.dtype} + return apply_ufunc( + function, + times, + times.dt.year, + times.dt.days_in_year, + kwargs=kwargs, + vectorize=True, + dask="parallelized", + output_dtypes=[np.float64], + ) def interp_calendar(source, target, dim="time"): @@ -372,9 +385,7 @@ def interp_calendar(source, target, dim="time"): f"Both 'source.{dim}' and 'target' must contain datetime objects." ) - source_calendar = source[dim].dt.calendar target_calendar = target.dt.calendar - if ( source[dim].time.dt.year == 0 ).any() and target_calendar in _CALENDARS_WITHOUT_YEAR_ZERO: @@ -383,8 +394,8 @@ def interp_calendar(source, target, dim="time"): ) out = source.copy() - out[dim] = _datetime_to_decimal_year(source[dim], dim=dim, calendar=source_calendar) - target_idx = _datetime_to_decimal_year(target, dim=dim, calendar=target_calendar) + out[dim] = _decimal_year(source[dim]) + target_idx = _decimal_year(target) out = out.interp(**{dim: target_idx}) out[dim] = target return out diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index ae08abb06a7..d3a0fbb3dba 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -801,6 +801,11 @@ def round(self, freq): """ return self._round_via_method(freq, _round_to_nearest_half_even) + @property + def is_leap_year(self): + func = np.vectorize(cftime.is_leap_year) + return func(self.year, calendar=self.calendar) + def _parse_iso8601_without_reso(date_type, datetime_str): date, _ = _parse_iso8601_with_reso(date_type, datetime_str) diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index c5f953f15c4..e73893d0f35 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -6,10 +6,12 @@ import numpy as np import pandas as pd +from xarray.coding.calendar_ops import _decimal_year from xarray.coding.times import infer_calendar_name from xarray.core import duck_array_ops from xarray.core.common import ( _contains_datetime_like_objects, + full_like, is_np_datetime_like, is_np_timedelta_like, ) @@ -543,6 +545,33 @@ def calendar(self) -> CFCalendar: """ return infer_calendar_name(self._obj.data) + @property + def days_in_year(self) -> T_DataArray: + """Each datetime as the year plus the fraction of the year elapsed.""" + if self.calendar == "360_day": + result = full_like(self.year, 360) + else: + result = self.is_leap_year.astype(int) + 365 + newvar = Variable( + dims=self._obj.dims, + attrs=self._obj.attrs, + encoding=self._obj.encoding, + data=result, + ) + return self._obj._replace(newvar, name="days_in_year") + + @property + def decimal_year(self) -> T_DataArray: + """Convert the dates as a fractional year.""" + result = _decimal_year(self._obj) + newvar = Variable( + dims=self._obj.dims, + attrs=self._obj.attrs, + encoding=self._obj.encoding, + data=result, + ) + return self._obj._replace(newvar, name="decimal_year") + class TimedeltaAccessor(TimeAccessor[T_DataArray]): """Access Timedelta fields for DataArrays with Timedelta-like dtypes. diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index c73c5d3258f..587f43a5d7f 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -142,6 +142,17 @@ def test_strftime(self) -> None: "2000-01-01 01:00:00" == self.data.time.dt.strftime("%Y-%m-%d %H:%M:%S")[1] ) + @requires_cftime + @pytest.mark.parametrize( + "calendar,expected", + [("standard", 366), ("noleap", 365), ("360_day", 360), ("all_leap", 366)], + ) + def test_days_in_year(self, calendar, expected) -> None: + assert ( + self.data.convert_calendar(calendar, align_on="year").time.dt.days_in_year + == expected + ).all() + def test_not_datetime_type(self) -> None: nontime_data = self.data.copy() int_data = np.arange(len(self.data.time)).astype("int8") @@ -177,6 +188,7 @@ def test_not_datetime_type(self) -> None: "is_year_start", "is_year_end", "is_leap_year", + "days_in_year", ], ) def test_dask_field_access(self, field) -> None: @@ -698,3 +710,46 @@ def test_cftime_round_accessor( result = cftime_rounding_dataarray.dt.round(freq) assert_identical(result, expected) + + +@pytest.mark.parametrize( + "use_cftime", + [False, pytest.param(True, marks=requires_cftime)], + ids=lambda x: f"use_cftime={x}", +) +@pytest.mark.parametrize( + "use_dask", + [False, pytest.param(True, marks=requires_dask)], + ids=lambda x: f"use_dask={x}", +) +def test_decimal_year(use_cftime, use_dask) -> None: + year = 2000 + periods = 10 + freq = "h" + + shape = (2, 5) + dims = ["x", "y"] + hours_in_year = 24 * 366 + + times = xr.date_range(f"{year}", periods=periods, freq=freq, use_cftime=use_cftime) + + da = xr.DataArray(times.values.reshape(shape), dims=dims) + + if use_dask: + da = da.chunk({"y": 2}) + # Computing the decimal year for a cftime datetime array requires a + # number of small computes (6): + # - 4x one compute per .dt accessor call (requires inspecting one + # object-dtype array element to see if it is time-like) + # - 2x one compute per calendar inference (requires inspecting one + # array element to read off the calendar) + max_computes = 6 * use_cftime + with raise_if_dask_computes(max_computes=max_computes): + result = da.dt.decimal_year + else: + result = da.dt.decimal_year + + expected = xr.DataArray( + year + np.arange(periods).reshape(shape) / hours_in_year, dims=dims + ) + xr.testing.assert_equal(result, expected)