Open
Description
Today, Dask's resample code path converts the resampling frequencies into DateOffsets before downstream processing. This is convenient, but it currently prevents using resample with Dask-cuDF, because cuDF does not support DateOffsets in the Grouper API.
To enable CPU/GPU compatibility in Dask, it would be worth exploring whether Dask actually needs to convert resampling frequencies into DateOffsets. If it does not, we may be able to identify an alternative that is compatible with cuDF's existing resample functionality. The examples below show the current cuDF behavior when a DateOffset is passed to `resample` and to `Grouper`:
import cudf
index = cudf.date_range(start="2001-01-01", periods=10, freq="1T")
sr = cudf.Series(range(10), index=index)
sr.resample(cudf.DateOffset(minutes=10)).sum()
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
Input In [83], in <module>
3 index = cudf.date_range(start="2001-01-01", periods=10, freq="1T")
4 sr = cudf.Series(range(10), index=index)
----> 6 sr.resample(cudf.DateOffset(minutes=10)).sum()
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/indexed_frame.py:1238, in IndexedFrame.resample(self, rule, axis, closed, label, convention, kind, loffset, base, on, level, origin, offset)
1223 raise NotImplementedError(
1224 "The following arguments are not "
1225 "currently supported by resample:\n\n"
(...)
1232 "- offset"
1233 )
1234 by = cudf.Grouper(
1235 key=on, freq=rule, closed=closed, label=label, level=level
1236 )
1237 return (
-> 1238 cudf.core.resample.SeriesResampler(self, by=by)
1239 if isinstance(self, cudf.Series)
1240 else cudf.core.resample.DataFrameResampler(self, by=by)
1241 )
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/resample.py:37, in _Resampler.__init__(self, obj, by, axis, kind)
36 def __init__(self, obj, by, axis=None, kind=None):
---> 37 by = _ResampleGrouping(obj, by)
38 super().__init__(obj, by=by)
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1488, in _Grouping.__init__(self, obj, by, level)
1485 # Need to keep track of named key columns
1486 # to support `as_index=False` correctly
1487 self._named_columns = []
-> 1488 self._handle_by_or_level(by, level)
1490 if len(obj) and not len(self._key_columns):
1491 raise ValueError("No group keys passed")
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1513, in _Grouping._handle_by_or_level(self, by, level)
1511 self._handle_mapping(by)
1512 elif isinstance(by, Grouper):
-> 1513 self._handle_grouper(by)
1514 else:
1515 try:
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1577, in _Grouping._handle_grouper(self, by)
1575 def _handle_grouper(self, by):
1576 if by.freq:
-> 1577 self._handle_frequency_grouper(by)
1578 elif by.key:
1579 self._handle_label(by.key)
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/resample.py:107, in _ResampleGrouping._handle_frequency_grouper(self, by)
104 closed = by.closed
106 if isinstance(freq, (cudf.DateOffset, pd.DateOffset)):
--> 107 raise NotImplementedError(
108 "Resampling by DateOffset objects is not yet supported."
109 )
110 if not isinstance(freq, str):
111 raise TypeError(
112 f"Unsupported type for freq: {type(freq).__name__}"
113 )
NotImplementedError: Resampling by DateOffset objects is not yet supported.
sr.groupby(cudf.Grouper(cudf.DateOffset(minutes=10))).mean()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [94], in <module>
----> 1 sr.groupby(cudf.Grouper(cudf.DateOffset(minutes=10))).mean()
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/series.py:3265, in Series.groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, dropna)
3257 if by is None and level is None:
3258 raise TypeError(
3259 "groupby() requires either by or level to be specified."
3260 )
3262 return (
3263 cudf.core.resample.SeriesResampler(self, by=by)
3264 if isinstance(by, cudf.Grouper) and by.freq
-> 3265 else SeriesGroupBy(
3266 self, by=by, level=level, dropna=dropna, sort=sort
3267 )
3268 )
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:82, in GroupBy.__init__(self, obj, by, level, sort, as_index, dropna)
80 self.grouping = by
81 else:
---> 82 self.grouping = _Grouping(obj, by, level)
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1488, in _Grouping.__init__(self, obj, by, level)
1485 # Need to keep track of named key columns
1486 # to support `as_index=False` correctly
1487 self._named_columns = []
-> 1488 self._handle_by_or_level(by, level)
1490 if len(obj) and not len(self._key_columns):
1491 raise ValueError("No group keys passed")
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1513, in _Grouping._handle_by_or_level(self, by, level)
1511 self._handle_mapping(by)
1512 elif isinstance(by, Grouper):
-> 1513 self._handle_grouper(by)
1514 else:
1515 try:
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1579, in _Grouping._handle_grouper(self, by)
1577 self._handle_frequency_grouper(by)
1578 elif by.key:
-> 1579 self._handle_label(by.key)
1580 else:
1581 self._handle_level(by.level)
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/groupby/groupby.py:1571, in _Grouping._handle_label(self, by)
1570 def _handle_label(self, by):
-> 1571 self._key_columns.append(self._obj._data[by])
1572 self.names.append(by)
1573 self._named_columns.append(by)
File ~/conda/envs/rapids-22.04/lib/python3.8/site-packages/cudf/core/column_accessor.py:156, in ColumnAccessor.__getitem__(self, key)
155 def __getitem__(self, key: Any) -> ColumnBase:
--> 156 return self._data[key]
KeyError: <DateOffset: minutes=10>
Env:
conda list | grep "rapids\|dask"
# packages in environment at /home/nicholasb/conda/envs/rapids-22.04:
cucim 22.04.00a220201 cuda_11_py38_g0861858_17 rapidsai-nightly
cudf 22.04.00a220201 cuda_11_py38_g2c6b0dac61_95 rapidsai-nightly
cudf_kafka 22.04.00a220201 py38_g2c6b0dac61_95 rapidsai-nightly
cugraph 22.04.00a220201 cuda11_py38_g2b950598_32 rapidsai-nightly
cuml 22.04.00a220201 cuda11_py38_ga70044cf2_39 rapidsai-nightly
cusignal 22.04.00a220201 py39_gc620d82_7 rapidsai-nightly
cuspatial 22.04.00a220201 py38_ge00d63f_9 rapidsai-nightly
custreamz 22.04.00a220201 py38_g2c6b0dac61_95 rapidsai-nightly
cuxfilter 22.04.00a220201 py38_g17de7c0_7 rapidsai-nightly
dask 2022.1.0 pyhd8ed1ab_0 conda-forge
dask-core 2022.1.0 pyhd8ed1ab_0 conda-forge
dask-cuda 22.04.00a220201 py38_8 rapidsai-nightly
dask-cudf 22.04.00a220201 cuda_11_py38_g2c6b0dac61_95 rapidsai-nightly
libcucim 22.04.00a220201 cuda11_g0861858_17 rapidsai-nightly
libcudf 22.04.00a220201 cuda11_g2c6b0dac61_95 rapidsai-nightly
libcudf_kafka 22.04.00a220201 g2c6b0dac61_95 rapidsai-nightly
libcugraph 22.04.00a220201 cuda11_g2b950598_32 rapidsai-nightly
libcugraph_etl 22.04.00a220201 cuda11_g2b950598_32 rapidsai-nightly
libcuml 22.04.00a220201 cuda11_ga70044cf2_39 rapidsai-nightly
libcumlprims 22.04.00a220121 cuda11_g130a9d4_8 rapidsai-nightly
libcuspatial 22.04.00a220201 cuda11_ge00d63f_9 rapidsai-nightly
librmm 22.04.00a220201 cuda11_g81d523a_15 rapidsai-nightly
libxgboost 1.5.0dev.rapidsai22.04 cuda11.2_0 rapidsai-nightly
ptxcompiler 0.2.0 py38h98f4b32_0 rapidsai-nightly
py-xgboost 1.5.0dev.rapidsai22.04 cuda11.2py38_0 rapidsai-nightly
pylibcugraph 22.04.00a220201 cuda11_py38_g2b950598_32 rapidsai-nightly
rapids 22.04.00a220201 cuda11_py38_g1db237c_48 rapidsai-nightly
rapids-xgboost 22.04.00a220201 cuda11_py38_g1db237c_48 rapidsai-nightly
rmm 22.04.00a220201 cuda11_py38_g81d523a_15_has_cma rapidsai-nightly
ucx 1.12.0+gd367332 cuda11.2_0 rapidsai-nightly
ucx-proc 1.0.0 gpu rapidsai-nightly
ucx-py 0.25.00a220201 py38_gd367332_4 rapidsai-nightly
xgboost 1.5.0dev.rapidsai22.04 cuda11.2py38_0 rapidsai-nightly
Metadata
Metadata
Assignees
Type
Projects
Status
Todo