diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index a4d03e3ce73..77c00bff55d 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -6749,6 +6749,11 @@ def groupby(
         restore_coord_dims : bool, default: False
             If True, also restore the dimension order of multi-dimensional
             coordinates.
+        eagerly_compute_group : bool
+            Whether to eagerly compute ``group`` when it is a chunked array.
+            This option is provided for backwards compatibility. Set to False
+            to opt in to the future behaviour, where ``group`` is not
+            automatically loaded into memory.
         **groupers : Mapping of str to Grouper or Resampler
             Mapping of variable name to group by to :py:class:`Grouper` or :py:class:`Resampler` object.
             One of ``group`` or ``groupers`` must be provided.
@@ -6917,6 +6922,11 @@ def groupby_bins(
             coordinates.
         duplicates : {"raise", "drop"}, default: "raise"
             If bin edges are not unique, raise ValueError or drop non-uniques.
+        eagerly_compute_group : bool
+            Whether to eagerly compute ``group`` when it is a chunked array.
+            This option is provided for backwards compatibility. Set to False
+            to opt in to the future behaviour, where ``group`` is not
+            automatically loaded into memory.
 
         Returns
         -------
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 5cdeb8049d7..415bf350e7d 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -10382,6 +10382,11 @@ def groupby(
         restore_coord_dims : bool, default: False
             If True, also restore the dimension order of multi-dimensional
             coordinates.
+        eagerly_compute_group : bool
+            Whether to eagerly compute ``group`` when it is a chunked array.
+            This option is provided for backwards compatibility. Set to False
+            to opt in to the future behaviour, where ``group`` is not
+            automatically loaded into memory.
         **groupers : Mapping of str to Grouper or Resampler
             Mapping of variable name to group by to :py:class:`Grouper` or :py:class:`Resampler` object.
             One of ``group`` or ``groupers`` must be provided.
@@ -10519,6 +10524,11 @@ def groupby_bins(
             coordinates.
         duplicates : {"raise", "drop"}, default: "raise"
            If bin edges are not unique, raise ValueError or drop non-uniques.
+        eagerly_compute_group : bool
+            Whether to eagerly compute ``group`` when it is a chunked array.
+            This option is provided for backwards compatibility. Set to False
+            to opt in to the future behaviour, where ``group`` is not
+            automatically loaded into memory.
 
         Returns
         -------
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index e63243eae70..68362ccd187 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -308,21 +308,45 @@ def __post_init__(self) -> None:
         # of pd.cut
         # We do not want to modify the original object, since the same grouper
         # might be used multiple times.
+        from xarray.groupers import BinGrouper, UniqueGrouper
+
         self.grouper = copy.deepcopy(self.grouper)
         self.group = _resolve_group(self.obj, self.group)
-        if (
-            self.eagerly_compute_group
-            and not isinstance(self.group, _DummyGroup)
-            and is_chunked_array(self.group.variable._data)
+        if not isinstance(self.group, _DummyGroup) and is_chunked_array(
+            self.group.variable._data
         ):
-            emit_user_level_warning(
-                f"Eagerly computing the DataArray you're grouping by ({self.group.name!r}) "
-                "is deprecated and will be removed in v2025.05.0. "
-                "Please load this array's data manually using `.compute` or `.load`.",
-                DeprecationWarning,
-            )
+            if self.eagerly_compute_group is False:
+                # This requires a pass to discover the groups present
+                if (
+                    isinstance(self.grouper, UniqueGrouper)
+                    and self.grouper.labels is None
+                ):
+                    raise ValueError(
+                        "Please pass `labels` to UniqueGrouper when grouping by a chunked array."
+                    )
+                # This requires a pass to compute the bin edges
+                if isinstance(self.grouper, BinGrouper) and isinstance(
+                    self.grouper.bins, int
+                ):
+                    raise ValueError(
+                        "Please pass explicit bin edges to BinGrouper using the ``bins`` kwarg "
+                        "when grouping by a chunked array."
+                    )
+
+            if self.eagerly_compute_group:
+                emit_user_level_warning(
+                    f"Eagerly computing the DataArray you're grouping by ({self.group.name!r}) "
+                    "is deprecated and will raise an error in v2025.05.0. "
+                    "Please load this array's data manually using `.compute` or `.load`. "
+                    "To intentionally avoid eager loading, either (1) specify "
+                    f"`.groupby({self.group.name}=UniqueGrouper(labels=...), eagerly_compute_group=False)` "
+                    f"or (2) pass explicit bin edges using `.groupby({self.group.name}=BinGrouper(bins=...), "
+                    "eagerly_compute_group=False)`, as appropriate.",
+                    DeprecationWarning,
+                )
+                self.group = self.group.compute()
 
         self.encoded = self.grouper.factorize(self.group)
 
@@ -678,8 +702,9 @@ def _raise_if_by_is_chunked(self):
         if self._by_chunked:
             raise ValueError(
                 "This method is not supported when lazily grouping by a chunked array. "
-                "Either load the array in to memory prior to grouping, or explore another "
-                "way of applying your function, potentially using the `flox` package."
+                "Either load the array into memory prior to grouping using `.load` or `.compute`, "
+                "or explore another way of applying your function, "
+                "potentially using the `flox` package."
             )
 
     def _raise_if_not_single_group(self):
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index edf0f9df176..b69682c18c7 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -3091,6 +3091,41 @@ def test_groupby_multiple_bin_grouper_missing_groups():
     assert_identical(actual, expected)
 
 
+@requires_dask
+def test_groupby_dask_eager_load_warnings():
+    ds = xr.Dataset(
+        {"foo": (("z"), np.arange(12))},
+        coords={"x": ("z", np.arange(12)), "y": ("z", np.arange(12))},
+    ).chunk(z=6)
+
+    with pytest.warns(DeprecationWarning):
+        ds.groupby(x=UniqueGrouper())
+
+    with pytest.warns(DeprecationWarning):
+        ds.groupby("x")
+
+    with pytest.warns(DeprecationWarning):
+        ds.groupby(ds.x)
+
+    with pytest.raises(ValueError, match="Please pass"):
+        ds.groupby("x", eagerly_compute_group=False)
+
+    # This is technically fine, but anyone iterating over the groupby object
+    # will see an error, so let's warn and have them opt in.
+    with pytest.warns(DeprecationWarning):
+        ds.groupby(x=UniqueGrouper(labels=[1, 2, 3]))
+
+    ds.groupby(x=UniqueGrouper(labels=[1, 2, 3]), eagerly_compute_group=False)
+
+    with pytest.warns(DeprecationWarning):
+        ds.groupby_bins("x", bins=3)
+    with pytest.raises(ValueError, match="Please pass"):
+        ds.groupby_bins("x", bins=3, eagerly_compute_group=False)
+    with pytest.warns(DeprecationWarning):
+        ds.groupby_bins("x", bins=[1, 2, 3])
+    ds.groupby_bins("x", bins=[1, 2, 3], eagerly_compute_group=False)
+
+
 # Possible property tests
 # 1. lambda x: x
 # 2. grouped-reduce on unique coords is identical to array