diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 102a64af433..f30aed63df4 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -43,6 +43,8 @@ Breaking changes Deprecations ~~~~~~~~~~~~ +- The `squeeze` kwarg to GroupBy is now deprecated. (:issue:`2157`) + By `Deepak Cherian `_. - As part of an effort to standardize the API, we're renaming the ``dims`` keyword arg to ``dim`` for the minority of functions which current use diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 1d7e82d3044..dfbc1317919 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6620,7 +6620,7 @@ def interp_calendar( def groupby( self, group: Hashable | DataArray | IndexVariable, - squeeze: bool = True, + squeeze: bool | None = None, restore_coord_dims: bool = False, ) -> DataArrayGroupBy: """Returns a DataArrayGroupBy object for performing grouped operations. diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 8c81d3e6a96..3ac47e1c4d6 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -37,6 +37,7 @@ from xarray.core.types import Dims, QuantileMethods, T_DataArray, T_Xarray from xarray.core.utils import ( either_dict_or_kwargs, + emit_user_level_warning, hashable, is_scalar, maybe_wrap_array, @@ -73,6 +74,21 @@ def check_reduce_dims(reduce_dims, dimensions): ) +def _maybe_squeeze_indices( + indices, squeeze: bool | None, grouper: ResolvedGrouper, warn: bool +): + if squeeze in [None, True] and grouper.can_squeeze: + if squeeze is None and warn: + emit_user_level_warning( + "The `squeeze` kwarg to GroupBy is being removed." + "Pass .groupby(..., squeeze=False) to silence this warning." + ) + if isinstance(indices, slice): + assert indices.stop - indices.start == 1 + indices = indices.start + return indices + + def unique_value_groups( ar, sort: bool = True ) -> tuple[np.ndarray | pd.Index, T_GroupIndices, np.ndarray]: @@ -366,10 +382,10 @@ def dims(self): return self.group1d.dims @abstractmethod - def _factorize(self, squeeze: bool) -> T_FactorizeOut: + def factorize(self) -> T_FactorizeOut: raise NotImplementedError - def factorize(self, squeeze: bool) -> None: + def _factorize(self) -> None: # This design makes it clear to mypy that # codes, group_indices, unique_coord, and full_index # are set by the factorize method on the derived class. @@ -378,7 +394,7 @@ def factorize(self, squeeze: bool) -> None: self.group_indices, self.unique_coord, self.full_index, - ) = self._factorize(squeeze) + ) = self.factorize() @property def is_unique_and_monotonic(self) -> bool: @@ -393,15 +409,19 @@ def group_as_index(self) -> pd.Index: self._group_as_index = self.group1d.to_index() return self._group_as_index + @property + def can_squeeze(self): + is_dimension = self.group.dims == (self.group.name,) + return is_dimension and self.is_unique_and_monotonic + @dataclass class ResolvedUniqueGrouper(ResolvedGrouper): grouper: UniqueGrouper - def _factorize(self, squeeze) -> T_FactorizeOut: - is_dimension = self.group.dims == (self.group.name,) - if is_dimension and self.is_unique_and_monotonic: - return self._factorize_dummy(squeeze) + def factorize(self) -> T_FactorizeOut: + if self.can_squeeze: + return self._factorize_dummy() else: return self._factorize_unique() @@ -424,15 +444,12 @@ def _factorize_unique(self) -> T_FactorizeOut: return codes, group_indices, unique_coord, full_index - def _factorize_dummy(self, squeeze) -> T_FactorizeOut: + def _factorize_dummy(self) -> T_FactorizeOut: size = self.group.size # no need to factorize - if not squeeze: - # use slices to do views instead of fancy indexing - # equivalent to: group_indices = group_indices.reshape(-1, 1) - group_indices: T_GroupIndices = [slice(i, i + 1) for i in range(size)] - else: - group_indices = list(range(size)) + # use slices to do views instead of fancy indexing + # equivalent to: group_indices = group_indices.reshape(-1, 1) + group_indices: T_GroupIndices = [slice(i, i + 1) for i in range(size)] size_range = np.arange(size) if isinstance(self.group, _DummyGroup): codes = self.group.to_dataarray().copy(data=size_range) @@ -448,7 +465,7 @@ def _factorize_dummy(self, squeeze) -> T_FactorizeOut: class ResolvedBinGrouper(ResolvedGrouper): grouper: BinGrouper - def _factorize(self, squeeze: bool) -> T_FactorizeOut: + def factorize(self) -> T_FactorizeOut: from xarray.core.dataarray import DataArray data = self.group1d.values @@ -546,7 +563,7 @@ def first_items(self) -> tuple[pd.Series, np.ndarray]: _apply_loffset(self.grouper.loffset, first_items) return first_items, codes - def _factorize(self, squeeze: bool) -> T_FactorizeOut: + def factorize(self) -> T_FactorizeOut: full_index, first_items, codes_ = self._get_index_and_items() sbins = first_items.values.astype(np.int64) group_indices: T_GroupIndices = [ @@ -591,14 +608,14 @@ class TimeResampleGrouper(Grouper): loffset: datetime.timedelta | str | None -def _validate_groupby_squeeze(squeeze: bool) -> None: +def _validate_groupby_squeeze(squeeze: bool | None) -> None: # While we don't generally check the type of every arg, passing # multiple dimensions as multiple arguments is common enough, and the # consequences hidden enough (strings evaluate as true) to warrant # checking here. # A future version could make squeeze kwarg only, but would face # backward-compat issues. - if not isinstance(squeeze, bool): + if squeeze is not None and not isinstance(squeeze, bool): raise TypeError(f"`squeeze` must be True or False, but {squeeze} was supplied") @@ -730,7 +747,7 @@ def __init__( self._original_obj = obj for grouper_ in self.groupers: - grouper_.factorize(squeeze) + grouper_._factorize() (grouper,) = self.groupers self._original_group = grouper.group @@ -762,9 +779,14 @@ def sizes(self) -> Mapping[Hashable, int]: Dataset.sizes """ if self._sizes is None: - self._sizes = self._obj.isel( - {self._group_dim: self._group_indices[0]} - ).sizes + (grouper,) = self.groupers + index = _maybe_squeeze_indices( + self._group_indices[0], + self._squeeze, + grouper, + warn=True, + ) + self._sizes = self._obj.isel({self._group_dim: index}).sizes return self._sizes @@ -798,14 +820,22 @@ def groups(self) -> dict[GroupKey, GroupIndex]: # provided to mimic pandas.groupby if self._groups is None: (grouper,) = self.groupers - self._groups = dict(zip(grouper.unique_coord.values, self._group_indices)) + squeezed_indices = ( + _maybe_squeeze_indices(ind, self._squeeze, grouper, warn=idx > 0) + for idx, ind in enumerate(self._group_indices) + ) + self._groups = dict(zip(grouper.unique_coord.values, squeezed_indices)) return self._groups def __getitem__(self, key: GroupKey) -> T_Xarray: """ Get DataArray or Dataset corresponding to a particular group label. """ - return self._obj.isel({self._group_dim: self.groups[key]}) + (grouper,) = self.groupers + index = _maybe_squeeze_indices( + self.groups[key], self._squeeze, grouper, warn=True + ) + return self._obj.isel({self._group_dim: index}) def __len__(self) -> int: (grouper,) = self.groupers @@ -826,7 +856,11 @@ def __repr__(self) -> str: def _iter_grouped(self) -> Iterator[T_Xarray]: """Iterate over each element in this group""" - for indices in self._group_indices: + (grouper,) = self.groupers + for idx, indices in enumerate(self._group_indices): + indices = _maybe_squeeze_indices( + indices, self._squeeze, grouper, warn=idx > 0 + ) yield self._obj.isel({self._group_dim: indices}) def _infer_concat_args(self, applied_example): @@ -1309,7 +1343,11 @@ class DataArrayGroupByBase(GroupBy["DataArray"], DataArrayGroupbyArithmetic): @property def dims(self) -> tuple[Hashable, ...]: if self._dims is None: - self._dims = self._obj.isel({self._group_dim: self._group_indices[0]}).dims + (grouper,) = self.groupers + index = _maybe_squeeze_indices( + self._group_indices[0], self._squeeze, grouper, warn=True + ) + self._dims = self._obj.isel({self._group_dim: index}).dims return self._dims @@ -1318,7 +1356,11 @@ def _iter_grouped_shortcut(self): metadata """ var = self._obj.variable - for indices in self._group_indices: + (grouper,) = self.groupers + for idx, indices in enumerate(self._group_indices): + indices = _maybe_squeeze_indices( + indices, self._squeeze, grouper, warn=idx > 0 + ) yield var[{self._group_dim: indices}] def _concat_shortcut(self, applied, dim, positions=None): @@ -1517,7 +1559,14 @@ class DatasetGroupByBase(GroupBy["Dataset"], DatasetGroupbyArithmetic): @property def dims(self) -> Frozen[Hashable, int]: if self._dims is None: - self._dims = self._obj.isel({self._group_dim: self._group_indices[0]}).dims + (grouper,) = self.groupers + index = _maybe_squeeze_indices( + self._group_indices[0], + self._squeeze, + grouper, + warn=True, + ) + self._dims = self._obj.isel({self._group_dim: index}).dims return self._dims diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index b166992deb1..97e259988de 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -63,8 +63,11 @@ def test_groupby_dims_property(dataset) -> None: assert dataset.groupby("x").dims == dataset.isel(x=1).dims assert dataset.groupby("y").dims == dataset.isel(y=1).dims + assert dataset.groupby("x", squeeze=False).dims == dataset.isel(x=slice(1, 2)).dims + assert dataset.groupby("y", squeeze=False).dims == dataset.isel(y=slice(1, 2)).dims + stacked = dataset.stack({"xy": ("x", "y")}) - assert stacked.groupby("xy").dims == stacked.isel(xy=0).dims + assert stacked.groupby("xy", squeeze=False).dims == stacked.isel(xy=[0]).dims def test_multi_index_groupby_map(dataset) -> None: @@ -189,7 +192,7 @@ def func(arg1, arg2, arg3=0): array = xr.DataArray([1, 1, 1], [("x", [1, 2, 3])]) expected = xr.DataArray([3, 3, 3], [("x", [1, 2, 3])]) - actual = array.groupby("x").map(func, args=(1,), arg3=1) + actual = array.groupby("x", squeeze=False).map(func, args=(1,), arg3=1) assert_identical(expected, actual) @@ -468,8 +471,8 @@ def test_da_groupby_assign_coords() -> None: actual = xr.DataArray( [[3, 4, 5], [6, 7, 8]], dims=["y", "x"], coords={"y": range(2), "x": range(3)} ) - actual1 = actual.groupby("x").assign_coords({"y": [-1, -2]}) - actual2 = actual.groupby("x").assign_coords(y=[-1, -2]) + actual1 = actual.groupby("x", squeeze=False).assign_coords({"y": [-1, -2]}) + actual2 = actual.groupby("x", squeeze=False).assign_coords(y=[-1, -2]) expected = xr.DataArray( [[3, 4, 5], [6, 7, 8]], dims=["y", "x"], coords={"y": [-1, -2], "x": range(3)} ) @@ -617,15 +620,14 @@ def test_groupby_grouping_errors() -> None: def test_groupby_reduce_dimension_error(array) -> None: grouped = array.groupby("y") - with pytest.raises(ValueError, match=r"cannot reduce over dimensions"): - grouped.mean() - with pytest.raises(ValueError, match=r"cannot reduce over dimensions"): grouped.mean("huh") with pytest.raises(ValueError, match=r"cannot reduce over dimensions"): grouped.mean(("x", "y", "asd")) + assert_identical(array, grouped.mean()) + grouped = array.groupby("y", squeeze=False) assert_identical(array, grouped.mean()) @@ -667,13 +669,17 @@ def test_groupby_none_group_name() -> None: def test_groupby_getitem(dataset) -> None: - assert_identical(dataset.sel(x="a"), dataset.groupby("x")["a"]) - assert_identical(dataset.sel(z=1), dataset.groupby("z")[1]) + assert_identical(dataset.sel(x=["a"]), dataset.groupby("x", squeeze=False)["a"]) + assert_identical(dataset.sel(z=[1]), dataset.groupby("z", squeeze=False)[1]) - assert_identical(dataset.foo.sel(x="a"), dataset.foo.groupby("x")["a"]) - assert_identical(dataset.foo.sel(z=1), dataset.foo.groupby("z")[1]) + assert_identical( + dataset.foo.sel(x=["a"]), dataset.foo.groupby("x", squeeze=False)["a"] + ) + assert_identical(dataset.foo.sel(z=[1]), dataset.foo.groupby("z", squeeze=False)[1]) - actual = dataset.groupby("boo")["f"].unstack().transpose("x", "y", "z") + actual = ( + dataset.groupby("boo", squeeze=False)["f"].unstack().transpose("x", "y", "z") + ) expected = dataset.sel(y=[1], z=[1, 2]).transpose("x", "y", "z") assert_identical(expected, actual) @@ -683,14 +689,14 @@ def test_groupby_dataset() -> None: {"z": (["x", "y"], np.random.randn(3, 5))}, {"x": ("x", list("abc")), "c": ("x", [0, 1, 0]), "y": range(5)}, ) - groupby = data.groupby("x") + groupby = data.groupby("x", squeeze=False) assert len(groupby) == 3 - expected_groups = {"a": 0, "b": 1, "c": 2} + expected_groups = {"a": slice(0, 1), "b": slice(1, 2), "c": slice(2, 3)} assert groupby.groups == expected_groups expected_items = [ - ("a", data.isel(x=0)), - ("b", data.isel(x=1)), - ("c", data.isel(x=2)), + ("a", data.isel(x=[0])), + ("b", data.isel(x=[1])), + ("c", data.isel(x=[2])), ] for actual1, expected1 in zip(groupby, expected_items): assert actual1[0] == expected1[0] @@ -707,22 +713,22 @@ def identity(x): def test_groupby_dataset_returns_new_type() -> None: data = Dataset({"z": (["x", "y"], np.random.randn(3, 5))}) - actual1 = data.groupby("x").map(lambda ds: ds["z"]) + actual1 = data.groupby("x", squeeze=False).map(lambda ds: ds["z"]) expected1 = data["z"] assert_identical(expected1, actual1) - actual2 = data["z"].groupby("x").map(lambda x: x.to_dataset()) + actual2 = data["z"].groupby("x", squeeze=False).map(lambda x: x.to_dataset()) expected2 = data assert_identical(expected2, actual2) def test_groupby_dataset_iter() -> None: data = create_test_data() - for n, (t, sub) in enumerate(list(data.groupby("dim1"))[:3]): + for n, (t, sub) in enumerate(list(data.groupby("dim1", squeeze=False))[:3]): assert data["dim1"][n] == t - assert_equal(data["var1"][n], sub["var1"]) - assert_equal(data["var2"][n], sub["var2"]) - assert_equal(data["var3"][:, n], sub["var3"]) + assert_equal(data["var1"][[n]], sub["var1"]) + assert_equal(data["var2"][[n]], sub["var2"]) + assert_equal(data["var3"][:, [n]], sub["var3"]) def test_groupby_dataset_errors() -> None: @@ -1093,25 +1099,25 @@ def test_stack_groupby_unsorted_coord(self): y_vals = [2, 3] arr = xr.DataArray(data, dims=dims, coords={"y": y_vals}) - actual1 = arr.stack(z=dims).groupby("z").first() + actual1 = arr.stack(z=dims).groupby("z", squeeze=False).first() midx1 = pd.MultiIndex.from_product([[0, 1], [2, 3]], names=dims) expected1 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx1}) assert_equal(actual1, expected1) # GH: 3287. Note that y coord values are not in sorted order. arr = xr.DataArray(data, dims=dims, coords={"y": y_vals[::-1]}) - actual2 = arr.stack(z=dims).groupby("z").first() + actual2 = arr.stack(z=dims).groupby("z", squeeze=False).first() midx2 = pd.MultiIndex.from_product([[0, 1], [3, 2]], names=dims) expected2 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx2}) assert_equal(actual2, expected2) def test_groupby_iter(self): for (act_x, act_dv), (exp_x, exp_ds) in zip( - self.dv.groupby("y"), self.ds.groupby("y") + self.dv.groupby("y", squeeze=False), self.ds.groupby("y", squeeze=False) ): assert exp_x == act_x assert_identical(exp_ds["foo"], act_dv) - for (_, exp_dv), act_dv in zip(self.dv.groupby("x"), self.dv): + for (_, exp_dv), (_, act_dv) in zip(self.dv.groupby("x"), self.dv.groupby("x")): assert_identical(exp_dv, act_dv) def test_groupby_properties(self): @@ -1369,7 +1375,7 @@ def test_groupby_restore_dim_order(self): ("a", ("a", "y")), ("b", ("x", "b")), ]: - result = array.groupby(by).map(lambda x: x.squeeze()) + result = array.groupby(by, squeeze=False).map(lambda x: x.squeeze()) assert result.dims == expected_dims def test_groupby_restore_coord_dims(self): @@ -1389,7 +1395,7 @@ def test_groupby_restore_coord_dims(self): ("a", ("a", "y")), ("b", ("x", "b")), ]: - result = array.groupby(by, restore_coord_dims=True).map( + result = array.groupby(by, squeeze=False, restore_coord_dims=True).map( lambda x: x.squeeze() )["c"] assert result.dims == expected_dims @@ -1411,7 +1417,7 @@ def test_groupby_first_and_last(self): actual = array.groupby(by).first() assert_identical(expected, actual) - actual = array.groupby("x").first() + actual = array.groupby("x", squeeze=False).first() expected = array # should be a no-op assert_identical(expected, actual)