From 99f03c20a64a51ffd1f8bb6daee5bef8d6a9f246 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 18 Apr 2023 08:48:22 -0600 Subject: [PATCH] Fix binning by unsorted array (#7762) * Fix binning by unsorted array Closes #7759 * Update xarray/tests/test_groupby.py Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --------- Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> --- doc/whats-new.rst | 24 +++------------- xarray/core/groupby.py | 2 +- xarray/tests/test_groupby.py | 55 +++++++++++++++++++++++------------- 3 files changed, 40 insertions(+), 41 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1f3202370b4..8971015d404 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -16,33 +16,17 @@ What's New -.. _whats-new.2023.05.0: +.. _whats-new.2023.04.1: -v2023.05.0 (unreleased) +v2023.04.1 (unreleased) ----------------------- -New Features -~~~~~~~~~~~~ - - -Breaking changes -~~~~~~~~~~~~~~~~ - - -Deprecations -~~~~~~~~~~~~ - +This is a patch release to fix a bug with binning (:issue:`7759`) Bug fixes ~~~~~~~~~ - -Documentation -~~~~~~~~~~~~~ - - -Internal Changes -~~~~~~~~~~~~~~~~ +- Fix binning by unsorted arrays. (:issue:`7759`) .. _whats-new.2023.04.0: diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 8df1e5fedec..b786fa60af9 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -338,7 +338,7 @@ def _factorize_bins( if (codes == -1).all(): raise ValueError(f"None of the data falls within bins with edges {bins!r}") full_index = binned.categories - unique_values = binned.unique().dropna() + unique_values = np.sort(binned.unique().dropna()) group_indices = [g for g in _codes_to_groups(codes, len(full_index)) if g] if len(group_indices) == 0: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 52c1af97fbf..13e26954950 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1371,36 +1371,51 @@ def test_groupby_multidim_map(self): ) assert_identical(expected, actual) - def test_groupby_bins(self): - array = DataArray(np.arange(4), dims="dim_0") + @pytest.mark.parametrize("use_flox", [True, False]) + @pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]]) + def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None: + array = DataArray( + np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a" + ) # the first value should not be part of any group ("right" binning) array[0] = 99 # bins follow conventions for pandas.cut # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html bins = [0, 1.5, 5] - bin_coords = pd.cut(array["dim_0"], bins).categories - expected = DataArray( - [1, 5], dims="dim_0_bins", coords={"dim_0_bins": bin_coords} + + df = array.to_dataframe() + df["dim_0_bins"] = pd.cut(array["dim_0"], bins) + + expected_df = df.groupby("dim_0_bins").sum() + # TODO: can't convert df with IntervalIndex to Xarray + + expected = ( + expected_df.reset_index(drop=True) + .to_xarray() + .assign_coords(index=np.array(expected_df.index)) + .rename({"index": "dim_0_bins"})["a"] ) - actual = array.groupby_bins("dim_0", bins=bins).sum() - assert_identical(expected, actual) - actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum() - assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual) + with xr.set_options(use_flox=use_flox): + actual = array.groupby_bins("dim_0", bins=bins).sum() + assert_identical(expected, actual) - actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum()) - assert_identical(expected, actual) + actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum() + assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual) - # make sure original array dims are unchanged - assert len(array.dim_0) == 4 + actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum()) + assert_identical(expected, actual) - da = xr.DataArray(np.ones((2, 3, 4))) - bins = [-1, 0, 1, 2] - with xr.set_options(use_flox=False): - actual = da.groupby_bins("dim_0", bins).mean(...) - with xr.set_options(use_flox=True): - expected = da.groupby_bins("dim_0", bins).mean(...) - assert_allclose(actual, expected) + # make sure original array dims are unchanged + assert len(array.dim_0) == 4 + + da = xr.DataArray(np.ones((2, 3, 4))) + bins = [-1, 0, 1, 2] + with xr.set_options(use_flox=False): + actual = da.groupby_bins("dim_0", bins).mean(...) + with xr.set_options(use_flox=True): + expected = da.groupby_bins("dim_0", bins).mean(...) + assert_allclose(actual, expected) def test_groupby_bins_empty(self): array = DataArray(np.arange(4), [("x", range(4))])