Skip to content

Commit

Permalink
Fix binning by unsorted array (#7762)
Browse files Browse the repository at this point in the history
* Fix binning by unsorted array

Closes #7759

* Update xarray/tests/test_groupby.py

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

---------

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
  • Loading branch information
dcherian and Illviljan authored Apr 18, 2023
1 parent 68bae08 commit 99f03c2
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 41 deletions.
24 changes: 4 additions & 20 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,17 @@ What's New
.. _whats-new.2023.05.0:
.. _whats-new.2023.04.1:

v2023.05.0 (unreleased)
v2023.04.1 (unreleased)
-----------------------

New Features
~~~~~~~~~~~~


Breaking changes
~~~~~~~~~~~~~~~~


Deprecations
~~~~~~~~~~~~

This is a patch release to fix a bug with binning (:issue:`7759`)

Bug fixes
~~~~~~~~~


Documentation
~~~~~~~~~~~~~


Internal Changes
~~~~~~~~~~~~~~~~
- Fix binning by unsorted arrays. (:issue:`7759`)


.. _whats-new.2023.04.0:
Expand Down
2 changes: 1 addition & 1 deletion xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ def _factorize_bins(
if (codes == -1).all():
raise ValueError(f"None of the data falls within bins with edges {bins!r}")
full_index = binned.categories
unique_values = binned.unique().dropna()
unique_values = np.sort(binned.unique().dropna())
group_indices = [g for g in _codes_to_groups(codes, len(full_index)) if g]

if len(group_indices) == 0:
Expand Down
55 changes: 35 additions & 20 deletions xarray/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1371,36 +1371,51 @@ def test_groupby_multidim_map(self):
)
assert_identical(expected, actual)

def test_groupby_bins(self):
array = DataArray(np.arange(4), dims="dim_0")
@pytest.mark.parametrize("use_flox", [True, False])
@pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]])
def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None:
array = DataArray(
np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a"
)
# the first value should not be part of any group ("right" binning)
array[0] = 99
# bins follow conventions for pandas.cut
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
bins = [0, 1.5, 5]
bin_coords = pd.cut(array["dim_0"], bins).categories
expected = DataArray(
[1, 5], dims="dim_0_bins", coords={"dim_0_bins": bin_coords}

df = array.to_dataframe()
df["dim_0_bins"] = pd.cut(array["dim_0"], bins)

expected_df = df.groupby("dim_0_bins").sum()
# TODO: can't convert df with IntervalIndex to Xarray

expected = (
expected_df.reset_index(drop=True)
.to_xarray()
.assign_coords(index=np.array(expected_df.index))
.rename({"index": "dim_0_bins"})["a"]
)
actual = array.groupby_bins("dim_0", bins=bins).sum()
assert_identical(expected, actual)

actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum()
assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual)
with xr.set_options(use_flox=use_flox):
actual = array.groupby_bins("dim_0", bins=bins).sum()
assert_identical(expected, actual)

actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum())
assert_identical(expected, actual)
actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum()
assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual)

# make sure original array dims are unchanged
assert len(array.dim_0) == 4
actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum())
assert_identical(expected, actual)

da = xr.DataArray(np.ones((2, 3, 4)))
bins = [-1, 0, 1, 2]
with xr.set_options(use_flox=False):
actual = da.groupby_bins("dim_0", bins).mean(...)
with xr.set_options(use_flox=True):
expected = da.groupby_bins("dim_0", bins).mean(...)
assert_allclose(actual, expected)
# make sure original array dims are unchanged
assert len(array.dim_0) == 4

da = xr.DataArray(np.ones((2, 3, 4)))
bins = [-1, 0, 1, 2]
with xr.set_options(use_flox=False):
actual = da.groupby_bins("dim_0", bins).mean(...)
with xr.set_options(use_flox=True):
expected = da.groupby_bins("dim_0", bins).mean(...)
assert_allclose(actual, expected)

def test_groupby_bins_empty(self):
array = DataArray(np.arange(4), [("x", range(4))])
Expand Down

0 comments on commit 99f03c2

Please sign in to comment.