Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add groupby.dims & Fix groupby reduce for DataArray #3338

Merged
merged 28 commits into from
Oct 10, 2019
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a6cb3f3
Fix groupby reduce for DataArray
dcherian Sep 24, 2019
f320e94
bugfix.
dcherian Sep 24, 2019
c4ea742
another bugfix.
dcherian Sep 24, 2019
f6d396a
bugfix unique_and_monotonic for object indexes (uniqueness is enough)
dcherian Oct 4, 2019
b202ad5
Add groupby.dims property.
dcherian Oct 4, 2019
75a52be
update reduce docstring to point to xarray.ALL_DIMS
dcherian Oct 4, 2019
c017481
test for object index dims.
dcherian Oct 4, 2019
d1f47d5
test reduce dimensions error.
dcherian Oct 4, 2019
f2093e2
Add whats-new
dcherian Oct 4, 2019
0baa390
fix docs build
dcherian Oct 4, 2019
33715a2
sq whats-new
dcherian Oct 4, 2019
4b13b5f
one more test.
dcherian Oct 4, 2019
8f257b1
fix test.
dcherian Oct 7, 2019
94a8688
undo monotonic change.
dcherian Oct 8, 2019
7c7c119
Add dimensions to repr.
dcherian Oct 8, 2019
88de8ae
Raise error if no bins.
dcherian Oct 8, 2019
2ee3bc2
Raise nice error if no groups were formed.
dcherian Oct 8, 2019
436b6e3
Some more error raising and testing.
dcherian Oct 8, 2019
de7cc7c
Add dataset tests.
dcherian Oct 8, 2019
5269e81
update whats-new.
dcherian Oct 8, 2019
bd5b5fd
fix tests.
dcherian Oct 8, 2019
0c69b77
Merge branch 'master' into fix/groupby
dcherian Oct 8, 2019
2cb7420
make dims a cached lazy property.
dcherian Oct 9, 2019
b8bc3d2
fix whats-new.
dcherian Oct 9, 2019
1bfe9a3
Merge remote-tracking branch 'origin/fix/groupby' into fix/groupby
dcherian Oct 9, 2019
7404508
whitespace
dcherian Oct 9, 2019
40ad11c
fix whats-new
dcherian Oct 9, 2019
611baae
Merge branch 'master' into fix/groupby
dcherian Oct 9, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -213,4 +213,4 @@ applying your function, and then unstacking the result:
.. ipython:: python

stacked = da.stack(gridcell=['ny', 'nx'])
stacked.groupby('gridcell').sum().unstack('gridcell')
stacked.groupby('gridcell').sum(xr.ALL_DIMS).unstack('gridcell')
12 changes: 8 additions & 4 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,16 @@ New functions/methods
Enhancements
~~~~~~~~~~~~

- Add a repr for :py:class:`~xarray.core.GroupBy` objects (:issue:`3344`).
Example::
- :py:class:`~xarray.core.GroupBy` enhancements. By `Deepak Cherian <https://github.com/dcherian>`_.

- Added a repr. Example::

>>> da.groupby("time.season")
DataArrayGroupBy, grouped over 'season'
4 groups with labels 'DJF', 'JJA', 'MAM', 'SON'

By `Deepak Cherian <https://github.com/dcherian>`_.
- Added a ``GroupBy.dims`` property that mirrors the dimensions
of each group.

Bug fixes
~~~~~~~~~
Expand All @@ -67,9 +69,11 @@ Bug fixes
- Line plots with the ``x`` or ``y`` argument set to a 1D non-dimensional coord
now plot the correct data for 2D DataArrays
(:issue:`3334`). By `Tom Nicholas <http://github.com/TomNicholas>`_.
- Fix deprecation of default reduction dimension for :py:class:`~xarray.core.groupby.DataArrayGroupBy` objects
(:issue:`3337`). Also raise a nicer error message when no groups are created (:issue:`1764`). By `Deepak Cherian <https://github.com/dcherian>`_.
- Fix error in concatenating unlabeled dimensions (:pull:`3362`).
By `Deepak Cherian <https://github.com/dcherian/>`_.

dcherian marked this conversation as resolved.
Show resolved Hide resolved
Documentation
~~~~~~~~~~~~~

Expand Down
53 changes: 44 additions & 9 deletions xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from . import dtypes, duck_array_ops, nputils, ops
from .arithmetic import SupportsArithmetic
from .common import ImplementsArrayReduce, ImplementsDatasetReduce
from .common import ALL_DIMS, ImplementsArrayReduce, ImplementsDatasetReduce
from .concat import concat
from .formatting import format_array_flat
from .options import _get_keep_attrs
Expand Down Expand Up @@ -248,6 +248,7 @@ class GroupBy(SupportsArithmetic):
"_restore_coord_dims",
"_stacked_dim",
"_unique_coord",
"dims",
)

def __init__(
Expand Down Expand Up @@ -320,6 +321,8 @@ def __init__(
full_index = None

if bins is not None:
if np.isnan(bins).all():
raise ValueError("All bin edges are NaN.")
binned = pd.cut(group.values, bins, **cut_kwargs)
new_dim_name = group.name + "_bins"
group = DataArray(binned, group.coords, name=new_dim_name)
Expand Down Expand Up @@ -380,6 +383,19 @@ def __init__(
# cached attributes
self._groups = None

if len(group_indices) == 0:
if bins is not None:
raise ValueError(
dcherian marked this conversation as resolved.
Show resolved Hide resolved
"None of the data falls within bins with edges %r" % bins
)
else:
raise ValueError(
"Failed to group data. Are you grouping by a variable that is all NaN?"
)
jhamman marked this conversation as resolved.
Show resolved Hide resolved

example = obj.isel(**{group_dim: group_indices[0]})
self.dims = example.dims
dcherian marked this conversation as resolved.
Show resolved Hide resolved

@property
def groups(self):
# provided to mimic pandas.groupby
Expand All @@ -394,11 +410,15 @@ def __iter__(self):
return zip(self._unique_coord.values, self._iter_grouped())

def __repr__(self):
return "%s, grouped over %r \n%r groups with labels %s" % (
self.__class__.__name__,
self._unique_coord.name,
self._unique_coord.size,
", ".join(format_array_flat(self._unique_coord, 30).split()),
return (
"%s, grouped over %r \n%r groups with labels %s. \nEach group has dimensions: %r"
% (
self.__class__.__name__,
self._unique_coord.name,
self._unique_coord.size,
", ".join(format_array_flat(self._unique_coord, 30).split()),
list(self.dims),
)
)

def _get_index_and_items(self, index, grouper):
Expand Down Expand Up @@ -689,7 +709,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
q : float in range of [0,1] (or sequence of floats)
Quantile to compute, which must be between 0 and 1
inclusive.
dim : str or sequence of str, optional
dim : xarray.ALL_DIMS, str or sequence of str, optional
Dimension(s) over which to apply quantile.
Defaults to the grouped dimension.
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
Expand Down Expand Up @@ -746,7 +766,7 @@ def reduce(
Function which can be called in the form
`func(x, axis=axis, **kwargs)` to return the result of collapsing
an np.ndarray over an integer valued axis.
dim : str or sequence of str, optional
dim : xarray.ALL_DIMS, str or sequence of str, optional
Dimension(s) over which to apply `func`.
axis : int or sequence of int, optional
Axis(es) over which to apply `func`. Only one of the 'dimension'
Expand All @@ -765,9 +785,18 @@ def reduce(
Array with summarized data and the indicated dimension(s)
removed.
"""
if dim is None:
dim = self._group_dim

if keep_attrs is None:
keep_attrs = _get_keep_attrs(default=False)

if dim is not ALL_DIMS and dim not in self.dims:
raise ValueError(
"cannot reduce over dimension %r. expected either xarray.ALL_DIMS to reduce over all dimensions or one or more of %r."
% (dim, self.dims)
)

def reduce_array(ar):
return ar.reduce(func, dim, axis, keep_attrs=keep_attrs, **kwargs)

Expand Down Expand Up @@ -835,7 +864,7 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs):
Function which can be called in the form
`func(x, axis=axis, **kwargs)` to return the result of collapsing
an np.ndarray over an integer valued axis.
dim : str or sequence of str, optional
dim : xarray.ALL_DIMS, str or sequence of str, optional
Dimension(s) over which to apply `func`.
axis : int or sequence of int, optional
Axis(es) over which to apply `func`. Only one of the 'dimension'
Expand Down Expand Up @@ -863,6 +892,12 @@ def reduce(self, func, dim=None, keep_attrs=None, **kwargs):
def reduce_dataset(ds):
return ds.reduce(func, dim, keep_attrs, **kwargs)

if dim is not ALL_DIMS and dim not in self.dims:
raise ValueError(
"cannot reduce over dimension %r. expected either xarray.ALL_DIMS to reduce over all dimensions or one or more of %r."
% (dim, self.dims)
)

return self.apply(reduce_dataset)

def assign(self, **kwargs):
Expand Down
9 changes: 9 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2579,6 +2579,15 @@ def change_metadata(x):
expected = change_metadata(expected)
assert_equal(expected, actual)

def test_groupby_reduce_dimension_error(self):
    """Reducing over a dimension consumed by the grouping raises ValueError."""
    da = self.make_groupby_example_array()
    gb = da.groupby("y")
    with raises_regex(ValueError, "cannot reduce over dimension 'y'"):
        gb.mean()

    # With squeeze=False the same reduction succeeds and returns the
    # original array unchanged.
    gb = da.groupby("y", squeeze=False)
    assert_identical(da, gb.mean())

def test_groupby_math(self):
array = self.make_groupby_example_array()
for squeeze in [True, False]:
Expand Down
51 changes: 45 additions & 6 deletions xarray/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import xarray as xr
from xarray.core.groupby import _consolidate_slices

from . import assert_identical
from . import assert_identical, raises_regex


def test_consolidate_slices():
Expand All @@ -21,6 +21,19 @@ def test_consolidate_slices():
_consolidate_slices([slice(3), 4])


def test_groupby_dims_property():
    """GroupBy.dims mirrors the dims of a single example group."""
    ds = xr.Dataset(
        {"foo": (("x", "y", "z"), np.random.randn(3, 4, 2))},
        {"x": ["a", "bcd", "c"], "y": [1, 2, 3, 4], "z": [1, 2]},
    )

    # Grouping over a dimension drops that dimension from each group.
    for dim in ("x", "y"):
        assert ds.groupby(dim).dims == ds.isel(**{dim: 1}).dims

    # Same contract holds when grouping over a stacked (MultiIndex) dim.
    stacked = ds.stack({"xy": ("x", "y")})
    assert stacked.groupby("xy").dims == stacked.isel(xy=0).dims


def test_multi_index_groupby_apply():
# regression test for GH873
ds = xr.Dataset(
Expand Down Expand Up @@ -221,25 +234,51 @@ def test_groupby_repr(obj, dim):
expected = "%sGroupBy" % obj.__class__.__name__
expected += ", grouped over %r " % dim
expected += "\n%r groups with labels " % (len(np.unique(obj[dim])))
dims = list(obj.dims)
if dim == "x":
expected += "1, 2, 3, 4, 5"
expected += "1, 2, 3, 4, 5. "
elif dim == "y":
expected += "0, 1, 2, 3, 4, 5, ..., 15, 16, 17, 18, 19"
expected += "0, 1, 2, 3, 4, 5, ..., 15, 16, 17, 18, 19. "
dims = list(obj.isel(y=1).dims)
elif dim == "z":
expected += "'a', 'b', 'c'"
expected += "'a', 'b', 'c'. "
elif dim == "month":
expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12"
expected += "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12. "
expected += "\nEach group has dimensions: %r" % dims
assert actual == expected


@pytest.mark.parametrize("obj", [repr_da, repr_da.to_dataset(name="a")])
def test_groupby_repr_datetime(obj):
    """Repr of a datetime-accessor groupby shows class, group count, labels and dims."""
    n_groups = len(np.unique(obj.t.dt.month))
    parts = [
        "%sGroupBy" % obj.__class__.__name__,
        ", grouped over 'month' ",
        "\n%r groups with labels " % (n_groups,),
        "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12. ",
        "\nEach group has dimensions: %r" % list(obj.dims),
    ]
    assert repr(obj.groupby("t.month")) == "".join(parts)


def test_groupby_grouping_errors():
    """Degenerate groupings (empty bins, all-NaN bins, all-NaN key) raise ValueError."""
    dataset = xr.Dataset({"foo": ("x", [1, 1, 1])}, {"x": [1, 2, 3]})

    # Each failure mode is checked on both the Dataset and its DataArray form.
    for obj in (dataset, dataset.to_array()):
        with raises_regex(ValueError, "None of the data falls within bins with edges"):
            obj.groupby_bins("x", bins=[0.1, 0.2, 0.3])

        with raises_regex(ValueError, "All bin edges are NaN."):
            obj.groupby_bins("x", bins=[np.nan, np.nan, np.nan])

        with raises_regex(ValueError, "Failed to group data."):
            obj.groupby(dataset.foo * np.nan)


# TODO: move other groupby tests from test_dataset and test_dataarray over here