Skip to content

Bug fixes for Dataset.reduce() and n-dimensional cumsum/cumprod #2156

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,24 @@ Documentation
Enhancements
~~~~~~~~~~~~

- :py:meth:`~DataArray.cumsum` and :py:meth:`~DataArray.cumprod` now support
aggregation over multiple dimensions at the same time. This is the default
behavior when dimensions are not specified (previously this raised an error).
By `Stephan Hoyer <https://github.com/shoyer>`_

Bug fixes
~~~~~~~~~

- Fixed a bug where `to_netcdf(..., unlimited_dims='bar'` yielded NetCDF files
with spurious 0-length dimensions (i.e. `b`, `a`, and `r`) (:issue:`2134`).
By `Joe Hamman <https://github.com/jhamman>`_.

- Aggregations with :py:meth:`Dataset.reduce` (including ``mean``, ``sum``,
etc) no longer drop unrelated coordinates (:issue:`1470`). Also fixed a
bug where non-scalar data-variables that did not include the aggregation
dimension were improperly skipped.
By `Stephan Hoyer <https://github.com/shoyer>`_

.. _whats-new.0.10.4:

v0.10.4 (May 16, 2018)
Expand Down
38 changes: 19 additions & 19 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2594,26 +2594,26 @@ def reduce(self, func, dim=None, keep_attrs=False, numeric_only=False,
variables = OrderedDict()
for name, var in iteritems(self._variables):
reduce_dims = [dim for dim in var.dims if dim in dims]
if reduce_dims or not var.dims:
if name not in self.coords:
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
(var.dtype == np.bool_)):
if len(reduce_dims) == 1:
# unpack dimensions for the benefit of functions
# like np.argmin which can't handle tuple arguments
reduce_dims, = reduce_dims
elif len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
allow_lazy=allow_lazy,
**kwargs)
if name in self.coords:
if not reduce_dims:
variables[name] = var
else:
variables[name] = var
if (not numeric_only or
np.issubdtype(var.dtype, np.number) or
(var.dtype == np.bool_)):
if len(reduce_dims) == 1:
# unpack dimensions for the benefit of functions
# like np.argmin which can't handle tuple arguments
reduce_dims, = reduce_dims
elif len(reduce_dims) == var.ndim:
# prefer to aggregate over axis=None rather than
# axis=(0, 1) if they will be equivalent, because
# the former is often more efficient
reduce_dims = None
variables[name] = var.reduce(func, dim=reduce_dims,
keep_attrs=keep_attrs,
allow_lazy=allow_lazy,
**kwargs)

coord_names = set(k for k in self.coords if k in variables)
attrs = self.attrs if keep_attrs else None
Expand Down
36 changes: 29 additions & 7 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,7 @@ def _nanvar_object(value, axis=None, **kwargs):


def _create_nan_agg_method(name, numeric_only=False, np_compat=False,
no_bottleneck=False, coerce_strings=False,
keep_dims=False):
no_bottleneck=False, coerce_strings=False):
def f(values, axis=None, skipna=None, **kwargs):
if kwargs.pop('out', None) is not None:
raise TypeError('`out` is not valid for {}'.format(name))
Expand Down Expand Up @@ -343,7 +342,6 @@ def f(values, axis=None, skipna=None, **kwargs):
'or newer to use skipna=True or skipna=None' % name)
raise NotImplementedError(msg)
f.numeric_only = numeric_only
f.keep_dims = keep_dims
f.__name__ = name
return f

Expand All @@ -358,10 +356,34 @@ def f(values, axis=None, skipna=None, **kwargs):
var = _create_nan_agg_method('var', numeric_only=True)
median = _create_nan_agg_method('median', numeric_only=True)
prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True)
cumprod = _create_nan_agg_method('cumprod', numeric_only=True, np_compat=True,
no_bottleneck=True, keep_dims=True)
cumsum = _create_nan_agg_method('cumsum', numeric_only=True, np_compat=True,
no_bottleneck=True, keep_dims=True)
cumprod_1d = _create_nan_agg_method(
'cumprod', numeric_only=True, np_compat=True, no_bottleneck=True)
cumsum_1d = _create_nan_agg_method(
'cumsum', numeric_only=True, np_compat=True, no_bottleneck=True)


def _nd_cum_func(cum_func, array, axis, **kwargs):
array = asarray(array)
if axis is None:
axis = tuple(range(array.ndim))
if isinstance(axis, int):
axis = (axis,)

out = array
for ax in axis:
out = cum_func(out, axis=ax, **kwargs)
return out


def cumprod(array, axis=None, **kwargs):
"""N-dimensional version of cumprod."""
return _nd_cum_func(cumprod_1d, array, axis, **kwargs)


def cumsum(array, axis=None, **kwargs):
"""N-dimensional version of cumsum."""
return _nd_cum_func(cumsum_1d, array, axis, **kwargs)


_fail_on_dask_array_input_skipna = partial(
fail_on_dask_array_input,
Expand Down
5 changes: 0 additions & 5 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,11 +1256,6 @@ def reduce(self, func, dim=None, axis=None, keep_attrs=False,
if dim is not None and axis is not None:
raise ValueError("cannot supply both 'axis' and 'dim' arguments")

if getattr(func, 'keep_dims', False):
if dim is None and axis is None:
raise ValueError("must supply either single 'dim' or 'axis' "
"argument to %s" % (func.__name__))

if dim is not None:
axis = self.get_axis_num(dim)
data = func(self.data if allow_lazy else self.values,
Expand Down
5 changes: 5 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1767,6 +1767,11 @@ def test_cumops(self):
orig = DataArray([[-1, 0, 1], [-3, 0, 3]], coords,
dims=['x', 'y'])

actual = orig.cumsum()
expected = DataArray([[-1, -1, 0], [-4, -4, 0]], coords,
dims=['x', 'y'])
assert_identical(expected, actual)

actual = orig.cumsum('x')
expected = DataArray([[-1, 0, 1], [-4, 0, 4]], coords,
dims=['x', 'y'])
Expand Down
34 changes: 27 additions & 7 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3331,7 +3331,18 @@ def test_reduce(self):

assert_equal(data.mean(dim=[]), data)

# uint support
def test_reduce_coords(self):
# regression test for GH1470
data = xr.Dataset({'a': ('x', [1, 2, 3])}, coords={'b': 4})
expected = xr.Dataset({'a': 2}, coords={'b': 4})
actual = data.mean('x')
assert_identical(actual, expected)

# should be consistent
actual = data['a'].mean('x').to_dataset()
assert_identical(actual, expected)

def test_mean_uint_dtype(self):
data = xr.Dataset({'a': (('x', 'y'),
np.arange(6).reshape(3, 2).astype('uint')),
'b': (('x', ), np.array([0.1, 0.2, np.nan]))})
Expand All @@ -3345,15 +3356,20 @@ def test_reduce_bad_dim(self):
with raises_regex(ValueError, 'Dataset does not contain'):
data.mean(dim='bad_dim')

def test_reduce_cumsum(self):
data = xr.Dataset({'a': 1,
'b': ('x', [1, 2]),
'c': (('x', 'y'), [[np.nan, 3], [0, 4]])})
assert_identical(data.fillna(0), data.cumsum('y'))

expected = xr.Dataset({'a': 1,
'b': ('x', [1, 3]),
'c': (('x', 'y'), [[0, 3], [0, 7]])})
assert_identical(expected, data.cumsum())

def test_reduce_cumsum_test_dims(self):
data = create_test_data()
for cumfunc in ['cumsum', 'cumprod']:
with raises_regex(ValueError,
"must supply either single 'dim' or 'axis'"):
getattr(data, cumfunc)()
with raises_regex(ValueError,
"must supply either single 'dim' or 'axis'"):
getattr(data, cumfunc)(dim=['dim1', 'dim2'])
with raises_regex(ValueError, 'Dataset does not contain'):
getattr(data, cumfunc)(dim='bad_dim')

Expand Down Expand Up @@ -3460,6 +3476,10 @@ def test_reduce_scalars(self):
actual = ds.var()
assert_identical(expected, actual)

expected = Dataset({'x': 0, 'y': 0, 'z': ('b', [0])})
actual = ds.var('a')
assert_identical(expected, actual)

def test_reduce_only_one_axis(self):

def mean_only_one_axis(x, axis):
Expand Down
48 changes: 48 additions & 0 deletions xarray/tests/test_duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import warnings

from xarray import DataArray, concat
from xarray.core import duck_array_ops
from xarray.core.duck_array_ops import (
array_notnull_equiv, concatenate, count, first, last, mean, rolling_window,
stack, where)
Expand Down Expand Up @@ -103,6 +104,53 @@ def test_all_nan_arrays(self):
assert np.isnan(mean([np.nan, np.nan]))


def test_cumsum_1d():
inputs = np.array([0, 1, 2, 3])
expected = np.array([0, 1, 3, 6])
actual = duck_array_ops.cumsum(inputs)
assert_array_equal(expected, actual)

actual = duck_array_ops.cumsum(inputs, axis=0)
assert_array_equal(expected, actual)

actual = duck_array_ops.cumsum(inputs, axis=-1)
assert_array_equal(expected, actual)

actual = duck_array_ops.cumsum(inputs, axis=(0,))
assert_array_equal(expected, actual)

actual = duck_array_ops.cumsum(inputs, axis=())
assert_array_equal(inputs, actual)


def test_cumsum_2d():
inputs = np.array([[1, 2], [3, 4]])

expected = np.array([[1, 3], [4, 10]])
actual = duck_array_ops.cumsum(inputs)
assert_array_equal(expected, actual)

actual = duck_array_ops.cumsum(inputs, axis=(0, 1))
assert_array_equal(expected, actual)

actual = duck_array_ops.cumsum(inputs, axis=())
assert_array_equal(inputs, actual)


def test_cumprod_2d():
inputs = np.array([[1, 2], [3, 4]])

expected = np.array([[1, 2], [3, 2 * 3 * 4]])
actual = duck_array_ops.cumprod(inputs)
assert_array_equal(expected, actual)

actual = duck_array_ops.cumprod(inputs, axis=(0, 1))
assert_array_equal(expected, actual)

actual = duck_array_ops.cumprod(inputs, axis=())
assert_array_equal(inputs, actual)


class TestArrayNotNullEquiv():
@pytest.mark.parametrize("arr1, arr2", [
(np.array([1, 2, 3]), np.array([1, 2, 3])),
Expand Down