Skip to content

Dataset.reduce methods #137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,23 @@ Selecting
Dataset.squeeze
Dataset.groupby

Computations
~~~~~~~~~~~~

.. autosummary::
:toctree: generated/

Dataset.all
Dataset.any
Dataset.argmax
Dataset.argmin
Dataset.max
Dataset.min
Dataset.mean
Dataset.std
Dataset.sum
Dataset.var

IO / Conversion
~~~~~~~~~~~~~~~

Expand Down
17 changes: 17 additions & 0 deletions doc/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,23 @@ contents of the ``Dataset`` will still be the same underlying
:py:class:`xray.Variable`. You can copy all data by supplying the argument
``deep=True``.

Dataset reductions
~~~~~~~~~~~~~~~~~~
We can apply NumPy reduction functions to the entire dataset, returning a new
``Dataset``.

.. ipython:: python

bar = ds.mean()
bar

The ``dimension`` keyword (default ``None``) will limit the reduction to only the dimension(s) provided.

.. ipython:: python

spam = ds.mean(dimension='time')
spam

``DataArray`` objects
---------------------

Expand Down
42 changes: 42 additions & 0 deletions test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,3 +656,45 @@ def test_lazy_load(self):
# these should not raise UnexpectedDataAccess:
ds.indexed(time=10)
ds.indexed(time=slice(10), dim1=[0]).indexed(dim1=0, dim2=-1)

def test_reduce(self):
    """Dataset reductions apply per-variable and drop the reduced dims."""
    data = create_test_data()

    # Reducing over every dimension leaves no coordinates behind.
    self.assertEqual(len(data.mean().coordinates), 0)

    # Reducing the whole Dataset matches reducing each variable alone.
    # (Fixed: the original overwrote the Dataset-level result inside the
    # loop and then indexed the per-variable DataArray by name.)
    actual = data.max()
    for var in data.noncoordinates:
        expected = data[var].max()
        self.assertDataArrayEqual(expected, actual[var])

    # A single dimension name and a one-element list are equivalent.
    self.assertDatasetEqual(data.min(dimension=['dim1']),
                            data.min(dimension='dim1'))

    # Reducing over various subsets removes exactly those dimensions.
    for reduct, expected in [('dim2', ['dim1', 'dim3', 'time']),
                             (['dim2', 'time'], ['dim1', 'dim3']),
                             (('dim2', 'time'), ['dim1', 'dim3']),
                             ((), ['dim1', 'dim2', 'dim3', 'time'])]:
        actual = data.min(dimension=reduct).dimensions
        self.assertItemsEqual(actual, expected)

    # An empty list of dimensions is a no-op.
    self.assertDatasetEqual(data.mean(dimension=[]), data)

def test_reduce_bad_dimension(self):
    """Reducing over a dimension absent from the Dataset raises ValueError."""
    data = create_test_data()
    # The result is never used, so don't bind it (the original assigned
    # to an unused local `ds`).
    with self.assertRaisesRegexp(ValueError, 'Dataset does not contain'):
        data.mean(dimension='bad_dim')

def test_reduce_non_numeric(self):
    """Non-numeric (string) variables are silently dropped by reductions."""
    data1 = create_test_data(seed=44)
    data2 = create_test_data(seed=44)
    # Inject a string-valued variable into data1 only.
    extra_vars = {'var4': ['dim1', 'dim2']}
    for name, dims in sorted(extra_vars.items()):
        shape = tuple(_dims[d] for d in dims)
        values = np.random.random_integers(0, 100, size=shape).astype(np.str_)
        data1[name] = (dims, values, {'foo': 'variable'})

    # The string variable cannot be reduced, so it disappears...
    self.assertTrue('var4' not in data1.mean())
    # ...and both datasets therefore reduce to identical results.
    self.assertDatasetEqual(data1.mean(), data2.mean())
    self.assertDatasetEqual(data1.mean(dimension='dim1'),
                            data2.mean(dimension='dim1'))
83 changes: 83 additions & 0 deletions xray/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from . import variable
from . import utils
from . import data_array
from . import ops
from .utils import (FrozenOrderedDict, Frozen, SortedKeysDict, ChainMap,
multi_index_from_product)
from .pycompat import iteritems, basestring
Expand Down Expand Up @@ -973,6 +974,86 @@ def squeeze(self, dimension=None):
"""
return utils.squeeze(self, self.dimensions, dimension)

# Template for the docstrings of the injected reduction methods
# (mean, sum, max, ...).  Fixed a typo: "data'" -> "data".
_reduce_method_docstring = \
    """Reduce this {cls}'s data by applying `{name}` along some
    dimension(s).

    Parameters
    ----------
    dimension : str or sequence of str, optional
        Dimension(s) over which to apply `func`. By default `func` is
        applied over all dimensions.
    **kwargs : dict
        Additional keyword arguments passed on to `{name}`.

    Returns
    -------
    reduced : {cls}
        New {cls} object with `{name}` applied to its data and the
        indicated dimension(s) removed.
    """

@classmethod
def _reduce_method(cls, f, name=None, module=None):
    """Build a reduction method (e.g. ``mean``) that wraps `f` via `reduce`.

    Parameters
    ----------
    f : function
        Reduction function of the form `f(x, axis=axis, **kwargs)`.
    name : str, optional
        Name for the generated method; defaults to ``f.__name__``.
    module : str, optional
        Module prefix used only in the generated docstring.
    """
    def func(self, dimension=None, **kwargs):
        return self.reduce(f, dimension, **kwargs)
    if name is None:
        name = f.__name__
    func.__name__ = name
    func.__doc__ = cls._reduce_method_docstring.format(
        name=('' if module is None else module + '.') + name,
        cls=cls.__name__)
    return func

def reduce(self, func, dimension=None, **kwargs):
    """Reduce this dataset by applying `func` along some dimension(s).

    Parameters
    ----------
    func : function
        Function which can be called in the form
        `f(x, axis=axis, **kwargs)` to return the result of reducing an
        np.ndarray over an integer valued axis.
    dimension : str or sequence of str, optional
        Dimension(s) over which to apply `func`. By default `func` is
        applied over all dimensions.
    **kwargs : dict
        Additional keyword arguments passed on to `func`.

    Returns
    -------
    reduced : Dataset
        Dataset with this object's DataArrays replaced with new DataArrays
        of summarized data and the indicated dimension(s) removed.
    """
    # Normalize `dimension` to a set of dimension names.
    if isinstance(dimension, basestring):
        dims = set([dimension])
    elif dimension is None:
        # No dimensions given: reduce over all of them.
        dims = set(self.coordinates)
    else:
        dims = set(dimension)

    # Validate up front so a typo raises instead of silently no-op'ing.
    bad_dims = [dim for dim in dims if dim not in self.coordinates]
    if bad_dims:
        raise ValueError('Dataset does not contain the dimensions: '
                         '{0}'.format(bad_dims))

    variables = OrderedDict()
    for name, var in iteritems(self.variables):
        reduce_dims = [dim for dim in var.dimensions if dim in dims]
        if reduce_dims:
            if name not in self.dimensions:
                try:
                    variables[name] = var.reduce(func,
                                                 dimension=reduce_dims,
                                                 **kwargs)
                except TypeError:
                    # Variables that cannot be reduced by `func`
                    # (e.g. string dtypes) are dropped from the result.
                    pass
            # Coordinate variables for reduced dims are dropped entirely.
        else:
            # Untouched by the reduction: carried over unchanged.
            variables[name] = var
    return Dataset(variables=variables)

@classmethod
def concat(cls, datasets, dimension='concat_dimension', indexers=None,
mode='different', concat_over=None, compat='equals'):
Expand Down Expand Up @@ -1166,3 +1247,5 @@ def from_dataframe(cls, dataframe):
data = series.values.reshape(shape)
obj[name] = (dimensions, data)
return obj

# Attach the reduction methods (mean, sum, max, ...) to Dataset.
# NOTE(review): implementation of ops.inject_reduce_methods is not visible
# here — presumably it calls Dataset._reduce_method for each numpy
# reduction; confirm in xray/ops.py.
ops.inject_reduce_methods(Dataset)