Skip to content

.groupby_bins fails when data is not contained in bins #1764

Closed
@jbusecke

Description

@jbusecke

Consider the following example.

import xarray as xr
import numpy as np
import dask.array as dsa
from dask.diagnostics import ProgressBar
# Groupby bins problem with small bins?
x_raw = np.arange(20)
y_raw = np.arange(10)
z_raw = np.arange(15)

x = xr.DataArray(dsa.from_array(x_raw, chunks=(-1)), dims=['x'], coords={'x':('x', x_raw)})
y = xr.DataArray(dsa.from_array(y_raw, chunks=(-1)), dims=['y'], coords={'y':('y', y_raw)})
z = xr.DataArray(dsa.from_array(z_raw, chunks=(-1)), dims=['z'], coords={'z':('z', z_raw)})

data = xr.DataArray(dsa.ones([20, 10, 15], chunks=[-1, -1, -1]), dims=['x', 'y', 'z'], coords={
    'x':x, 'y':y, 'z':z
})
data
<xarray.DataArray 'wrapped-bb05d395159047b749ca855110244cb7' (x: 20, y: 10, z: 15)>
dask.array<shape=(20, 10, 15), dtype=float64, chunksize=(20, 10, 15)>
Coordinates:
  * x        (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
  * y        (y) int64 0 1 2 3 4 5 6 7 8 9
  * z        (z) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14

This dask array contains only ones. If I now apply groupby_bins with an explicit array of bin edges that all lie below 1 (so that no data value falls into any bin), it fails with a rather cryptic error.
# This doesn't work
bins = np.array([0, 20, 40, 60 , 80, 100])*1e-6

binned = data.groupby_bins(data, bins).sum()
binned
---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
<ipython-input-7-dc9283bee4ea> in <module>()
      2 bins = np.array([0, 20, 40, 60 , 80, 100])*1e-6
      3 
----> 4 binned = data.groupby_bins(data, bins).sum()
      5 binned

~/Work/CODE/PYTHON/xarray/xarray/core/common.py in wrapped_func(self, dim, axis, skipna, keep_attrs, **kwargs)
     20                              keep_attrs=False, **kwargs):
     21                 return self.reduce(func, dim, axis, keep_attrs=keep_attrs,
---> 22                                    skipna=skipna, allow_lazy=True, **kwargs)
     23         else:
     24             def wrapped_func(self, dim=None, axis=None, keep_attrs=False,

~/Work/CODE/PYTHON/xarray/xarray/core/groupby.py in reduce(self, func, dim, axis, keep_attrs, shortcut, **kwargs)
    572         def reduce_array(ar):
    573             return ar.reduce(func, dim, axis, keep_attrs=keep_attrs, **kwargs)
--> 574         return self.apply(reduce_array, shortcut=shortcut)
    575 
    576 ops.inject_reduce_methods(DataArrayGroupBy)

~/Work/CODE/PYTHON/xarray/xarray/core/groupby.py in apply(self, func, shortcut, **kwargs)
    516         applied = (maybe_wrap_array(arr, func(arr, **kwargs))
    517                    for arr in grouped)
--> 518         return self._combine(applied, shortcut=shortcut)
    519 
    520     def _combine(self, applied, shortcut=False):

~/Work/CODE/PYTHON/xarray/xarray/core/groupby.py in _combine(self, applied, shortcut)
    520     def _combine(self, applied, shortcut=False):
    521         """Recombine the applied objects like the original."""
--> 522         applied_example, applied = peek_at(applied)
    523         coord, dim, positions = self._infer_concat_args(applied_example)
    524         if shortcut:

~/Work/CODE/PYTHON/xarray/xarray/core/utils.py in peek_at(iterable)
    114     """
    115     gen = iter(iterable)
--> 116     peek = next(gen)
    117     return peek, itertools.chain([peek], gen)
    118 

StopIteration: 

If, however, the last bin includes the value 1, it runs as expected:

# If I include a larger value at the end it works
bins = np.array([0, 20, 40, 60 , 80, 100, 1e7])*1e-6

binned = data.groupby_bins(data, bins).sum()
binned
<xarray.DataArray 'wrapped-bb05d395159047b749ca855110244cb7' (wrapped-bb05d395159047b749ca855110244cb7_bins: 6)>
dask.array<shape=(6,), dtype=float64, chunksize=(5,)>
Coordinates:
  * wrapped-bb05d395159047b749ca855110244cb7_bins  (wrapped-bb05d395159047b749ca855110244cb7_bins) object (0.0, 2e-05] ...

Problem description

Is this expected behavior? I would prefer that it return NaN values for the bins that capture no values.
It took me a while to find out why my script using this was failing; if this is the expected behavior, could a more helpful error message be considered?

Expected Output

Output of xr.show_versions()

# Paste the output of xr.show_versions() here
INSTALLED VERSIONS
------------------
commit: None
python: 3.6.2.final.0
python-bits: 64
OS: Darwin
OS-release: 16.7.0
machine: x86_64
processor: i386
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: en_US.UTF-8

xarray: 0.10.0rc1-9-gdbf7b01
pandas: 0.20.3
numpy: 1.13.1
scipy: 0.19.1
netCDF4: 1.2.9
h5netcdf: 0.4.1
Nio: None
bottleneck: 1.2.1
cyordereddict: None
dask: 0.15.4
matplotlib: 2.0.2
cartopy: 0.15.1
seaborn: 0.8.1
setuptools: 36.3.0
pip: 9.0.1
conda: None
pytest: 3.2.2
IPython: 6.1.0
sphinx: 1.6.5

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions