diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py index 342475b96df..8d0c3932870 100644 --- a/asv_bench/benchmarks/unstacking.py +++ b/asv_bench/benchmarks/unstacking.py @@ -7,18 +7,23 @@ class Unstacking: def setup(self): - data = np.random.RandomState(0).randn(1, 1000, 500) - self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"]) + data = np.random.RandomState(0).randn(500, 1000) + self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) + self.da_missing = self.da_full[:-1] + self.df_missing = self.da_missing.to_pandas() def time_unstack_fast(self): - self.ds.unstack("flat_dim") + self.da_full.unstack("flat_dim") def time_unstack_slow(self): - self.ds[:, ::-1].unstack("flat_dim") + self.da_missing.unstack("flat_dim") + + def time_unstack_pandas_slow(self): + self.df_missing.unstack() class UnstackingDask(Unstacking): def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) - self.ds = self.ds.chunk({"flat_dim": 50}) + self.da_full = self.da_full.chunk({"flat_dim": 50}) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0f2bf423449..488d8baa650 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -17,7 +17,7 @@ What's New .. _whats-new.0.16.3: -v0.16.3 (unreleased) +v0.17.0 (unreleased) -------------------- Breaking changes @@ -45,6 +45,11 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Significantly higher ``unstack`` performance on numpy-backed arrays which + contain missing values; 8x faster in our benchmark, and 2x faster than pandas. + (:pull:`4746`); + By `Maximilian Roos `_. + - Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables. By `Deepak Cherian `_. - :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f8718377104..a73e299e27a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4,6 +4,7 @@ import sys import warnings from collections import defaultdict +from distutils.version import LooseVersion from html import escape from numbers import Number from operator import methodcaller @@ -79,7 +80,7 @@ ) from .missing import get_clean_interp_index from .options import OPTIONS, _get_keep_attrs -from .pycompat import is_duck_dask_array +from .pycompat import is_duck_dask_array, sparse_array_type from .utils import ( Default, Frozen, @@ -3715,7 +3716,40 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset": + def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": + index = self.get_index(dim) + index = remove_unused_levels_categories(index) + + variables: Dict[Hashable, Variable] = {} + indexes = {k: v for k, v in self.indexes.items() if k != dim} + + for name, var in self.variables.items(): + if name != dim: + if dim in var.dims: + if isinstance(fill_value, Mapping): + fill_value_ = fill_value[name] + else: + fill_value_ = fill_value + + variables[name] = var._unstack_once( + index=index, dim=dim, fill_value=fill_value_ + ) + else: + variables[name] = var + + for name, lev in zip(index.names, index.levels): + variables[name] = IndexVariable(name, lev) + indexes[name] = lev + + coord_names = set(self._coord_names) - {dim} | set(index.names) + + return self._replace_with_new_dims( + variables, coord_names=coord_names, indexes=indexes + ) + + def _unstack_full_reindex( + self, dim: Hashable, fill_value, sparse: bool + ) -> "Dataset": index = self.get_index(dim) index = remove_unused_levels_categories(index) full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3812,7 +3846,38 @@ def unstack( result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim, fill_value, sparse) + + if ( + # Dask arrays don't support assignment by index, which the fast unstack + # function requires. + # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125 + any(is_duck_dask_array(v.data) for v in self.variables.values()) + # Sparse doesn't currently support (though we could special-case + # it) + # https://github.com/pydata/sparse/issues/422 + or any( + isinstance(v.data, sparse_array_type) + for v in self.variables.values() + ) + or sparse + # numpy full_like only added `shape` in 1.17 + or LooseVersion(np.__version__) < LooseVersion("1.17") + # Until https://github.com/pydata/xarray/pull/4751 is resolved, + # we check explicitly whether it's a numpy array. Once that is + # resolved, explicitly exclude pint arrays. + # # pint doesn't implement `np.full_like` in a way that's + # # currently compatible. + # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173 + # # or any( + # # isinstance(v.data, pint_array_type) for v in self.variables.values() + # # ) + or any( + not isinstance(v.data, np.ndarray) for v in self.variables.values() + ) + ): + result = result._unstack_full_reindex(dim, fill_value, sparse) + else: + result = result._unstack_once(dim, fill_value) return result def update(self, other: "CoercibleMapping") -> "Dataset": @@ -4982,6 +5047,10 @@ def _set_numpy_data_from_dataframe( self[name] = (dims, values) return + # NB: similar, more general logic, now exists in + # variable.unstack_once; we could consider combining them at some + # point. + shape = tuple(lev.size for lev in idx.levels) indexer = tuple(idx.codes) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 797de65bbcf..64c1895da59 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -10,6 +10,7 @@ Any, Dict, Hashable, + List, Mapping, Optional, Sequence, @@ -1488,7 +1489,7 @@ def set_dims(self, dims, shape=None): ) return expanded_var.transpose(*dims) - def _stack_once(self, dims, new_dim): + def _stack_once(self, dims: List[Hashable], new_dim: Hashable): if not set(dims) <= set(self.dims): raise ValueError("invalid existing dimensions: %s" % dims) @@ -1544,7 +1545,15 @@ def stack(self, dimensions=None, **dimensions_kwargs): result = result._stack_once(dims, new_dim) return result - def _unstack_once(self, dims, old_dim): + def _unstack_once_full( + self, dims: Mapping[Hashable, int], old_dim: Hashable + ) -> "Variable": + """ + Unstacks the variable without needing an index. + + Unlike `_unstack_once`, this function requires the existing dimension to + contain the full product of the new dimensions. + """ new_dim_names = tuple(dims.keys()) new_dim_sizes = tuple(dims.values()) @@ -1573,6 +1582,53 @@ def _unstack_once(self, dims, old_dim): return Variable(new_dims, new_data, self._attrs, self._encoding, fastpath=True) + def _unstack_once( + self, + index: pd.MultiIndex, + dim: Hashable, + fill_value=dtypes.NA, + ) -> "Variable": + """ + Unstacks this variable given an index to unstack and the name of the + dimension to which the index refers. + """ + + reordered = self.transpose(..., dim) + + new_dim_sizes = [lev.size for lev in index.levels] + new_dim_names = index.names + indexer = index.codes + + # Potentially we could replace `len(other_dims)` with just `-1` + other_dims = [d for d in self.dims if d != dim] + new_shape = list(reordered.shape[: len(other_dims)]) + new_dim_sizes + new_dims = reordered.dims[: len(other_dims)] + new_dim_names + + if fill_value is dtypes.NA: + is_missing_values = np.prod(new_shape) > np.prod(self.shape) + if is_missing_values: + dtype, fill_value = dtypes.maybe_promote(self.dtype) + else: + dtype = self.dtype + fill_value = dtypes.get_fill_value(dtype) + else: + dtype = self.dtype + + # Currently fails on sparse due to https://github.com/pydata/sparse/issues/422 + data = np.full_like( + self.data, + fill_value=fill_value, + shape=new_shape, + dtype=dtype, + ) + + # Indexer is a list of lists of locations. Each list is the locations + # on the new dimension. This is robust to the data being sparse; in that + # case the destinations will be NaN / zero. + data[(..., *indexer)] = reordered + + return self._replace(dims=new_dims, data=data) + def unstack(self, dimensions=None, **dimensions_kwargs): """ Unstack an existing dimension into multiple new dimensions. @@ -1580,6 +1636,10 @@ def unstack(self, dimensions=None, **dimensions_kwargs): New dimensions will be added at the end, and the order of the data along each new dimension will be in contiguous (C) order. + Note that unlike ``DataArray.unstack`` and ``Dataset.unstack``, this + method requires the existing dimension to contain the full product of + the new dimensions. + Parameters ---------- dimensions : mapping of hashable to mapping of hashable to int @@ -1598,11 +1658,13 @@ def unstack(self, dimensions=None, **dimensions_kwargs): See also -------- Variable.stack + DataArray.unstack + Dataset.unstack """ dimensions = either_dict_or_kwargs(dimensions, dimensions_kwargs, "unstack") result = self for old_dim, dims in dimensions.items(): - result = result._unstack_once(dims, old_dim) + result = result._unstack_once_full(dims, old_dim) return result def fillna(self, value):