diff --git a/doc/related-projects.rst b/doc/related-projects.rst
index a8af05f3074..3188751366f 100644
--- a/doc/related-projects.rst
+++ b/doc/related-projects.rst
@@ -25,6 +25,7 @@ Geosciences
 - `PyGDX `_: Python 3 package for accessing data stored in GAMS Data eXchange
   (GDX) files. Also uses a custom subclass.
+- `pyinterp `_: Python 3 package for interpolating geo-referenced data used in the field of geosciences.
 - `pyXpcm `_: xarray-based Profile Classification Modelling (PCM), mostly for
   ocean data.
 - `Regionmask `_: plotting and creation of masks of spatial regions
 - `rioxarray `_: geospatial xarray extension powered by rasterio
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index fe05a4d2c21..00d1c50780e 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -25,6 +25,12 @@ Breaking changes
 New Features
 ~~~~~~~~~~~~
 
+- Implement :py:func:`median` and :py:func:`nanmedian` for dask arrays. This works by rechunking
+  to a single chunk along all reduction axes (:issue:`2999`).
+  By `Deepak Cherian `_.
+- :py:func:`xarray.concat` now preserves attributes from the first Variable.
+  (:issue:`2575`, :issue:`2060`, :issue:`1614`).
+  By `Deepak Cherian `_.
 - :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile` and ``GroupBy.quantile``
   now work with dask Variables.
   By `Deepak Cherian `_.
@@ -38,6 +44,9 @@ New Features
 Bug fixes
 ~~~~~~~~~
 
+- Fix :py:func:`xarray.combine_by_coords` to allow for combining incomplete
+  hypercubes of Datasets (:issue:`3648`). By `Ian Bolliger
+  `_.
 - Fix :py:meth:`xarray.combine_by_coords` when combining cftime coordinates
   which span long time intervals (:issue:`3535`). By `Spencer Clark
   `_.
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 65087b05cc0..3f6e0e79351 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -115,11 +115,12 @@ def _infer_concat_order_from_coords(datasets):
     return combined_ids, concat_dims
 
 
-def _check_shape_tile_ids(combined_tile_ids):
+def _check_dimension_depth_tile_ids(combined_tile_ids):
+    """
+    Check all tuples are the same length, i.e. check that all lists are
+    nested to the same depth.
+    """
     tile_ids = combined_tile_ids.keys()
-
-    # Check all tuples are the same length
-    # i.e. check that all lists are nested to the same depth
     nesting_depths = [len(tile_id) for tile_id in tile_ids]
     if not nesting_depths:
         nesting_depths = [0]
@@ -128,8 +129,13 @@ def _check_shape_tile_ids(combined_tile_ids):
             "The supplied objects do not form a hypercube because"
             " sub-lists do not have consistent depths"
         )
+    # return these so they can be reused in _check_shape_tile_ids
+    return tile_ids, nesting_depths
 
-    # Check all lists along one dimension are same length
+
+def _check_shape_tile_ids(combined_tile_ids):
+    """Check all lists along one dimension are same length."""
+    tile_ids, nesting_depths = _check_dimension_depth_tile_ids(combined_tile_ids)
     for dim in range(nesting_depths[0]):
         indices_along_dim = [tile_id[dim] for tile_id in tile_ids]
         occurrences = Counter(indices_along_dim)
@@ -536,7 +542,8 @@ def combine_by_coords(
     coords : {'minimal', 'different', 'all' or list of str}, optional
         As per the 'data_vars' kwarg, but for coordinate variables.
     fill_value : scalar, optional
-        Value to use for newly missing values
+        Value to use for newly missing values. If None, a ValueError is raised
+        if the passed Datasets do not form a complete hypercube.
     join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
         String indicating how to combine differing indexes
         (excluding concat_dim) in objects
@@ -653,6 +660,15 @@ def combine_by_coords(
     temperature    (y, x) float64 1.654 10.63 7.015 2.543 ... 12.46 2.22 15.96
     precipitation  (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
 
+    >>> xr.combine_by_coords([x1, x2, x3])
+    <xarray.Dataset>
+    Dimensions:        (x: 6, y: 4)
+    Coordinates:
+      * x              (x) int64 10 20 30 40 50 60
+      * y              (y) int64 0 1 2 3
+    Data variables:
+        temperature    (y, x) float64 1.654 10.63 7.015 nan ... 12.46 2.22 15.96
+        precipitation  (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
     """
 
     # Group by data vars
@@ -667,7 +683,13 @@ def combine_by_coords(
                 list(datasets_with_same_vars)
             )
 
-            _check_shape_tile_ids(combined_ids)
+            if fill_value is None:
+                # check that datasets form complete hypercube
+                _check_shape_tile_ids(combined_ids)
+            else:
+                # check only that all datasets have same dimension depth for
+                # these vars
+                _check_dimension_depth_tile_ids(combined_ids)
 
             # Concatenate along all of concat_dims one by one to create single ds
             concatenated = _combine_nd(
diff --git a/xarray/core/concat.py b/xarray/core/concat.py
index 5ccbfa3f2b4..302f7afcec6 100644
--- a/xarray/core/concat.py
+++ b/xarray/core/concat.py
@@ -93,12 +93,14 @@ def concat(
         those of the first object with that dimension. Indexes for the same
         dimension must have the same size in all objects.
 
-    indexers, mode, concat_over : deprecated
-
     Returns
     -------
     concatenated : type of objs
 
+    Notes
+    -----
+    Each concatenated Variable preserves corresponding ``attrs`` from the first element of ``objs``.
+
     See also
     --------
     merge
diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py
index c3dbdd27098..de55de89f0c 100644
--- a/xarray/core/dask_array_compat.py
+++ b/xarray/core/dask_array_compat.py
@@ -1,8 +1,14 @@
 from distutils.version import LooseVersion
+from typing import Iterable
 
-import dask.array as da
 import numpy as np
-from dask import __version__ as dask_version
+
+try:
+    import dask.array as da
+    from dask import __version__ as dask_version
+except ImportError:
+    dask_version = "0.0.0"
+    da = None
 
 if LooseVersion(dask_version) >= LooseVersion("2.0.0"):
     meta_from_array = da.utils.meta_from_array
@@ -89,3 +95,76 @@ def meta_from_array(x, ndim=None, dtype=None):
         meta = meta.astype(dtype)
 
     return meta
+
+
+if LooseVersion(dask_version) >= LooseVersion("2.8.1"):
+    median = da.median
+else:
+    # Copied from dask v2.8.1
+    # Used under the terms of Dask's license, see licenses/DASK_LICENSE.
+    def median(a, axis=None, keepdims=False):
+        """
+        This works by automatically chunking the reduced axes to a single
+        chunk and then calling the ``numpy.median`` function across the
+        remaining dimensions.
+        """
+
+        if axis is None:
+            raise NotImplementedError(
+                "The da.median function only works along an axis. "
" + "The full algorithm is difficult to do in parallel" + ) + + if not isinstance(axis, Iterable): + axis = (axis,) + + axis = [ax + a.ndim if ax < 0 else ax for ax in axis] + + a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)}) + + result = a.map_blocks( + np.median, + axis=axis, + keepdims=keepdims, + drop_axis=axis if not keepdims else None, + chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)] + if keepdims + else None, + ) + + return result + + +if LooseVersion(dask_version) > LooseVersion("2.9.0"): + nanmedian = da.nanmedian +else: + + def nanmedian(a, axis=None, keepdims=False): + """ + This works by automatically chunking the reduced axes to a single chunk + and then calling ``numpy.nanmedian`` function across the remaining dimensions + """ + + if axis is None: + raise NotImplementedError( + "The da.nanmedian function only works along an axis. " + "The full algorithm is difficult to do in parallel" + ) + + if not isinstance(axis, Iterable): + axis = (axis,) + + axis = [ax + a.ndim if ax < 0 else ax for ax in axis] + + a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)}) + + result = a.map_blocks( + np.nanmedian, + axis=axis, + keepdims=keepdims, + drop_axis=axis if not keepdims else None, + chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)] + if keepdims + else None, + ) + + return result diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index cf616acb485..98b371ab7c3 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd -from . import dask_array_ops, dtypes, npcompat, nputils +from . import dask_array_ops, dask_array_compat, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast from .pycompat import dask_array_type @@ -284,7 +284,7 @@ def _ignore_warnings_if(condition): yield -def _create_nan_agg_method(name, coerce_strings=False): +def _create_nan_agg_method(name, dask_module=dask_array, coerce_strings=False): from . import nanops def f(values, axis=None, skipna=None, **kwargs): @@ -301,7 +301,7 @@ def f(values, axis=None, skipna=None, **kwargs): nanname = "nan" + name func = getattr(nanops, nanname) else: - func = _dask_or_eager_func(name) + func = _dask_or_eager_func(name, dask_module=dask_module) try: return func(values, axis=axis, **kwargs) @@ -337,7 +337,7 @@ def f(values, axis=None, skipna=None, **kwargs): std.numeric_only = True var = _create_nan_agg_method("var") var.numeric_only = True -median = _create_nan_agg_method("median") +median = _create_nan_agg_method("median", dask_module=dask_array_compat) median.numeric_only = True prod = _create_nan_agg_method("prod") prod.numeric_only = True diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index f70e96217e8..f9989c2c8c9 100644 --- a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -6,8 +6,10 @@ try: import dask.array as dask_array + from . 
 except ImportError:
     dask_array = None
+    dask_array_compat = None  # type: ignore
 
 
 def _replace_nan(a, val):
@@ -141,7 +143,15 @@ def nanmean(a, axis=None, dtype=None, out=None):
 
 
 def nanmedian(a, axis=None, out=None):
-    return _dask_or_eager_func("nanmedian", eager_module=nputils)(a, axis=axis)
+    # The dask algorithm works by rechunking to one chunk along axis.
+    # Make sure we trigger the dask error when passing all dimensions
+    # so that we don't rechunk the entire array to one chunk and
+    # possibly blow up memory.
+    if axis is not None and len(np.atleast_1d(axis)) == a.ndim:
+        axis = None
+    return _dask_or_eager_func(
+        "nanmedian", dask_module=dask_array_compat, eager_module=nputils
+    )(a, axis=axis)
 
 
 def _nanvar_object(value, axis=None, ddof=0, keepdims=False, **kwargs):
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index cb00c9dcfe0..0a9d0767b77 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1625,8 +1625,9 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
         if not shortcut:
             for var in variables:
                 if var.dims != first_var.dims:
-                    raise ValueError("inconsistent dimensions")
-                utils.remove_incompatible_items(attrs, var.attrs)
+                    raise ValueError(
+                        f"Variable has dimensions {list(var.dims)} but first Variable has dimensions {list(first_var.dims)}"
+                    )
 
         return cls(dims, data, attrs, encoding)
diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index a29fe0190cf..d907e1c5e46 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -711,6 +711,22 @@ def test_check_for_impossible_ordering(self):
         ):
             combine_by_coords([ds1, ds0])
 
+    def test_combine_by_coords_incomplete_hypercube(self):
+        # test that this succeeds with default fill_value
+        x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
+        x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
+        x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})
+        actual = combine_by_coords([x1, x2, x3])
+        expected = Dataset(
+            {"a": (("y", "x"), [[1, 1], [1, np.nan]])},
+            coords={"y": [0, 1], "x": [0, 1]},
+        )
+        assert_identical(expected, actual)
+
+        # test that this fails if fill_value is None
+        with pytest.raises(ValueError):
+            combine_by_coords([x1, x2, x3], fill_value=None)
+
 
 @pytest.mark.filterwarnings(
     "ignore:In xarray version 0.15 `auto_combine` "
diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py
index 0661ebb7a38..def5abc942f 100644
--- a/xarray/tests/test_concat.py
+++ b/xarray/tests/test_concat.py
@@ -462,3 +462,16 @@ def test_concat_join_kwarg(self):
         for join in expected:
             actual = concat([ds1, ds2], join=join, dim="x")
             assert_equal(actual, expected[join].to_array())
+
+
+@pytest.mark.parametrize("attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {}))
+@pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {}))
+def test_concat_attrs_first_variable(attr1, attr2):
+
+    arrs = [
+        DataArray([[1], [2]], dims=["x", "y"], attrs=attr1),
+        DataArray([[3], [4]], dims=["x", "y"], attrs=attr2),
+    ]
+
+    concat_attrs = concat(arrs, "y").attrs
+    assert concat_attrs == attr1
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index 6122e987154..d0e2654eed3 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -216,8 +216,10 @@ def test_reduce(self):
         self.assertLazyAndAllClose(u.argmin(dim="x"), actual)
         self.assertLazyAndAllClose((u > 1).any(), (v > 1).any())
         self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x"))
(v < 1).all("x")) - with raises_regex(NotImplementedError, "dask"): + with raises_regex(NotImplementedError, "only works along an axis"): v.median() + with raises_regex(NotImplementedError, "only works along an axis"): + v.median(v.dims) with raise_if_dask_computes(): v.reduce(duck_array_ops.mean) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 49a6906d5be..62fde920b1e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -432,7 +432,7 @@ def test_concat(self): assert_identical( Variable(["b", "a"], np.array([x, y])), Variable.concat((v, w), "b") ) - with raises_regex(ValueError, "inconsistent dimensions"): + with raises_regex(ValueError, "Variable has dimensions"): Variable.concat([v, Variable(["c"], y)], "b") # test indexers actual = Variable.concat( @@ -451,16 +451,12 @@ def test_concat(self): Variable.concat([v[:, 0], v[:, 1:]], "x") def test_concat_attrs(self): - # different or conflicting attributes should be removed + # always keep attrs from first variable v = self.cls("a", np.arange(5), {"foo": "bar"}) w = self.cls("a", np.ones(5)) expected = self.cls( "a", np.concatenate([np.arange(5), np.ones(5)]) ).to_base_variable() - assert_identical(expected, Variable.concat([v, w], "a")) - w.attrs["foo"] = 2 - assert_identical(expected, Variable.concat([v, w], "a")) - w.attrs["foo"] = "bar" expected.attrs["foo"] = "bar" assert_identical(expected, Variable.concat([v, w], "a"))