Skip to content
forked from pydata/xarray

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into groupby-plot
Browse files Browse the repository at this point in the history
* upstream/master:
  Add nanmedian for dask arrays (pydata#3604)
  added pyinterp to related projects (pydata#3655)
  Allow incomplete hypercubes in combine_by_coords (pydata#3649)
  concat keeps attrs from first variable. (pydata#3637)
  • Loading branch information
dcherian committed Dec 30, 2019
2 parents 97f3323 + b3d3b44 commit e018ea8
Show file tree
Hide file tree
Showing 12 changed files with 176 additions and 25 deletions.
1 change: 1 addition & 0 deletions doc/related-projects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Geosciences
- `PyGDX <https://pygdx.readthedocs.io/en/latest/>`_: Python 3 package for
accessing data stored in GAMS Data eXchange (GDX) files. Also uses a custom
subclass.
- `pyinterp <https://pangeo-pyinterp.readthedocs.io/en/latest/>`_: Python 3 package for interpolating geo-referenced data used in the field of geosciences.
- `pyXpcm <https://pyxpcm.readthedocs.io>`_: xarray-based Profile Classification Modelling (PCM), mostly for ocean data.
- `Regionmask <https://regionmask.readthedocs.io/>`_: plotting and creation of masks of spatial regions
- `rioxarray <https://corteva.github.io/rioxarray>`_: geospatial xarray extension powered by rasterio
Expand Down
9 changes: 9 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ Breaking changes

New Features
~~~~~~~~~~~~
- Implement :py:func:`median` and :py:func:`nanmedian` for dask arrays. This works by rechunking
to a single chunk along all reduction axes. (:issue:`2999`).
By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:func:`xarray.concat` now preserves attributes from the first Variable.
(:issue:`2575`, :issue:`2060`, :issue:`1614`)
By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`Dataset.quantile`, :py:meth:`DataArray.quantile` and ``GroupBy.quantile``
now work with dask Variables.
By `Deepak Cherian <https://github.com/dcherian>`_.
Expand All @@ -38,6 +44,9 @@ New Features

Bug fixes
~~~~~~~~~
- Fix :py:meth:`xarray.combine_by_coords` to allow for combining incomplete
hypercubes of Datasets (:issue:`3648`). By `Ian Bolliger
<https://github.com/bolliger32>`_.
- Fix :py:meth:`xarray.combine_by_coords` when combining cftime coordinates
which span long time intervals (:issue:`3535`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
Expand Down
36 changes: 29 additions & 7 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,12 @@ def _infer_concat_order_from_coords(datasets):
return combined_ids, concat_dims


def _check_shape_tile_ids(combined_tile_ids):
def _check_dimension_depth_tile_ids(combined_tile_ids):
"""
Check all tuples are the same length, i.e. check that all lists are
nested to the same depth.
"""
tile_ids = combined_tile_ids.keys()

# Check all tuples are the same length
# i.e. check that all lists are nested to the same depth
nesting_depths = [len(tile_id) for tile_id in tile_ids]
if not nesting_depths:
nesting_depths = [0]
Expand All @@ -128,8 +129,13 @@ def _check_shape_tile_ids(combined_tile_ids):
"The supplied objects do not form a hypercube because"
" sub-lists do not have consistent depths"
)
# return these just to be reused in _check_shape_tile_ids
return tile_ids, nesting_depths

# Check all lists along one dimension are same length

def _check_shape_tile_ids(combined_tile_ids):
"""Check all lists along one dimension are same length."""
tile_ids, nesting_depths = _check_dimension_depth_tile_ids(combined_tile_ids)
for dim in range(nesting_depths[0]):
indices_along_dim = [tile_id[dim] for tile_id in tile_ids]
occurrences = Counter(indices_along_dim)
Expand Down Expand Up @@ -536,7 +542,8 @@ def combine_by_coords(
coords : {'minimal', 'different', 'all' or list of str}, optional
As per the 'data_vars' kwarg, but for coordinate variables.
fill_value : scalar, optional
Value to use for newly missing values
Value to use for newly missing values. If None, raises a ValueError if
the passed Datasets do not create a complete hypercube.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding concat_dim) in objects
Expand Down Expand Up @@ -653,6 +660,15 @@ def combine_by_coords(
temperature (y, x) float64 1.654 10.63 7.015 2.543 ... 12.46 2.22 15.96
precipitation (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
>>> xr.combine_by_coords([x1, x2, x3])
<xarray.Dataset>
Dimensions: (x: 6, y: 4)
Coordinates:
* x (x) int64 10 20 30 40 50 60
* y (y) int64 0 1 2 3
Data variables:
temperature (y, x) float64 1.654 10.63 7.015 nan ... 12.46 2.22 15.96
precipitation (y, x) float64 0.2136 0.9974 0.7603 ... 0.6125 0.4654 0.5953
"""

# Group by data vars
Expand All @@ -667,7 +683,13 @@ def combine_by_coords(
list(datasets_with_same_vars)
)

_check_shape_tile_ids(combined_ids)
if fill_value is None:
# check that datasets form complete hypercube
_check_shape_tile_ids(combined_ids)
else:
# check only that all datasets have same dimension depth for these
# vars
_check_dimension_depth_tile_ids(combined_ids)

# Concatenate along all of concat_dims one by one to create single ds
concatenated = _combine_nd(
Expand Down
6 changes: 4 additions & 2 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,14 @@ def concat(
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
indexers, mode, concat_over : deprecated
Returns
-------
concatenated : type of objs
Notes
-----
Each concatenated Variable preserves corresponding ``attrs`` from the first element of ``objs``.
See also
--------
merge
Expand Down
83 changes: 81 additions & 2 deletions xarray/core/dask_array_compat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from distutils.version import LooseVersion
from typing import Iterable

import dask.array as da
import numpy as np
from dask import __version__ as dask_version

try:
import dask.array as da
from dask import __version__ as dask_version
except ImportError:
dask_version = "0.0.0"
da = None

if LooseVersion(dask_version) >= LooseVersion("2.0.0"):
meta_from_array = da.utils.meta_from_array
Expand Down Expand Up @@ -89,3 +95,76 @@ def meta_from_array(x, ndim=None, dtype=None):
meta = meta.astype(dtype)

return meta


if LooseVersion(dask_version) >= LooseVersion("2.8.1"):
median = da.median
else:
# Copied from dask v2.8.1
# Used under the terms of Dask's license, see licenses/DASK_LICENSE.
def median(a, axis=None, keepdims=False):
"""
This works by automatically chunking the reduced axes to a single chunk
and then calling ``numpy.median`` function across the remaining dimensions
"""

if axis is None:
raise NotImplementedError(
"The da.median function only works along an axis. "
"The full algorithm is difficult to do in parallel"
)

if not isinstance(axis, Iterable):
axis = (axis,)

axis = [ax + a.ndim if ax < 0 else ax for ax in axis]

a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)})

result = a.map_blocks(
np.median,
axis=axis,
keepdims=keepdims,
drop_axis=axis if not keepdims else None,
chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)]
if keepdims
else None,
)

return result


if LooseVersion(dask_version) > LooseVersion("2.9.0"):
nanmedian = da.nanmedian
else:

def nanmedian(a, axis=None, keepdims=False):
"""
This works by automatically chunking the reduced axes to a single chunk
and then calling ``numpy.nanmedian`` function across the remaining dimensions
"""

if axis is None:
raise NotImplementedError(
"The da.nanmedian function only works along an axis. "
"The full algorithm is difficult to do in parallel"
)

if not isinstance(axis, Iterable):
axis = (axis,)

axis = [ax + a.ndim if ax < 0 else ax for ax in axis]

a = a.rechunk({ax: -1 if ax in axis else "auto" for ax in range(a.ndim)})

result = a.map_blocks(
np.nanmedian,
axis=axis,
keepdims=keepdims,
drop_axis=axis if not keepdims else None,
chunks=[1 if ax in axis else c for ax, c in enumerate(a.chunks)]
if keepdims
else None,
)

return result
8 changes: 4 additions & 4 deletions xarray/core/duck_array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import numpy as np
import pandas as pd

from . import dask_array_ops, dtypes, npcompat, nputils
from . import dask_array_ops, dask_array_compat, dtypes, npcompat, nputils
from .nputils import nanfirst, nanlast
from .pycompat import dask_array_type

Expand Down Expand Up @@ -284,7 +284,7 @@ def _ignore_warnings_if(condition):
yield


def _create_nan_agg_method(name, coerce_strings=False):
def _create_nan_agg_method(name, dask_module=dask_array, coerce_strings=False):
from . import nanops

def f(values, axis=None, skipna=None, **kwargs):
Expand All @@ -301,7 +301,7 @@ def f(values, axis=None, skipna=None, **kwargs):
nanname = "nan" + name
func = getattr(nanops, nanname)
else:
func = _dask_or_eager_func(name)
func = _dask_or_eager_func(name, dask_module=dask_module)

try:
return func(values, axis=axis, **kwargs)
Expand Down Expand Up @@ -337,7 +337,7 @@ def f(values, axis=None, skipna=None, **kwargs):
std.numeric_only = True
var = _create_nan_agg_method("var")
var.numeric_only = True
median = _create_nan_agg_method("median")
median = _create_nan_agg_method("median", dask_module=dask_array_compat)
median.numeric_only = True
prod = _create_nan_agg_method("prod")
prod.numeric_only = True
Expand Down
12 changes: 11 additions & 1 deletion xarray/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@

try:
import dask.array as dask_array
from . import dask_array_compat
except ImportError:
dask_array = None
dask_array_compat = None # type: ignore


def _replace_nan(a, val):
Expand Down Expand Up @@ -141,7 +143,15 @@ def nanmean(a, axis=None, dtype=None, out=None):


def nanmedian(a, axis=None, out=None):
return _dask_or_eager_func("nanmedian", eager_module=nputils)(a, axis=axis)
# The dask algorithm works by rechunking to one chunk along axis
# Make sure we trigger the dask error when passing all dimensions
# so that we don't rechunk the entire array to one chunk and
# possibly blow memory
if axis is not None and len(np.atleast_1d(axis)) == a.ndim:
axis = None
return _dask_or_eager_func(
"nanmedian", dask_module=dask_array_compat, eager_module=nputils
)(a, axis=axis)


def _nanvar_object(value, axis=None, ddof=0, keepdims=False, **kwargs):
Expand Down
5 changes: 3 additions & 2 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1625,8 +1625,9 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
if not shortcut:
for var in variables:
if var.dims != first_var.dims:
raise ValueError("inconsistent dimensions")
utils.remove_incompatible_items(attrs, var.attrs)
raise ValueError(
f"Variable has dimensions {list(var.dims)} but first Variable has dimensions {list(first_var.dims)}"
)

return cls(dims, data, attrs, encoding)

Expand Down
16 changes: 16 additions & 0 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,22 @@ def test_check_for_impossible_ordering(self):
):
combine_by_coords([ds1, ds0])

def test_combine_by_coords_incomplete_hypercube(self):
# test that this succeeds with default fill_value
x1 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [0]})
x2 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [1], "x": [0]})
x3 = Dataset({"a": (("y", "x"), [[1]])}, coords={"y": [0], "x": [1]})
actual = combine_by_coords([x1, x2, x3])
expected = Dataset(
{"a": (("y", "x"), [[1, 1], [1, np.nan]])},
coords={"y": [0, 1], "x": [0, 1]},
)
assert_identical(expected, actual)

# test that this fails if fill_value is None
with pytest.raises(ValueError):
combine_by_coords([x1, x2, x3], fill_value=None)


@pytest.mark.filterwarnings(
"ignore:In xarray version 0.15 `auto_combine` " "will be deprecated"
Expand Down
13 changes: 13 additions & 0 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,16 @@ def test_concat_join_kwarg(self):
for join in expected:
actual = concat([ds1, ds2], join=join, dim="x")
assert_equal(actual, expected[join].to_array())


@pytest.mark.parametrize("attr1", ({"a": {"meta": [10, 20, 30]}}, {"a": [1, 2, 3]}, {}))
@pytest.mark.parametrize("attr2", ({"a": [1, 2, 3]}, {}))
def test_concat_attrs_first_variable(attr1, attr2):

arrs = [
DataArray([[1], [2]], dims=["x", "y"], attrs=attr1),
DataArray([[3], [4]], dims=["x", "y"], attrs=attr2),
]

concat_attrs = concat(arrs, "y").attrs
assert concat_attrs == attr1
4 changes: 3 additions & 1 deletion xarray/tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,10 @@ def test_reduce(self):
self.assertLazyAndAllClose(u.argmin(dim="x"), actual)
self.assertLazyAndAllClose((u > 1).any(), (v > 1).any())
self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x"))
with raises_regex(NotImplementedError, "dask"):
with raises_regex(NotImplementedError, "only works along an axis"):
v.median()
with raises_regex(NotImplementedError, "only works along an axis"):
v.median(v.dims)
with raise_if_dask_computes():
v.reduce(duck_array_ops.mean)

Expand Down
8 changes: 2 additions & 6 deletions xarray/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def test_concat(self):
assert_identical(
Variable(["b", "a"], np.array([x, y])), Variable.concat((v, w), "b")
)
with raises_regex(ValueError, "inconsistent dimensions"):
with raises_regex(ValueError, "Variable has dimensions"):
Variable.concat([v, Variable(["c"], y)], "b")
# test indexers
actual = Variable.concat(
Expand All @@ -451,16 +451,12 @@ def test_concat(self):
Variable.concat([v[:, 0], v[:, 1:]], "x")

def test_concat_attrs(self):
# different or conflicting attributes should be removed
# always keep attrs from first variable
v = self.cls("a", np.arange(5), {"foo": "bar"})
w = self.cls("a", np.ones(5))
expected = self.cls(
"a", np.concatenate([np.arange(5), np.ones(5)])
).to_base_variable()
assert_identical(expected, Variable.concat([v, w], "a"))
w.attrs["foo"] = 2
assert_identical(expected, Variable.concat([v, w], "a"))
w.attrs["foo"] = "bar"
expected.attrs["foo"] = "bar"
assert_identical(expected, Variable.concat([v, w], "a"))

Expand Down

0 comments on commit e018ea8

Please sign in to comment.