diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 27709a09e7a..b88d10ffc23 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,9 @@ v0.12.2 (unreleased) Enhancements ~~~~~~~~~~~~ +- Add ``fill_value`` argument for reindex, align, and merge operations + to enable custom fill values. (:issue:`2876`) + By `Zach Griffith <https://github.com/zdgriffith>`_. - Character arrays' character dimension name decoding and encoding handled by ``var.encoding['char_dim_name']`` (:issue:`2895`) By `James McCreight <https://github.com/jmccreight>`_. diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 642be735e9b..295f69a2afc 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from . import utils +from . import utils, dtypes from .indexing import get_indexer_nd from .utils import is_dict_like, is_full_slice from .variable import IndexVariable, Variable @@ -31,20 +31,17 @@ def _get_joiner(join): raise ValueError('invalid value for join: %s' % join) -_DEFAULT_EXCLUDE = frozenset() # type: frozenset - - -def align(*objects, **kwargs): - """align(*objects, join='inner', copy=True, indexes=None, - exclude=frozenset()) - +def align(*objects, join='inner', copy=True, indexes=None, exclude=frozenset(), + fill_value=dtypes.NA): + """ Given any number of Dataset and/or DataArray objects, returns new objects with aligned indexes and dimension sizes. Array from the aligned objects are suitable as input to mathematical operators, because along each dimension they have the same index and size. - Missing values (if ``join != 'inner'``) are filled with NaN. + Missing values (if ``join != 'inner'``) are filled with ``fill_value``. + The default fill value is NaN. Parameters ---------- @@ -65,11 +62,13 @@ def align(*objects, **kwargs): ``copy=False`` and reindexing is unnecessary, or can be performed with only slice operations, then the output may share memory with the input. In either case, new xarray objects are always returned.
- exclude : sequence of str, optional - Dimensions that must be excluded from alignment indexes : dict-like, optional Any indexes explicitly provided with the `indexes` argument should be used in preference to the aligned indexes. + exclude : sequence of str, optional + Dimensions that must be excluded from alignment + fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -82,15 +81,8 @@ def align(*objects, **kwargs): If any dimensions without labels on the arguments have different sizes, or a different size than the size of the aligned dimension labels. """ - join = kwargs.pop('join', 'inner') - copy = kwargs.pop('copy', True) - indexes = kwargs.pop('indexes', None) - exclude = kwargs.pop('exclude', _DEFAULT_EXCLUDE) if indexes is None: indexes = {} - if kwargs: - raise TypeError('align() got unexpected keyword arguments: %s' - % list(kwargs)) if not indexes and len(objects) == 1: # fast path for the trivial case @@ -162,7 +154,8 @@ def align(*objects, **kwargs): # fast path for no reindexing necessary new_obj = obj.copy(deep=copy) else: - new_obj = obj.reindex(copy=copy, **valid_indexers) + new_obj = obj.reindex(copy=copy, fill_value=fill_value, + **valid_indexers) new_obj.encoding = obj.encoding result.append(new_obj) @@ -170,7 +163,8 @@ def align(*objects, **kwargs): def deep_align(objects, join='inner', copy=True, indexes=None, - exclude=frozenset(), raise_on_invalid=True): + exclude=frozenset(), raise_on_invalid=True, + fill_value=dtypes.NA): """Align objects for merging, recursing into dictionary values. This function is not public API. 
@@ -214,7 +208,7 @@ def is_alignable(obj): out.append(variables) aligned = align(*targets, join=join, copy=copy, indexes=indexes, - exclude=exclude) + exclude=exclude, fill_value=fill_value) for position, key, aligned_obj in zip(positions, keys, aligned): if key is no_key: @@ -270,6 +264,7 @@ def reindex_variables( method: Optional[str] = None, tolerance: Any = None, copy: bool = True, + fill_value: Optional[Any] = dtypes.NA, ) -> 'Tuple[OrderedDict[Any, Variable], OrderedDict[Any, pd.Index]]': """Conform a dictionary of aligned variables onto a new set of variables, filling in missing values with NaN. @@ -305,6 +300,8 @@ def reindex_variables( ``copy=False`` and reindexing is unnecessary, or can be performed with only slice operations, then the output may share memory with the input. In either case, new xarray objects are always returned. + fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -380,7 +377,7 @@ def reindex_variables( needs_masking = any(d in masked_dims for d in var.dims) if needs_masking: - new_var = var._getitem_with_mask(key) + new_var = var._getitem_with_mask(key, fill_value=fill_value) elif all(is_full_slice(k) for k in key): # no reindexing necessary # here we need to manually deal with copying data, since diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 39e9fc048e3..15e2e00dc21 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -879,9 +879,10 @@ def sel_points(self, dim='points', method=None, tolerance=None, dim=dim, method=method, tolerance=tolerance, **indexers) return self._from_temp_dataset(ds) - def reindex_like(self, other, method=None, tolerance=None, copy=True): - """Conform this object onto the indexes of another object, filling - in missing values with NaN. 
+ def reindex_like(self, other, method=None, tolerance=None, copy=True, + fill_value=dtypes.NA): + """Conform this object onto the indexes of another object, filling in + missing values with ``fill_value``. The default fill value is NaN. Parameters ---------- @@ -910,6 +911,8 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): ``copy=False`` and reindexing is unnecessary, or can be performed with only slice operations, then the output may share memory with the input. In either case, a new xarray object is always returned. + fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -924,12 +927,12 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): """ indexers = reindex_like_indexers(self, other) return self.reindex(method=method, tolerance=tolerance, copy=copy, - **indexers) + fill_value=fill_value, **indexers) def reindex(self, indexers=None, method=None, tolerance=None, copy=True, - **indexers_kwargs): - """Conform this object onto a new set of indexes, filling in - missing values with NaN. + fill_value=dtypes.NA, **indexers_kwargs): + """Conform this object onto a new set of indexes, filling in + missing values with ``fill_value``. The default fill value is NaN. Parameters ---------- @@ -956,6 +959,8 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + fill_value : scalar, optional + Value to use for newly missing values **indexers_kwarg : {dim: indexer, ...}, optional The keyword arguments form of ``indexers``. One of indexers or indexers_kwargs must be provided.
@@ -974,7 +979,8 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, indexers = either_dict_or_kwargs( indexers, indexers_kwargs, 'reindex') ds = self._to_temp_dataset().reindex( - indexers=indexers, method=method, tolerance=tolerance, copy=copy) + indexers=indexers, method=method, tolerance=tolerance, copy=copy, + fill_value=fill_value) return self._from_temp_dataset(ds) def interp(self, coords=None, method='linear', assume_sorted=False, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 0f9f68d3106..79a42b303c2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1932,9 +1932,10 @@ def sel_points(self, dim='points', method=None, tolerance=None, ) return self.isel_points(dim=dim, **pos_indexers) - def reindex_like(self, other, method=None, tolerance=None, copy=True): - """Conform this object onto the indexes of another object, filling - in missing values with NaN. + def reindex_like(self, other, method=None, tolerance=None, copy=True, + fill_value=dtypes.NA): + """Conform this object onto the indexes of another object, filling in + missing values with ``fill_value``. The default fill value is NaN. Parameters ---------- @@ -1963,6 +1964,8 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): ``copy=False`` and reindexing is unnecessary, or can be performed with only slice operations, then the output may share memory with the input. In either case, a new xarray object is always returned. 
+ fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -1977,12 +1980,12 @@ def reindex_like(self, other, method=None, tolerance=None, copy=True): """ indexers = alignment.reindex_like_indexers(self, other) return self.reindex(indexers=indexers, method=method, copy=copy, - tolerance=tolerance) + fill_value=fill_value, tolerance=tolerance) def reindex(self, indexers=None, method=None, tolerance=None, copy=True, - **indexers_kwargs): + fill_value=dtypes.NA, **indexers_kwargs): """Conform this object onto a new set of indexes, filling in - missing values with NaN. + missing values with ``fill_value``. The default fill value is NaN. Parameters ---------- @@ -2010,6 +2013,8 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, ``copy=False`` and reindexing is unnecessary, or can be performed with only slice operations, then the output may share memory with the input. In either case, a new xarray object is always returned. + fill_value : scalar, optional + Value to use for newly missing values **indexers_kwarg : {dim: indexer, ...}, optional Keyword arguments in the same form as ``indexers``. One of indexers or indexers_kwargs must be provided. @@ -2034,7 +2039,7 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, variables, indexes = alignment.reindex_variables( self.variables, self.sizes, self.indexes, indexers, method, - tolerance, copy=copy) + tolerance, copy=copy, fill_value=fill_value) coord_names = set(self._coord_names) coord_names.update(indexers) return self._replace_with_new_dims( @@ -2752,7 +2757,7 @@ def update(self, other, inplace=None): inplace=inplace) def merge(self, other, inplace=None, overwrite_vars=frozenset(), - compat='no_conflicts', join='outer'): + compat='no_conflicts', join='outer', fill_value=dtypes.NA): """Merge the arrays of two datasets into a single dataset. 
This method generally not allow for overriding data, with the exception @@ -2790,6 +2795,8 @@ def merge(self, other, inplace=None, overwrite_vars=frozenset(), - 'left': use indexes from ``self`` - 'right': use indexes from ``other`` - 'exact': error instead of aligning non-equal indexes + fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -2804,7 +2811,7 @@ def merge(self, other, inplace=None, overwrite_vars=frozenset(), inplace = _check_inplace(inplace) variables, coord_names, dims = dataset_merge_method( self, other, overwrite_vars=overwrite_vars, compat=compat, - join=join) + join=join, fill_value=fill_value) return self._replace_vars_and_dims(variables, coord_names, dims, inplace=inplace) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 363fdfc2337..421ac39ebd8 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -4,6 +4,7 @@ import pandas as pd +from . import dtypes from .alignment import deep_align from .pycompat import TYPE_CHECKING from .utils import Frozen @@ -349,7 +350,7 @@ def expand_and_merge_variables(objs, priority_arg=None): def merge_coords(objs, compat='minimal', join='outer', priority_arg=None, - indexes=None): + indexes=None, fill_value=dtypes.NA): """Merge coordinate variables. See merge_core below for argument descriptions.
This works similarly to @@ -358,7 +359,8 @@ def merge_coords(objs, compat='minimal', join='outer', priority_arg=None, """ _assert_compat_valid(compat) coerced = coerce_pandas_values(objs) - aligned = deep_align(coerced, join=join, copy=False, indexes=indexes) + aligned = deep_align(coerced, join=join, copy=False, indexes=indexes, + fill_value=fill_value) expanded = expand_variable_dicts(aligned) priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat) variables = merge_variables(expanded, priority_vars, compat=compat) @@ -404,7 +406,8 @@ def merge_core(objs, join='outer', priority_arg=None, explicit_coords=None, - indexes=None): + indexes=None, + fill_value=dtypes.NA): """Core logic for merging labeled objects. This is not public API. @@ -423,6 +426,8 @@ def merge_core(objs, An explicit list of variables from `objs` that are coordinates. indexes : dict, optional Dictionary with values given by pandas.Index objects. + fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -442,7 +447,8 @@ def merge_core(objs, _assert_compat_valid(compat) coerced = coerce_pandas_values(objs) - aligned = deep_align(coerced, join=join, copy=False, indexes=indexes) + aligned = deep_align(coerced, join=join, copy=False, indexes=indexes, + fill_value=fill_value) expanded = expand_variable_dicts(aligned) coord_names, noncoord_names = determine_coords(coerced) @@ -470,7 +476,7 @@ def merge_core(objs, return variables, coord_names, dict(dims) -def merge(objects, compat='no_conflicts', join='outer'): +def merge(objects, compat='no_conflicts', join='outer', fill_value=dtypes.NA): """Merge any number of xarray objects into a single Dataset as variables. Parameters @@ -492,6 +498,8 @@ def merge(objects, compat='no_conflicts', join='outer'): of all non-null values. join : {'outer', 'inner', 'left', 'right', 'exact'}, optional How to combine objects with different indexes. 
+ fill_value : scalar, optional + Value to use for newly missing values Returns ------- @@ -529,7 +537,8 @@ def merge(objects, compat='no_conflicts', join='outer'): obj.to_dataset() if isinstance(obj, DataArray) else obj for obj in objects] - variables, coord_names, dims = merge_core(dict_like_objects, compat, join) + variables, coord_names, dims = merge_core(dict_like_objects, compat, join, + fill_value=fill_value) # TODO: don't always recompute indexes merged = Dataset._construct_direct( variables, coord_names, dims, indexes=None) @@ -537,7 +546,8 @@ def merge(objects, compat='no_conflicts', join='outer'): return merged -def dataset_merge_method(dataset, other, overwrite_vars, compat, join): +def dataset_merge_method(dataset, other, overwrite_vars, compat, join, + fill_value=dtypes.NA): """Guts of the Dataset.merge method.""" # we are locked into supporting overwrite_vars for the Dataset.merge @@ -565,7 +575,8 @@ def dataset_merge_method(dataset, other, overwrite_vars, compat, join): objs = [dataset, other_no_overwrite, other_overwrite] priority_arg = 2 - return merge_core(objs, compat, join, priority_arg=priority_arg) + return merge_core(objs, compat, join, priority_arg=priority_arg, + fill_value=fill_value) def dataset_update_method(dataset, other): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index a8655bbbf8c..ab6a5eb3626 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1259,6 +1259,18 @@ def test_reindex_like_no_index(self): ValueError, 'different size for unlabeled'): foo.reindex_like(bar) + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_reindex_like_fill_value(self, fill_value): + foo = DataArray([10, 20], dims='y', coords={'y': [0, 1]}) + bar = DataArray([10, 20, 30], dims='y', coords={'y': [0, 1, 2]}) + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + actual = foo.reindex_like(bar,
fill_value=fill_value) + expected = DataArray([10, 20, fill_value], coords=[('y', [0, 1, 2])]) + assert_identical(expected, actual) + @pytest.mark.filterwarnings('ignore:Indexer has dimensions') def test_reindex_regressions(self): # regression test for #279 @@ -1286,6 +1298,18 @@ def test_reindex_method(self): expected = DataArray([10, 20, np.nan], coords=[('y', y)]) assert_identical(expected, actual) + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_reindex_fill_value(self, fill_value): + x = DataArray([10, 20], dims='y', coords={'y': [0, 1]}) + y = [0, 1, 2] + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + actual = x.reindex(y=y, fill_value=fill_value) + expected = DataArray([10, 20, fill_value], coords=[('y', y)]) + assert_identical(expected, actual) + def test_rename(self): renamed = self.dv.rename('bar') assert_identical( diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 3ace80f5eea..207bb5d992d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1619,6 +1619,54 @@ def test_reindex_method(self): actual = ds.reindex_like(alt, method='pad') assert_identical(expected, actual) + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_reindex_fill_value(self, fill_value): + ds = Dataset({'x': ('y', [10, 20]), 'y': [0, 1]}) + y = [0, 1, 2] + actual = ds.reindex(y=y, fill_value=fill_value) + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = Dataset({'x': ('y', [10, 20, fill_value]), 'y': y}) + assert_identical(expected, actual) + + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_reindex_like_fill_value(self, fill_value): + ds = Dataset({'x': ('y', [10, 20]), 'y': [0, 1]}) + y = [0, 1, 2] + alt = Dataset({'y': y}) + actual = ds.reindex_like(alt, fill_value=fill_value) + if 
fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = Dataset({'x': ('y', [10, 20, fill_value]), 'y': y}) + assert_identical(expected, actual) + + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_align_fill_value(self, fill_value): + x = Dataset({'foo': DataArray([1, 2], dims=['x'], + coords={'x': [1, 2]})}) + y = Dataset({'bar': DataArray([1, 2], dims=['x'], + coords={'x': [1, 3]})}) + x2, y2 = align(x, y, join='outer', fill_value=fill_value) + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + + expected_x2 = Dataset( + {'foo': DataArray([1, 2, fill_value], + dims=['x'], + coords={'x': [1, 2, 3]})}) + expected_y2 = Dataset( + {'bar': DataArray([1, fill_value, 2], + dims=['x'], + coords={'x': [1, 2, 3]})}) + assert_identical(expected_x2, x2) + assert_identical(expected_y2, y2) + def test_align(self): left = create_test_data() right = left.copy(deep=True) diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index 4f26d616ce7..9c043f4dcfb 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xarray.core import merge +from xarray.core import merge, dtypes from . 
import raises_regex from .test_dataset import create_test_data @@ -213,6 +213,21 @@ def test_merge_auto_align(self): assert expected.identical(ds1.merge(ds2, join='inner')) assert expected.identical(ds2.merge(ds1, join='inner')) + @pytest.mark.parametrize('fill_value', [dtypes.NA, 2, 2.0]) + def test_merge_fill_value(self, fill_value): + ds1 = xr.Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) + ds2 = xr.Dataset({'b': ('x', [3, 4]), 'x': [1, 2]}) + if fill_value == dtypes.NA: + # if we supply the default, we expect the missing value for a + # float array + fill_value = np.nan + expected = xr.Dataset({'a': ('x', [1, 2, fill_value]), + 'b': ('x', [fill_value, 3, 4])}, + {'x': [0, 1, 2]}) + assert expected.identical(ds1.merge(ds2, fill_value=fill_value)) + assert expected.identical(ds2.merge(ds1, fill_value=fill_value)) + assert expected.identical(xr.merge([ds1, ds2], fill_value=fill_value)) + def test_merge_no_conflicts(self): ds1 = xr.Dataset({'a': ('x', [1, 2]), 'x': [0, 1]}) ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})