Skip to content

Commit

Permalink
add 'no_conflicts' as compat option for merging non-conflicting data (#…
Browse files Browse the repository at this point in the history
…996)

* Add notnull_equals method to Variable

* add more tests for Variable.notnull_equals

* add DataArray.notnull_equals

* more tests for DataArray.notnull_equals

* add Dataset.notnull_equals

* remove redundant notnull_equivalent from utils

* fix introduced flake8 errors

* add 'notnull_equals' compat option to merge function and method

* add wrong shape test to Variable.notnull_equals

* refactor isnull

* remove Dataset/Array `notnull_equals` methods

* rename 'notnull_equals' --> 'no_conflicts'

* update merge docstrings to include 'no_conflicts'

* add docs on 'no_conflicts'

* whats new entry

* update dataset.merge docs

and fix Raises ValueError -> MergeError

* Update combining.rst
Update combining.rst

* DOC: We don't use None for missing values
  • Loading branch information
jcmgray authored and shoyer committed Sep 15, 2016
1 parent 41654ef commit f40d323
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 23 deletions.
18 changes: 18 additions & 0 deletions doc/combining.rst
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,21 @@ numpy):
Note that ``NaN`` does not compare equal to ``NaN`` in element-wise comparison;
you may need to deal with missing values explicitly.

Merging with 'no_conflicts'
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``compat`` argument ``'no_conflicts'`` is only available when
combining xarray objects with ``merge``. In addition to the above comparison
methods, it allows the merging of xarray objects in which, at any given
location, *either* object may have ``NaN`` values. This can be used to combine
data with overlapping coordinates as long as any non-missing values agree or
are disjoint:

.. ipython:: python

    ds1 = xr.Dataset({'a': ('x', [10, 20, 30, np.nan])}, {'x': [1, 2, 3, 4]})
    ds2 = xr.Dataset({'a': ('x', [np.nan, 30, 40, 50])}, {'x': [2, 3, 4, 5]})
    xr.merge([ds1, ds2], compat='no_conflicts')

Note that due to the underlying representation of missing values as floating
point numbers (``NaN``), variable data type is not always preserved when merging
in this manner.
5 changes: 5 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ By `Robin Wilson <https://github.com/robintw>`_.
(see :ref:`multi-level indexing`).
By `Benoit Bovy <https://github.com/benbovy>`_.

- Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the
combination of xarray objects with disjoint (:issue:`742`) or
overlapping (:issue:`835`) coordinates as long as any present data agrees.
By `Johnnie Gray <https://github.com/jcmgray>`_.

Bug fixes
~~~~~~~~~

Expand Down
8 changes: 6 additions & 2 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1453,7 +1453,8 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
overwrite_vars : str or sequence, optional
If provided, update variables of these name(s) without checking for
conflicts in this dataset.
compat : {'broadcast_equals', 'equals', 'identical'}, optional
compat : {'broadcast_equals', 'equals', 'identical',
'no_conflicts'}, optional
String indicating how to compare variables of the same name for
potential conflicts:
Expand All @@ -1462,6 +1463,9 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
- 'equals': all values and dimensions must be the same.
- 'identical': all values, dimensions and attributes must be the
same.
- 'no_conflicts': only values which are not null in both datasets
must be equal. The returned dataset then contains the combination
of all non-null values.
join : {'outer', 'inner', 'left', 'right'}, optional
Method for joining ``self`` and ``other`` along shared dimensions:
Expand All @@ -1477,7 +1481,7 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
Raises
------
ValueError
MergeError
If any variables conflict (see ``compat``).
"""
variables, coord_names, dims = dataset_merge_method(
Expand Down
32 changes: 23 additions & 9 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
_VALID_COMPAT = Frozen({'identical': 0,
'equals': 1,
'broadcast_equals': 2,
'minimal': 3})
'minimal': 3,
'no_conflicts': 4})


def broadcast_dimension_size(variables):
Expand Down Expand Up @@ -48,7 +49,8 @@ def unique_variable(name, variables, compat='broadcast_equals'):
variables : list of xarray.Variable
List of Variable objects, all of which go by the same name in different
inputs.
compat : {'identical', 'equals', 'broadcast_equals'}, optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Type of equality check to use.
Returns
Expand All @@ -61,19 +63,27 @@ def unique_variable(name, variables, compat='broadcast_equals'):
"""
out = variables[0]
if len(variables) > 1:
combine_method = None

if compat == 'minimal':
compat = 'broadcast_equals'

if compat == 'broadcast_equals':
dim_lengths = broadcast_dimension_size(variables)
out = out.expand_dims(dim_lengths)

if compat == 'no_conflicts':
combine_method = 'fillna'

for var in variables[1:]:
if not getattr(out, compat)(var):
raise MergeError('conflicting values for variable %r on '
'objects to be combined:\n'
'first value: %r\nsecond value: %r'
% (name, out, var))
if combine_method:
out = getattr(out, combine_method)(var)

return out


Expand Down Expand Up @@ -110,8 +120,9 @@ def merge_variables(
priority_vars : mapping with Variable values, optional
If provided, variables are always taken from this dict in preference to
the input variable dictionaries, without checking for conflicts.
compat : {'identical', 'equals', 'broadcast_equals', 'minimal'}, optional
Type of equality check to use when checking for conflicts.
compat : {'identical', 'equals', 'broadcast_equals',
'minimal', 'no_conflicts'}, optional
Type of equality check to use when checking for conflicts.
Returns
-------
Expand Down Expand Up @@ -342,7 +353,8 @@ def _get_priority_vars(objects, priority_arg, compat='equals'):
Dictionaries in which to find the priority variables.
priority_arg : int or None
Integer object whose variable should take priority.
compat : 'broadcast_equals', 'equals' or 'identical', optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Compatibility checks to use when merging variables.
Returns
Expand Down Expand Up @@ -395,9 +407,10 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
----------
objs : list of mappings
All values must be convertible to labeled arrays.
compat : 'broadcast_equals', 'equals' or 'identical', optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Compatibility checks to use when merging variables.
join : 'outer', 'inner', 'left' or 'right', optional
join : {'outer', 'inner', 'left', 'right'}, optional
How to combine objects with different indexes.
priority_arg : integer, optional
Optional argument in `objs` that takes precedence over the others.
Expand Down Expand Up @@ -461,9 +474,10 @@ def merge(objects, compat='broadcast_equals', join='outer'):
objects : Iterable[Union[xarray.Dataset, xarray.DataArray, dict]]
Merge together all variables from these objects. If any of them are
DataArray objects, they must have a name.
compat : 'broadcast_equals', 'equals' or 'identical', optional
compat : {'identical', 'equals', 'broadcast_equals',
'no_conflicts'}, optional
Compatibility checks to use when merging variables.
join : 'outer', 'inner', 'left' or 'right', optional
join : {'outer', 'inner', 'left', 'right'}, optional
How to combine objects with different indexes.
Returns
Expand Down
33 changes: 25 additions & 8 deletions xarray/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,20 @@ def _fail_on_dask_array_input(values, msg=None, func_name=None):

around = _dask_or_eager_func('around')
isclose = _dask_or_eager_func('isclose')
isnull = _dask_or_eager_func('isnull', pd)
notnull = _dask_or_eager_func('notnull', pd)
_isnull = _dask_or_eager_func('isnull', pd)


def isnull(data):
    """Return a boolean mask marking the null entries of ``data``.

    Delegates to the pandas-backed ``_isnull``. When that raises
    ``TypeError`` (GH837, GH861: pandas cannot null-check numpy structured
    arrays), all data is assumed to be present and an all-False mask with
    the shape of ``data`` is returned instead.
    """
    try:
        mask = _isnull(data)
    except TypeError:
        # Structured arrays cannot be null-checked; treat every value as
        # present rather than propagating the error.
        mask = np.zeros(data.shape, dtype=bool)
    return mask


transpose = _dask_or_eager_func('transpose')
where = _dask_or_eager_func('where', n_array_args=3)
Expand Down Expand Up @@ -125,17 +137,22 @@ def array_equiv(arr1, arr2):
return False

flag_array = (arr1 == arr2)
flag_array |= (isnull(arr1) & isnull(arr2))

# GH837, GH861
# isnull fcn from pandas will throw TypeError when run on numpy structured array
# therefore for dims that are np structured arrays we skip testing for nan
return bool(flag_array.all())

try:

flag_array |= (isnull(arr1) & isnull(arr2))
def array_notnull_equiv(arr1, arr2):
    """Like np.array_equal, but a null value in either array (or in both)
    is treated as matching whatever the other array holds at that position.

    Returns False when the two arrays have different shapes.
    """
    arr1, arr2 = as_like_arrays(arr1, arr2)
    if arr1.shape != arr2.shape:
        return False

    # A position is compatible when the values agree or when at least one
    # of the two values is null.
    compatible = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
    return bool(compatible.all())

Expand Down
18 changes: 18 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,24 @@ def identical(self, other):
except (TypeError, AttributeError):
return False

def _data_no_conflicts(self, other):
    """Check that the data of ``self`` and ``other`` do not conflict.

    Identical underlying data objects trivially agree; otherwise the
    decision is delegated to ``ops.array_notnull_equiv``.
    """
    if self._data is other._data:
        return True
    return ops.array_notnull_equiv(self.data, other.data)

def no_conflicts(self, other):
    """True if the non-null data shared by this Variable and ``other`` is
    equal; otherwise False.

    Two variables can therefore still match when there are locations where
    either one, or both, contain NaN values. The dimensions must be the
    same. If ``other`` has a ``variable`` attribute, that attribute is
    compared instead. Objects that cannot be compared (raising TypeError
    or AttributeError) are reported as conflicting.
    """
    other = getattr(other, 'variable', other)
    try:
        dims_match = self.dims == other.dims
        result = dims_match and self._data_no_conflicts(other)
    except (TypeError, AttributeError):
        result = False
    return result

@property
def real(self):
return type(self)(self.dims, self.data.real, self._attrs)
Expand Down
76 changes: 74 additions & 2 deletions xarray/test/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,51 @@ def test_merge_error(self):
with self.assertRaises(xr.MergeError):
xr.merge([ds, ds + 1])

def test_merge_no_conflicts_single_var(self):
    """``xr.merge`` with ``compat='no_conflicts'`` combines a single
    partially overlapping variable for every join type and raises
    ``MergeError`` when values or dimensions conflict.
    """
    ds1 = xr.Dataset({'a': ('x', [1, 2])})
    ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})

    # Default (outer) join: the result holds every value from both inputs,
    # independent of the order of the arguments.
    expected = xr.Dataset({'a': ('x', [1, 2, 3])})
    assert expected.identical(xr.merge([ds1, ds2],
                                       compat='no_conflicts'))
    assert expected.identical(xr.merge([ds2, ds1],
                                       compat='no_conflicts'))

    # Left/right joins reproduce the corresponding input unchanged.
    left = xr.merge([ds1, ds2], compat='no_conflicts', join='left')
    assert ds1.identical(left)
    right = xr.merge([ds1, ds2], compat='no_conflicts', join='right')
    assert ds2.identical(right)

    # Inner join keeps only the label present in both inputs.
    expected = xr.Dataset({'a': ('x', [2]), 'x': [1]})
    inner = xr.merge([ds1, ds2], compat='no_conflicts', join='inner')
    assert expected.identical(inner)

    # The conflicting datasets are built outside the assertRaises blocks
    # so that only the merge call itself can satisfy the expected error.
    conflicting_values = xr.Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
    with self.assertRaises(xr.MergeError):
        xr.merge([ds1, conflicting_values], compat='no_conflicts')

    conflicting_dims = xr.Dataset({'a': ('y', [2, 3]), 'y': [1, 2]})
    with self.assertRaises(xr.MergeError):
        xr.merge([ds1, conflicting_dims], compat='no_conflicts')

def test_merge_no_conflicts_multi_var(self):
    """``xr.merge`` with ``compat='no_conflicts'`` reassembles a
    multi-variable dataset from two copies with complementary NaN gaps.
    """
    full = create_test_data()
    first = full.copy(deep=True)
    second = full.copy(deep=True)

    # Merging two distinct variables: nothing overlaps, nothing conflicts.
    merged = xr.merge([first.var1, second.var2], compat='no_conflicts')
    assert full[['var1', 'var2']].identical(merged)

    # Blank out complementary slices of each variable in the two copies,
    # and drop 'var3' from the second copy altogether.
    first['var1'][:, :5] = np.nan
    second['var1'][:, 5:] = np.nan
    first['var2'][:4, :] = np.nan
    second['var2'][4:, :] = np.nan
    del second['var3']

    merged = xr.merge([first, second], compat='no_conflicts')
    assert full.equals(merged)


class TestMergeMethod(TestCase):

Expand Down Expand Up @@ -111,7 +156,8 @@ def test_merge_broadcast_equals(self):
def test_merge_compat(self):
ds1 = xr.Dataset({'x': 0})
ds2 = xr.Dataset({'x': 1})
for compat in ['broadcast_equals', 'equals', 'identical']:
for compat in ['broadcast_equals', 'equals', 'identical',
'no_conflicts']:
with self.assertRaises(xr.MergeError):
ds1.merge(ds2, compat=compat)

Expand All @@ -132,7 +178,7 @@ def test_merge_auto_align(self):
ds1 = xr.Dataset({'a': ('x', [1, 2])})
ds2 = xr.Dataset({'b': ('x', [3, 4]), 'x': [1, 2]})
expected = xr.Dataset({'a': ('x', [1, 2, np.nan]),
'b': ('x', [np.nan, 3, 4])})
'b': ('x', [np.nan, 3, 4])})
assert expected.identical(ds1.merge(ds2))
assert expected.identical(ds2.merge(ds1))

Expand All @@ -143,3 +189,29 @@ def test_merge_auto_align(self):
expected = expected.isel(x=slice(1, 2))
assert expected.identical(ds1.merge(ds2, join='inner'))
assert expected.identical(ds2.merge(ds1, join='inner'))

def test_merge_no_conflicts(self):
    """``Dataset.merge`` with ``compat='no_conflicts'`` combines a
    partially overlapping variable for every join type and raises
    ``MergeError`` when values or dimensions conflict.
    """
    ds1 = xr.Dataset({'a': ('x', [1, 2])})
    ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})

    # Default join: the result holds every value from both datasets,
    # whichever dataset the method is called on.
    expected = xr.Dataset({'a': ('x', [1, 2, 3])})
    assert expected.identical(ds1.merge(ds2, compat='no_conflicts'))
    assert expected.identical(ds2.merge(ds1, compat='no_conflicts'))

    # Left/right joins reproduce the corresponding dataset unchanged.
    left = ds1.merge(ds2, compat='no_conflicts', join='left')
    assert ds1.identical(left)
    right = ds1.merge(ds2, compat='no_conflicts', join='right')
    assert ds2.identical(right)

    # Inner join keeps only the label present in both datasets.
    expected2 = xr.Dataset({'a': ('x', [2]), 'x': [1]})
    inner = ds1.merge(ds2, compat='no_conflicts', join='inner')
    assert expected2.identical(inner)

    # The conflicting datasets are built outside the assertRaises blocks
    # so that only the merge call itself can satisfy the expected error.
    conflicting_values = xr.Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
    with self.assertRaises(xr.MergeError):
        ds1.merge(conflicting_values, compat='no_conflicts')

    conflicting_dims = xr.Dataset({'a': ('y', [2, 3]), 'y': [1, 2]})
    with self.assertRaises(xr.MergeError):
        ds1.merge(conflicting_dims, compat='no_conflicts')
35 changes: 33 additions & 2 deletions xarray/test/test_ops.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from pytest import mark
import numpy as np
from numpy import array, nan
from xarray.core import ops
from xarray.core.ops import (
first, last, count, mean
first, last, count, mean, array_notnull_equiv,
)

from . import TestCase
Expand Down Expand Up @@ -74,3 +74,34 @@ def test_count(self):

def test_all_nan_arrays(self):
assert np.isnan(mean([np.nan, np.nan]))


class TestArrayNotNullEquiv():
    """Tests for ``array_notnull_equiv``: arrays count as equivalent when
    every position either agrees or is null in at least one of them.
    """

    @mark.parametrize("arr1, arr2", [
        (np.array([1, 2, 3]), np.array([1, 2, 3])),
        (np.array([1, 2, np.nan]), np.array([1, np.nan, 3])),
        (np.array([np.nan, 2, np.nan]), np.array([1, np.nan, np.nan])),
    ])
    def test_equal(self, arr1, arr2):
        # Identical arrays, and arrays that differ only where one is NaN.
        assert array_notnull_equiv(arr1, arr2)

    def test_some_not_equal(self):
        # The last position holds two different non-null values (4 vs 3).
        left = np.array([1, 2, 4])
        right = np.array([1, np.nan, 3])
        assert not array_notnull_equiv(left, right)

    def test_wrong_shape(self):
        # Same number of elements, but shapes (1, 4) and (2, 2) differ.
        flat = np.array([[1, np.nan, np.nan, 4]])
        square = np.array([[1, 2], [np.nan, 4]])
        assert not array_notnull_equiv(flat, square)

    @mark.parametrize("val1, val2, val3, null", [
        (1, 2, 3, None),
        (1., 2., 3., np.nan),
        (1., 2., 3., None),
        ('foo', 'bar', 'baz', None),
    ])
    def test_types(self, val1, val2, val3, null):
        # Each array has a null wherever the other might disagree, for
        # several value types and both None and NaN as the null marker.
        first = np.array([val1, null, val3, null])
        second = np.array([val1, val2, null, null])
        assert array_notnull_equiv(first, second)
Loading

0 comments on commit f40d323

Please sign in to comment.