add 'no_conflicts' as compat option for merging non-conflicting data (#…

…996) * Add notnull_equals method to Variable * add more tests for Variable.notnull_equals * add DataArray.notnull_equals * more tests for DataArray.notnull_equals * add Dataset.notnull_equals * remove redundant notnull_equivalent from utils * fix introduced flake8 errors * add 'notnull_equals' compat option to merge function and method * add wrong shape test to Variable.notnull_equals * refactor isnull * remove Dataset/Array `notnull_equals` methods * rename 'notnull_equals' --> 'no_conflicts' * update merge docstrings to include 'no_conflicts' * add docs on 'no_conflicts' * whats new entry * update dataset.merge docs and fix Raises ValueError -> MergeError * Update combining.rst Update combining.rst * DOC: We don't use None for missing values
pydata · Sep 15, 2016 · f40d323 · f40d323
1 parent 41654ef
commit f40d323
Show file tree

Hide file tree

Showing 9 changed files with 222 additions and 23 deletions.
diff --git a/doc/combining.rst b/doc/combining.rst
@@ -192,3 +192,21 @@ numpy):
 
 Note that ``NaN`` does not compare equal to ``NaN`` in element-wise comparison;
 you may need to deal with missing values explicitly.
+
+Merging with 'no_conflicts'
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``compat`` argument ``'no_conflicts'`` is only available when
+combining xarray objects with ``merge``. In addition to the above comparison
+methods, it allows merging xarray objects that have locations where *either*
+object has ``NaN`` values. This can be used to combine data with overlapping
+coordinates as long as any non-missing values agree or are disjoint:
+
+.. ipython:: python
+    ds1 = xr.Dataset({'a': ('x', [10, 20, 30, np.nan])}, {'x': [1, 2, 3, 4]})
+    ds2 = xr.Dataset({'a': ('x', [np.nan, 30, 40, 50])}, {'x': [2, 3, 4, 5]})
+    xr.merge([ds1, ds2], compat='no_conflicts')
+
+Note that due to the underlying representation of missing values as floating
+point numbers (``NaN``), variable data type is not always preserved when merging
+in this manner.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -57,6 +57,11 @@ By `Robin Wilson <https://github.com/robintw>`_.
   (see :ref:`multi-level indexing`).
   By `Benoit Bovy <https://github.com/benbovy>`_.
 
+- Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the
+  combination of xarray objects with disjoint (:issue:`742`) or
+  overlapping (:issue:`835`) coordinates as long as any present data agrees.
+  By `Johnnie Gray <https://github.com/jcmgray>`_.
+
 Bug fixes
 ~~~~~~~~~
 

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -1453,7 +1453,8 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
         overwrite_vars : str or sequence, optional
             If provided, update variables of these name(s) without checking for
             conflicts in this dataset.
-        compat : {'broadcast_equals', 'equals', 'identical'}, optional
+        compat : {'broadcast_equals', 'equals', 'identical',
+                  'no_conflicts'}, optional
             String indicating how to compare variables of the same name for
             potential conflicts:
 
@@ -1462,6 +1463,9 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
             - 'equals': all values and dimensions must be the same.
             - 'identical': all values, dimensions and attributes must be the
               same.
+            - 'no_conflicts': only values which are not null in both datasets
+              must be equal. The returned dataset then contains the combination
+              of all non-null values.
         join : {'outer', 'inner', 'left', 'right'}, optional
             Method for joining ``self`` and ``other`` along shared dimensions:
 
@@ -1477,7 +1481,7 @@ def merge(self, other, inplace=False, overwrite_vars=set(),
 
         Raises
         ------
-        ValueError
+        MergeError
             If any variables conflict (see ``compat``).
         """
         variables, coord_names, dims = dataset_merge_method(

diff --git a/xarray/core/merge.py b/xarray/core/merge.py
@@ -12,7 +12,8 @@
 _VALID_COMPAT = Frozen({'identical': 0,
                         'equals': 1,
                         'broadcast_equals': 2,
-                        'minimal': 3})
+                        'minimal': 3,
+                        'no_conflicts': 4})
 
 
 def broadcast_dimension_size(variables):
@@ -48,7 +49,8 @@ def unique_variable(name, variables, compat='broadcast_equals'):
     variables : list of xarray.Variable
         List of Variable objects, all of which go by the same name in different
         inputs.
-    compat : {'identical', 'equals', 'broadcast_equals'}, optional
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'no_conflicts'}, optional
         Type of equality check to use.
 
     Returns
@@ -61,19 +63,27 @@ def unique_variable(name, variables, compat='broadcast_equals'):
     """
     out = variables[0]
     if len(variables) > 1:
+        combine_method = None
+
         if compat == 'minimal':
             compat = 'broadcast_equals'
 
         if compat == 'broadcast_equals':
             dim_lengths = broadcast_dimension_size(variables)
             out = out.expand_dims(dim_lengths)
 
+        if compat == 'no_conflicts':
+            combine_method = 'fillna'
+
         for var in variables[1:]:
             if not getattr(out, compat)(var):
                 raise MergeError('conflicting values for variable %r on '
                                  'objects to be combined:\n'
                                  'first value: %r\nsecond value: %r'
                                  % (name, out, var))
+            if combine_method:
+                out = getattr(out, combine_method)(var)
+
     return out
 
 
@@ -110,8 +120,9 @@ def merge_variables(
     priority_vars : mapping with Variable values, optional
         If provided, variables are always taken from this dict in preference to
         the input variable dictionaries, without checking for conflicts.
-    compat : {'identical', 'equals', 'broadcast_equals', 'minimal'}, optional
-        Type of equality check to use when checking for conflicts.
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'minimal', 'no_conflicts'}, optional
+        Type of equality check to use when checking for conflicts.
 
     Returns
     -------
@@ -342,7 +353,8 @@ def _get_priority_vars(objects, priority_arg, compat='equals'):
         Dictionaries in which to find the priority variables.
     priority_arg : int or None
         Integer object whose variable should take priority.
-    compat : 'broadcast_equals', 'equals' or 'identical', optional
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'no_conflicts'}, optional
         Compatibility checks to use when merging variables.
 
     Returns
@@ -395,9 +407,10 @@ def merge_core(objs, compat='broadcast_equals', join='outer', priority_arg=None,
     ----------
     objs : list of mappings
         All values must be convertable to labeled arrays.
-    compat : 'broadcast_equals', 'equals' or 'identical', optional
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'no_conflicts'}, optional
         Compatibility checks to use when merging variables.
-    join : 'outer', 'inner', 'left' or 'right', optional
+    join : {'outer', 'inner', 'left', 'right'}, optional
         How to combine objects with different indexes.
     priority_arg : integer, optional
         Optional argument in `objs` that takes precedence over the others.
@@ -461,9 +474,10 @@ def merge(objects, compat='broadcast_equals', join='outer'):
     objects : Iterable[Union[xarray.Dataset, xarray.DataArray, dict]]
         Merge together all variables from these objects. If any of them are
         DataArray objects, they must have a name.
-    compat : 'broadcast_equals', 'equals' or 'identical', optional
+    compat : {'identical', 'equals', 'broadcast_equals',
+              'no_conflicts'}, optional
         Compatibility checks to use when merging variables.
-    join : 'outer', 'inner', 'left' or 'right', optional
+    join : {'outer', 'inner', 'left', 'right'}, optional
         How to combine objects with different indexes.
 
     Returns

diff --git a/xarray/core/ops.py b/xarray/core/ops.py
@@ -79,8 +79,20 @@ def _fail_on_dask_array_input(values, msg=None, func_name=None):
 
 around = _dask_or_eager_func('around')
 isclose = _dask_or_eager_func('isclose')
-isnull = _dask_or_eager_func('isnull', pd)
 notnull = _dask_or_eager_func('notnull', pd)
+_isnull = _dask_or_eager_func('isnull', pd)
+
+
+def isnull(data):
+    # GH837, GH861
+    # The isnull function from pandas raises TypeError when run on a numpy
+    # structured array, so for dims that are numpy structured arrays we
+    # assume all data is present
+    try:
+        return _isnull(data)
+    except TypeError:
+        return np.zeros(data.shape, dtype=bool)
+
 
 transpose = _dask_or_eager_func('transpose')
 where = _dask_or_eager_func('where', n_array_args=3)
@@ -125,17 +137,22 @@ def array_equiv(arr1, arr2):
         return False
 
     flag_array = (arr1 == arr2)
+    flag_array |= (isnull(arr1) & isnull(arr2))
 
-    # GH837, GH861
-    # isnull fcn from pandas will throw TypeError when run on numpy structured array
-    # therefore for dims that are np structured arrays we skip testing for nan
+    return bool(flag_array.all())
 
-    try:
 
-        flag_array |= (isnull(arr1) & isnull(arr2))
+def array_notnull_equiv(arr1, arr2):
+    """Like np.array_equal, but also allows values to be NaN in either or both
+    arrays
+    """
+    arr1, arr2 = as_like_arrays(arr1, arr2)
+    if arr1.shape != arr2.shape:
+        return False
 
-    except TypeError:
-        pass
+    flag_array = (arr1 == arr2)
+    flag_array |= isnull(arr1)
+    flag_array |= isnull(arr2)
 
     return bool(flag_array.all())
 

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -1016,6 +1016,24 @@ def identical(self, other):
         except (TypeError, AttributeError):
             return False
 
+    def _data_no_conflicts(self, other):
+        return (self._data is other._data or
+                ops.array_notnull_equiv(self.data, other.data))
+
+    def no_conflicts(self, other):
+        """True if the intersection of two Variable's non-null data is
+        equal; otherwise false.
+
+        Variables can thus still be equal if there are locations where either,
+        or both, contain NaN values.
+        """
+        other = getattr(other, 'variable', other)
+        try:
+            return (self.dims == other.dims and
+                    self._data_no_conflicts(other))
+        except (TypeError, AttributeError):
+            return False
+
     @property
     def real(self):
         return type(self)(self.dims, self.data.real, self._attrs)

diff --git a/xarray/test/test_merge.py b/xarray/test/test_merge.py
@@ -60,6 +60,51 @@ def test_merge_error(self):
         with self.assertRaises(xr.MergeError):
             xr.merge([ds, ds + 1])
 
+    def test_merge_no_conflicts_single_var(self):
+        ds1 = xr.Dataset({'a': ('x', [1, 2])})
+        ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})
+        expected = xr.Dataset({'a': ('x', [1, 2, 3])})
+        assert expected.identical(xr.merge([ds1, ds2],
+                                  compat='no_conflicts'))
+        assert expected.identical(xr.merge([ds2, ds1],
+                                  compat='no_conflicts'))
+        assert ds1.identical(xr.merge([ds1, ds2],
+                                      compat='no_conflicts',
+                                      join='left'))
+        assert ds2.identical(xr.merge([ds1, ds2],
+                                      compat='no_conflicts',
+                                      join='right'))
+        expected = xr.Dataset({'a': ('x', [2]), 'x': [1]})
+        assert expected.identical(xr.merge([ds1, ds2],
+                                           compat='no_conflicts',
+                                           join='inner'))
+
+        with self.assertRaises(xr.MergeError):
+            ds3 = xr.Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
+            xr.merge([ds1, ds3], compat='no_conflicts')
+
+        with self.assertRaises(xr.MergeError):
+            ds3 = xr.Dataset({'a': ('y', [2, 3]), 'y': [1, 2]})
+            xr.merge([ds1, ds3], compat='no_conflicts')
+
+    def test_merge_no_conflicts_multi_var(self):
+        data = create_test_data()
+        data1 = data.copy(deep=True)
+        data2 = data.copy(deep=True)
+
+        expected = data[['var1', 'var2']]
+        actual = xr.merge([data1.var1, data2.var2], compat='no_conflicts')
+        assert expected.identical(actual)
+
+        data1['var1'][:, :5] = np.nan
+        data2['var1'][:, 5:] = np.nan
+        data1['var2'][:4, :] = np.nan
+        data2['var2'][4:, :] = np.nan
+        del data2['var3']
+
+        actual = xr.merge([data1, data2], compat='no_conflicts')
+        assert data.equals(actual)
+
 
 class TestMergeMethod(TestCase):
 
@@ -111,7 +156,8 @@ def test_merge_broadcast_equals(self):
     def test_merge_compat(self):
         ds1 = xr.Dataset({'x': 0})
         ds2 = xr.Dataset({'x': 1})
-        for compat in ['broadcast_equals', 'equals', 'identical']:
+        for compat in ['broadcast_equals', 'equals', 'identical',
+                       'no_conflicts']:
             with self.assertRaises(xr.MergeError):
                 ds1.merge(ds2, compat=compat)
 
@@ -132,7 +178,7 @@ def test_merge_auto_align(self):
         ds1 = xr.Dataset({'a': ('x', [1, 2])})
         ds2 = xr.Dataset({'b': ('x', [3, 4]), 'x': [1, 2]})
         expected = xr.Dataset({'a': ('x', [1, 2, np.nan]),
-                            'b': ('x', [np.nan, 3, 4])})
+                               'b': ('x', [np.nan, 3, 4])})
         assert expected.identical(ds1.merge(ds2))
         assert expected.identical(ds2.merge(ds1))
 
@@ -143,3 +189,29 @@ def test_merge_auto_align(self):
         expected = expected.isel(x=slice(1, 2))
         assert expected.identical(ds1.merge(ds2, join='inner'))
         assert expected.identical(ds2.merge(ds1, join='inner'))
+
+    def test_merge_no_conflicts(self):
+        ds1 = xr.Dataset({'a': ('x', [1, 2])})
+        ds2 = xr.Dataset({'a': ('x', [2, 3]), 'x': [1, 2]})
+        expected = xr.Dataset({'a': ('x', [1, 2, 3])})
+
+        assert expected.identical(ds1.merge(ds2, compat='no_conflicts'))
+        assert expected.identical(ds2.merge(ds1, compat='no_conflicts'))
+
+        assert ds1.identical(ds1.merge(ds2, compat='no_conflicts',
+                                       join='left'))
+
+        assert ds2.identical(ds1.merge(ds2, compat='no_conflicts',
+                                       join='right'))
+
+        expected2 = xr.Dataset({'a': ('x', [2]), 'x': [1]})
+        assert expected2.identical(ds1.merge(ds2, compat='no_conflicts',
+                                             join='inner'))
+
+        with self.assertRaises(xr.MergeError):
+            ds3 = xr.Dataset({'a': ('x', [99, 3]), 'x': [1, 2]})
+            ds1.merge(ds3, compat='no_conflicts')
+
+        with self.assertRaises(xr.MergeError):
+            ds3 = xr.Dataset({'a': ('y', [2, 3]), 'y': [1, 2]})
+            ds1.merge(ds3, compat='no_conflicts')
diff --git a/xarray/test/test_ops.py b/xarray/test/test_ops.py
@@ -1,8 +1,8 @@
+from pytest import mark
 import numpy as np
 from numpy import array, nan
-from xarray.core import ops
 from xarray.core.ops import (
-    first, last, count, mean
+    first, last, count, mean, array_notnull_equiv,
 )
 
 from . import TestCase
@@ -74,3 +74,34 @@ def test_count(self):
 
     def test_all_nan_arrays(self):
         assert np.isnan(mean([np.nan, np.nan]))
+
+
+class TestArrayNotNullEquiv():
+    @mark.parametrize("arr1, arr2", [
+        (np.array([1, 2, 3]), np.array([1, 2, 3])),
+        (np.array([1, 2, np.nan]), np.array([1, np.nan, 3])),
+        (np.array([np.nan, 2, np.nan]), np.array([1, np.nan, np.nan])),
+    ])
+    def test_equal(self, arr1, arr2):
+        assert array_notnull_equiv(arr1, arr2)
+
+    def test_some_not_equal(self):
+        a = np.array([1, 2, 4])
+        b = np.array([1, np.nan, 3])
+        assert not array_notnull_equiv(a, b)
+
+    def test_wrong_shape(self):
+        a = np.array([[1, np.nan, np.nan, 4]])
+        b = np.array([[1, 2], [np.nan, 4]])
+        assert not array_notnull_equiv(a, b)
+
+    @mark.parametrize("val1, val2, val3, null", [
+        (1, 2, 3, None),
+        (1., 2., 3., np.nan),
+        (1., 2., 3., None),
+        ('foo', 'bar', 'baz', None),
+    ])
+    def test_types(self, val1, val2, val3, null):
+        arr1 = np.array([val1, null, val3, null])
+        arr2 = np.array([val1, val2, null, null])
+        assert array_notnull_equiv(arr1, arr2)