also apply combine_attrs to the attrs of the variables (pydata#4902)

spencerkclark · May 5, 2021 · bd4650c · bd4650c
1 parent 7432032
commit bd4650c
Show file tree

Hide file tree

Showing 8 changed files with 264 additions and 50 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -22,7 +22,9 @@ v0.17.1 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
-
+- apply ``combine_attrs`` on data variables and coordinate variables when concatenating
+  and merging datasets and dataarrays (:pull:`4902`).
+  By `Justus Magin <https://github.com/keewis>`_.
 - Add :py:meth:`Dataset.to_pandas` (:pull:`5247`)
   By `Giacomo Caria <https://github.com/gcaria>`_.
 - Add :py:meth:`DataArray.plot.surface` which wraps matplotlib's `plot_surface` to make
@@ -144,6 +146,10 @@ Breaking changes
   ``ds.coarsen(...).mean(keep_attrs=False)`` instead of ``ds.coarsen(..., keep_attrs=False).mean()``.
   Further, coarsen now keeps attributes per default (:pull:`5227`).
   By `Mathias Hauser <https://github.com/mathause>`_.
+- switch the default of the :py:func:`merge` ``combine_attrs`` parameter to
+  ``"override"``. This will keep the current behavior for merging the ``attrs`` of
+  variables but stop dropping the ``attrs`` of the main objects (:pull:`4902`).
+  By `Justus Magin <https://github.com/keewis>`_.
 
 Deprecations
 ~~~~~~~~~~~~

diff --git a/xarray/core/concat.py b/xarray/core/concat.py
@@ -508,7 +508,7 @@ def ensure_common_dims(vars):
                 vars = ensure_common_dims([ds[k].variable for ds in datasets])
             except KeyError:
                 raise ValueError("%r is not present in all datasets." % k)
-            combined = concat_vars(vars, dim, positions)
+            combined = concat_vars(vars, dim, positions, combine_attrs=combine_attrs)
             assert isinstance(combined, Variable)
             result_vars[k] = combined
         elif k in result_vars:
@@ -572,7 +572,7 @@ def _dataarray_concat(
         positions,
         fill_value=fill_value,
         join=join,
-        combine_attrs="drop",
+        combine_attrs=combine_attrs,
     )
 
     merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs)

diff --git a/xarray/core/merge.py b/xarray/core/merge.py
@@ -164,6 +164,7 @@ def merge_collected(
     grouped: Dict[Hashable, List[MergeElement]],
     prioritized: Mapping[Hashable, MergeElement] = None,
     compat: str = "minimal",
+    combine_attrs="override",
 ) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, pd.Index]]:
     """Merge dicts of variables, while resolving conflicts appropriately.
 
@@ -222,11 +223,18 @@ def merge_collected(
                                 % (name, variable.attrs, other_variable.attrs)
                             )
                 merged_vars[name] = variable
+                merged_vars[name].attrs = merge_attrs(
+                    [var.attrs for var, _ in indexed_elements],
+                    combine_attrs=combine_attrs,
+                )
                 merged_indexes[name] = index
             else:
                 variables = [variable for variable, _ in elements_list]
                 try:
                     merged_vars[name] = unique_variable(name, variables, compat)
+                    merged_vars[name].attrs = merge_attrs(
+                        [var.attrs for var in variables], combine_attrs=combine_attrs
+                    )
                 except MergeError:
                     if compat != "minimal":
                         # we need more than "minimal" compatibility (for which
@@ -613,7 +621,9 @@ def merge_core(
     collected = collect_variables_and_indexes(aligned)
 
     prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat)
-    variables, out_indexes = merge_collected(collected, prioritized, compat=compat)
+    variables, out_indexes = merge_collected(
+        collected, prioritized, compat=compat, combine_attrs=combine_attrs
+    )
     assert_unique_multiindex_level_names(variables)
 
     dims = calculate_dimensions(variables)
@@ -649,7 +659,7 @@ def merge(
     compat: str = "no_conflicts",
     join: str = "outer",
     fill_value: object = dtypes.NA,
-    combine_attrs: str = "drop",
+    combine_attrs: str = "override",
 ) -> "Dataset":
     """Merge any number of xarray objects into a single Dataset as variables.
 
@@ -688,7 +698,7 @@ def merge(
         variable names to fill values. Use a data array's name to
         refer to its values.
     combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
-                     "override"}, default: "drop"
+                     "override"}, default: "override"
         String indicating how to combine attrs of the objects being merged:
 
         - "drop": empty attrs on returned Dataset.

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -1771,7 +1771,14 @@ def reduce(
         return Variable(dims, data, attrs=attrs)
 
     @classmethod
-    def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
+    def concat(
+        cls,
+        variables,
+        dim="concat_dim",
+        positions=None,
+        shortcut=False,
+        combine_attrs="override",
+    ):
         """Concatenate variables along a new or existing dimension.
 
         Parameters
@@ -1794,13 +1801,27 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
             This option is used internally to speed-up groupby operations.
             If `shortcut` is True, some checks of internal consistency between
             arrays to concatenate are skipped.
+        combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
+                         "override"}, default: "override"
+            String indicating how to combine attrs of the objects being merged:
+
+            - "drop": empty attrs on returned Dataset.
+            - "identical": all attrs must be the same on every object.
+            - "no_conflicts": attrs from all objects are combined, any that have
+              the same name must also have the same value.
+            - "drop_conflicts": attrs from all objects are combined, any that have
+              the same name but different values are dropped.
+            - "override": skip comparing and copy attrs from the first dataset to
+              the result.
 
         Returns
         -------
         stacked : Variable
             Concatenated Variable formed by stacking all the supplied variables
             along the given dimension.
         """
+        from .merge import merge_attrs
+
         if not isinstance(dim, str):
             (dim,) = dim.dims
 
@@ -1825,7 +1846,9 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
             dims = (dim,) + first_var.dims
             data = duck_array_ops.stack(arrays, axis=axis)
 
-        attrs = dict(first_var.attrs)
+        attrs = merge_attrs(
+            [var.attrs for var in variables], combine_attrs=combine_attrs
+        )
         encoding = dict(first_var.encoding)
         if not shortcut:
             for var in variables:
@@ -2581,12 +2604,21 @@ def __setitem__(self, key, value):
         raise TypeError("%s values cannot be modified" % type(self).__name__)
 
     @classmethod
-    def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
+    def concat(
+        cls,
+        variables,
+        dim="concat_dim",
+        positions=None,
+        shortcut=False,
+        combine_attrs="override",
+    ):
         """Specialized version of Variable.concat for IndexVariable objects.
 
         This exists because we want to avoid converting Index objects to NumPy
         arrays, if possible.
         """
+        from .merge import merge_attrs
+
         if not isinstance(dim, str):
             (dim,) = dim.dims
 
@@ -2613,12 +2645,13 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False):
         # keep as str if possible as pandas.Index uses object (converts to numpy array)
         data = maybe_coerce_to_str(data, variables)
 
-        attrs = dict(first_var.attrs)
+        attrs = merge_attrs(
+            [var.attrs for var in variables], combine_attrs=combine_attrs
+        )
         if not shortcut:
             for var in variables:
                 if var.dims != first_var.dims:
                     raise ValueError("inconsistent dimensions")
-                utils.remove_incompatible_items(attrs, var.attrs)
 
         return cls(first_var.dims, data, attrs)
 
@@ -2792,7 +2825,13 @@ def _broadcast_compat_data(self, other):
     return self_data, other_data, dims
 
 
-def concat(variables, dim="concat_dim", positions=None, shortcut=False):
+def concat(
+    variables,
+    dim="concat_dim",
+    positions=None,
+    shortcut=False,
+    combine_attrs="override",
+):
     """Concatenate variables along a new or existing dimension.
 
     Parameters
@@ -2815,6 +2854,18 @@ def concat(variables, dim="concat_dim", positions=None, shortcut=False):
         This option is used internally to speed-up groupby operations.
         If `shortcut` is True, some checks of internal consistency between
         arrays to concatenate are skipped.
+    combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
+                     "override"}, default: "override"
+        String indicating how to combine attrs of the objects being merged:
+
+        - "drop": empty attrs on returned Dataset.
+        - "identical": all attrs must be the same on every object.
+        - "no_conflicts": attrs from all objects are combined, any that have
+          the same name must also have the same value.
+        - "drop_conflicts": attrs from all objects are combined, any that have
+          the same name but different values are dropped.
+        - "override": skip comparing and copy attrs from the first dataset to
+          the result.
 
     Returns
     -------
@@ -2824,9 +2875,9 @@ def concat(variables, dim="concat_dim", positions=None, shortcut=False):
     """
     variables = list(variables)
     if all(isinstance(v, IndexVariable) for v in variables):
-        return IndexVariable.concat(variables, dim, positions, shortcut)
+        return IndexVariable.concat(variables, dim, positions, shortcut, combine_attrs)
     else:
-        return Variable.concat(variables, dim, positions, shortcut)
+        return Variable.concat(variables, dim, positions, shortcut, combine_attrs)
 
 
 def assert_unique_multiindex_level_names(variables):