Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add a drop_conflicts strategy for merging attrs #4827

Merged
merged 25 commits into from
Feb 10, 2021
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
767a668
add the drop_conflicts option for merging attrs
keewis Jan 19, 2021
909c5cd
mention drop_conflicts in a few docstrings
keewis Jan 19, 2021
d35a213
add a test for merge_attrs
keewis Jan 19, 2021
7f9481c
rewrite the drop_conflicts algorithm
keewis Jan 19, 2021
1930d68
verify that drop_conflicts works correctly when passed to concat
keewis Jan 19, 2021
a14a4d9
add a test for combine_nested with combine_attrs="drop_conflicts"
keewis Jan 19, 2021
edc8428
remove test_merge_attrs in favor of test_merge_arrays_attrs
keewis Jan 19, 2021
9344b93
use a dict comprehension instead of a dict intersection
keewis Jan 19, 2021
1791111
reorganize the drop_conflicts concat test
keewis Jan 23, 2021
2ec123f
use different data for the drop_conflicts merge test
keewis Jan 23, 2021
fad94a0
add another test to make sure attrs on variables is also merged
keewis Jan 23, 2021
86c7c5f
correctly set expect_exception
keewis Jan 23, 2021
3531198
skip the combine_attrs merge variables test
keewis Jan 23, 2021
be5ad6a
rewrite the concat combine_attrs test and use different data
keewis Jan 23, 2021
5a6b551
also skip the concat combine_attrs variables test
keewis Jan 23, 2021
af72603
use utils.equivalent instead of == so ndarray objects can be compared
keewis Jan 24, 2021
dbc510a
make sure conflicting keys cannot be added back by other objects
keewis Jan 24, 2021
344d822
don't compare the value with itself if key is not in attrs
keewis Jan 24, 2021
6ed88f7
also check the coordinates
keewis Feb 5, 2021
f43c0be
add docstrings to explain the purpose of the tests
keewis Feb 5, 2021
ac3c102
update whats-new.rst
keewis Feb 5, 2021
ea24b32
add a test for combine_attrs="drop_conflicts" with more than two attrs
keewis Feb 5, 2021
5dd73be
Merge branch 'master' into keep_attrs-drop_conflicts
keewis Feb 5, 2021
81c8271
Merge branch 'master' into keep_attrs-drop_conflicts
keewis Feb 6, 2021
4a4faa5
Merge branch 'master' into keep_attrs-drop_conflicts
keewis Feb 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ New Features
By `Maximilian Roos <https://github.com/max-sixty>`_.

- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with large number of variables.
By `Deepak Cherian <https://github.com/dcherian>`_
- add ``"drop_conflicts"`` to the strategies supported by the ``combine_attrs`` kwarg
(:issue:`4749`, :pull:`4827`).
By `Justus Magin <https://github.com/keewis>`_.
By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
in the form of kwargs as well as a dict, like most similar methods.
Expand Down
12 changes: 8 additions & 4 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,14 +412,16 @@ def combine_nested(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "drop"
String indicating how to combine attrs of the objects being merged:

- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.

Expand Down Expand Up @@ -625,14 +627,16 @@ def combine_by_coords(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "drop"
String indicating how to combine attrs of the objects being merged:

- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.

Expand Down
6 changes: 4 additions & 2 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,16 @@ def concat(
- "override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "override"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "override"
String indicating how to combine attrs of the objects being merged:

- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.

Expand Down
29 changes: 25 additions & 4 deletions xarray/core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from . import dtypes, pdcompat
from .alignment import deep_align
from .duck_array_ops import lazy_array_equiv
from .utils import Frozen, compat_dict_union, dict_equiv
from .utils import Frozen, compat_dict_union, dict_equiv, equivalent
from .variable import Variable, as_variable, assert_unique_multiindex_level_names

if TYPE_CHECKING:
Expand Down Expand Up @@ -513,6 +513,24 @@ def merge_attrs(variable_attrs, combine_attrs):
"the same. Merging %s with %s" % (str(result), str(attrs))
)
return result
elif combine_attrs == "drop_conflicts":
result = {}
dropped_keys = set()
for attrs in variable_attrs:
result.update(
{
key: value
for key, value in attrs.items()
if key not in result and key not in dropped_keys
}
)
result = {
key: value
for key, value in result.items()
if key not in attrs or equivalent(attrs[key], value)
dcherian marked this conversation as resolved.
Show resolved Hide resolved
}
dropped_keys |= {key for key in attrs if key not in result}
return result
elif combine_attrs == "identical":
result = dict(variable_attrs[0])
for attrs in variable_attrs[1:]:
Expand Down Expand Up @@ -556,7 +574,8 @@ def merge_core(
Compatibility checks to use when merging variables.
join : {"outer", "inner", "left", "right"}, optional
How to combine objects with different indexes.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, optional
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, optional
How to combine attributes of objects
priority_arg : int, optional
Optional argument in `objects` that takes precedence over the others.
Expand Down Expand Up @@ -668,14 +687,16 @@ def merge(
Value to use for newly missing values. If a dict-like, maps
variable names to fill values. Use a data array's name to
refer to its values.
combine_attrs : {"drop", "identical", "no_conflicts", "override"}, \
default: "drop"
combine_attrs : {"drop", "identical", "no_conflicts", "drop_conflicts", \
"override"}, default: "drop"
String indicating how to combine attrs of the objects being merged:

- "drop": empty attrs on returned Dataset.
- "identical": all attrs must be the same on every object.
- "no_conflicts": attrs from all objects are combined, any that have
the same name must also have the same value.
- "drop_conflicts": attrs from all objects are combined, any that have
the same name but different values are dropped.
- "override": skip comparing and copy attrs from the first dataset to
the result.

Expand Down
11 changes: 11 additions & 0 deletions xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,17 @@ def test_combine_coords_combine_attrs_identical(self):
objs, concat_dim="x", join="outer", combine_attrs="identical"
)

def test_combine_nested_combine_attrs_drop_conflicts(self):
objs = [
Dataset({"x": [0], "y": [0]}, attrs={"a": 1, "b": 2, "c": 3}),
Dataset({"x": [1], "y": [1]}, attrs={"a": 1, "b": 0, "d": 3}),
]
expected = Dataset({"x": [0, 1], "y": [0, 1]}, attrs={"a": 1, "c": 3, "d": 3})
actual = combine_nested(
objs, concat_dim="x", join="outer", combine_attrs="drop_conflicts"
)
assert_identical(expected, actual)

def test_infer_order_from_coords(self):
data = create_test_data()
objs = [data.isel(dim2=slice(4, 9)), data.isel(dim2=slice(4))]
Expand Down
129 changes: 110 additions & 19 deletions xarray/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,27 +258,118 @@ def test_concat_join_kwarg(self):
)
assert_identical(actual, expected)

def test_concat_combine_attrs_kwarg(self):
ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs={"b": 42})
ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs={"b": 42, "c": 43})

expected = {}
expected["drop"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]})
expected["no_conflicts"] = Dataset(
{"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42, "c": 43}
)
expected["override"] = Dataset({"a": ("x", [0, 0])}, {"x": [0, 1]}, {"b": 42})

with raises_regex(ValueError, "combine_attrs='identical'"):
actual = concat([ds1, ds2], dim="x", combine_attrs="identical")
with raises_regex(ValueError, "combine_attrs='no_conflicts'"):
ds3 = ds2.copy(deep=True)
ds3.attrs["b"] = 44
actual = concat([ds1, ds3], dim="x", combine_attrs="no_conflicts")
@pytest.mark.parametrize(
"combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 41, "b": 42, "c": 43},
{"b": 2, "c": 43, "d": 44},
{"a": 41, "c": 43, "d": 44},
False,
),
],
)
def test_concat_combine_attrs_kwarg(
self, combine_attrs, var1_attrs, var2_attrs, expected_attrs, expect_exception
):
ds1 = Dataset({"a": ("x", [0])}, coords={"x": [0]}, attrs=var1_attrs)
ds2 = Dataset({"a": ("x", [0])}, coords={"x": [1]}, attrs=var2_attrs)

if expect_exception:
with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
else:
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
expected = Dataset(
{"a": ("x", [0, 0])}, {"x": [0, 1]}, attrs=expected_attrs
)

for combine_attrs in expected:
assert_identical(actual, expected)

@pytest.mark.skip(reason="not implemented, yet (see #4827)")
@pytest.mark.parametrize(
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 41, "b": 42, "c": 43},
{"b": 2, "c": 43, "d": 44},
{"a": 41, "c": 43, "d": 44},
False,
),
],
)
def test_concat_combine_attrs_kwarg_variables(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def test_concat_combine_attrs_kwarg_variables(
def test_concat_combine_attrs_kwarg_dataarrays(

or data_vars (took me a while to figure out what was different from above)

Copy link
Collaborator Author

@keewis keewis Feb 5, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, I forgot to check the coords. The purpose of these tests is to check whether combine_attrs is applied to the variables (coords and data variables) in addition to the top-level Dataset / DataArray. Would it help to have a comment explaining the difference?

self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
):
"""check that combine_attrs is used on data variables and coords"""
ds1 = Dataset({"a": ("x", [0], attrs1)}, coords={"x": ("x", [0], attrs1)})
ds2 = Dataset({"a": ("x", [0], attrs2)}, coords={"x": ("x", [1], attrs2)})

if expect_exception:
with pytest.raises(ValueError, match=f"combine_attrs='{combine_attrs}'"):
concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
else:
actual = concat([ds1, ds2], dim="x", combine_attrs=combine_attrs)
assert_identical(actual, expected[combine_attrs])
expected = Dataset(
{"a": ("x", [0, 0], expected_attrs)},
{"x": ("x", [0, 1], expected_attrs)},
)

assert_identical(actual, expected)

def test_concat_promote_shape(self):
# mixed dims within variables
Expand Down
85 changes: 85 additions & 0 deletions xarray/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,20 @@ def test_merge_arrays_attrs_default(self):
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 1, "b": 2, "c": 3},
{"b": 1, "c": 3, "d": 4},
{"a": 1, "c": 3, "d": 4},
False,
),
(
"drop_conflicts",
{"a": 1, "b": np.array([2]), "c": np.array([3])},
{"b": 1, "c": np.array([3]), "d": 4},
{"a": 1, "c": np.array([3]), "d": 4},
False,
),
],
)
def test_merge_arrays_attrs(
Expand All @@ -109,13 +123,84 @@ def test_merge_arrays_attrs(
expected.attrs = expected_attrs
assert_identical(actual, expected)

@pytest.mark.skip(reason="not implemented, yet (see #4827)")
@pytest.mark.parametrize(
"combine_attrs, attrs1, attrs2, expected_attrs, expect_exception",
[
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 1, "c": 3},
{"a": 1, "b": 2, "c": 3},
False,
),
("no_conflicts", {"a": 1, "b": 2}, {}, {"a": 1, "b": 2}, False),
("no_conflicts", {}, {"a": 1, "c": 3}, {"a": 1, "c": 3}, False),
(
"no_conflicts",
{"a": 1, "b": 2},
{"a": 4, "c": 3},
{"a": 1, "b": 2, "c": 3},
True,
),
("drop", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1, "b": 2}, False),
("identical", {"a": 1, "b": 2}, {"a": 1, "c": 3}, {"a": 1, "b": 2}, True),
(
"override",
{"a": 1, "b": 2},
{"a": 4, "b": 5, "c": 3},
{"a": 1, "b": 2},
False,
),
(
"drop_conflicts",
{"a": 1, "b": 2, "c": 3},
{"b": 1, "c": 3, "d": 4},
{"a": 1, "c": 3, "d": 4},
False,
),
],
)
def test_merge_arrays_attrs_variables(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dito

self, combine_attrs, attrs1, attrs2, expected_attrs, expect_exception
):
"""check that combine_attrs is used on data variables and coords"""
data = create_test_data()
data1 = data.copy()
data1.var1.attrs = attrs1
data1.dim1.attrs = attrs1
data2 = data.copy()
data2.var1.attrs = attrs2
data2.dim1.attrs = attrs2

if expect_exception:
with raises_regex(MergeError, "combine_attrs"):
actual = xr.merge([data1, data2], combine_attrs=combine_attrs)
else:
actual = xr.merge([data1, data2], combine_attrs=combine_attrs)
expected = data.copy()
expected.var1.attrs = expected_attrs
expected.dim1.attrs = expected_attrs

assert_identical(actual, expected)

def test_merge_attrs_override_copy(self):
ds1 = xr.Dataset(attrs={"x": 0})
ds2 = xr.Dataset(attrs={"x": 1})
ds3 = xr.merge([ds1, ds2], combine_attrs="override")
ds3.attrs["x"] = 2
assert ds1.x == 0

def test_merge_attrs_drop_conflicts(self):
ds1 = xr.Dataset(attrs={"a": 0, "b": 0, "c": 0})
ds2 = xr.Dataset(attrs={"b": 0, "c": 1, "d": 0})
ds3 = xr.Dataset(attrs={"a": 0, "b": 1, "c": 0, "e": 0})

actual = xr.merge([ds1, ds2, ds3], combine_attrs="drop_conflicts")
expected = xr.Dataset(attrs={"a": 0, "d": 0, "e": 0})
assert_identical(actual, expected)

def test_merge_dicts_simple(self):
actual = xr.merge([{"foo": 0}, {"bar": "one"}, {"baz": 3.5}])
expected = xr.Dataset({"foo": 0, "bar": "one", "baz": 3.5})
Expand Down