Skip to content

Default to phony_dims="access" in h5netcdf-backend #10058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@ New Features

Breaking changes
~~~~~~~~~~~~~~~~

- Warn instead of raise if phony_dims are detected when using h5netcdf-backend and ``phony_dims=None`` (:issue:`10049`, :pull:`10058`)
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Deprecations
~~~~~~~~~~~~

- Move from phony_dims=None to phony_dims="access" for h5netcdf-backend(:issue:`10049`, :pull:`10058`)
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Bug fixes
~~~~~~~~~
Expand Down
39 changes: 39 additions & 0 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,23 @@ def close(self, **kwargs):
self._manager.close(**kwargs)


def _check_phony_dims(phony_dims):
emit_phony_dims_warning = False
if phony_dims is None:
emit_phony_dims_warning = True
phony_dims = "access"
return emit_phony_dims_warning, phony_dims


def _emit_phony_dims_warning():
emit_user_level_warning(
"The 'phony_dims' kwarg now defaults to 'access'. "
"Previously 'phony_dims=None' would raise an error. "
"For full netcdf equivalence please use phony_dims='sort'.",
UserWarning,
)


class H5netcdfBackendEntrypoint(BackendEntrypoint):
"""
Backend for netCDF files based on the h5netcdf package.
Expand Down Expand Up @@ -434,6 +451,10 @@ def open_dataset(
driver_kwds=None,
storage_options: dict[str, Any] | None = None,
) -> Dataset:
# Keep this message for some versions
# remove and set phony_dims="access" above
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
filename_or_obj,
Expand All @@ -460,6 +481,13 @@ def open_dataset(
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)

# only warn if phony_dims exist in file
# remove together with the above check
# after some versions
if store.ds._root._phony_dim_count > 0 and emit_phony_dims_warning:
_emit_phony_dims_warning()

return ds

def open_datatree(
Expand Down Expand Up @@ -530,6 +558,10 @@ def open_groups_as_dict(
from xarray.core.treenode import NodePath
from xarray.core.utils import close_on_error

# Keep this message for some versions
# remove and set phony_dims="access" above
emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims)

filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
filename_or_obj,
Expand All @@ -542,6 +574,7 @@ def open_groups_as_dict(
driver=driver,
driver_kwds=driver_kwds,
)

# Check for a group and make it a parent if it exists
if group:
parent = NodePath("/") / NodePath(group)
Expand Down Expand Up @@ -571,6 +604,12 @@ def open_groups_as_dict(
group_name = str(NodePath(path_group))
groups_dict[group_name] = group_ds

# only warn if phony_dims exist in file
# remove together with the above check
# after some versions
if store.ds._phony_dim_count > 0 and emit_phony_dims_warning:
_emit_phony_dims_warning()

return groups_dict


Expand Down
22 changes: 22 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4170,6 +4170,28 @@
with self.roundtrip(expected) as actual:
assert_equal(expected, actual)

def test_phony_dims_warning(self) -> None:
import h5py

foo_data = np.arange(125).reshape(5, 5, 5)
bar_data = np.arange(625).reshape(25, 5, 5)
var = {"foo1": foo_data, "foo2": bar_data, "foo3": foo_data, "foo4": bar_data}
with create_tmp_file() as tmp_file:
with h5py.File(tmp_file, "w") as f:
grps = ["bar", "baz"]
for grp in grps:
fx = f.create_group(grp)
for k, v in var.items():
fx.create_dataset(k, data=v)
with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
assert ds.dims == {

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4188 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.
"phony_dim_0": 5,
"phony_dim_1": 5,
"phony_dim_2": 5,
"phony_dim_3": 25,
}


@requires_h5netcdf
@requires_netCDF4
Expand Down
23 changes: 23 additions & 0 deletions xarray/tests/test_backends_datatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,29 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None:
class TestH5NetCDFDatatreeIO(DatatreeIOBase):
engine: T_DataTreeNetcdfEngine | None = "h5netcdf"

def test_phony_dims_warning(self, tmpdir) -> None:
filepath = tmpdir + "/phony_dims.nc"
import h5py

foo_data = np.arange(125).reshape(5, 5, 5)
bar_data = np.arange(625).reshape(25, 5, 5)
var = {"foo1": foo_data, "foo2": bar_data, "foo3": foo_data, "foo4": bar_data}
with h5py.File(filepath, "w") as f:
grps = ["bar", "baz"]
for grp in grps:
fx = f.create_group(grp)
for k, v in var.items():
fx.create_dataset(k, data=v)

with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
with open_datatree(filepath, engine=self.engine) as tree:
assert tree.bar.dims == {
"phony_dim_0": 5,
"phony_dim_1": 5,
"phony_dim_2": 5,
"phony_dim_3": 25,
}


@pytest.mark.skipif(
have_zarr_v3, reason="datatree support for zarr 3 is not implemented yet"
Expand Down
Loading