Skip to content

Add support for remote string paths to h5netcdf engine #8424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
18 changes: 13 additions & 5 deletions xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,24 +81,32 @@ def _find_absolute_paths(
['common.py']
"""
if isinstance(paths, str):
if is_remote_uri(paths) and kwargs.get("engine", None) == "zarr":
if is_remote_uri(paths) and (engine := kwargs.get("engine", None)) in [
"zarr",
"h5netcdf",
]:
try:
from fsspec.core import get_fs_token_paths
except ImportError as e:
raise ImportError(
"The use of remote URLs for opening zarr requires the package fsspec"
"The use of remote URLs for opening zarr and h5netcdf requires the package fsspec"
) from e

mode = kwargs.get("mode", "rb")
mode_ = "rb" if mode == "r" else mode
fs, _, _ = get_fs_token_paths(
paths,
mode="rb",
mode=mode_,
storage_options=kwargs.get("backend_kwargs", {}).get(
"storage_options", {}
),
expand=False,
)
tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories
paths = [fs.get_mapper(path) for path in tmp_paths]
if engine == "h5netcdf":
paths = fs.open(paths, mode=mode_)
else:
tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories
paths = [fs.get_mapper(path) for path in tmp_paths]
elif is_remote_uri(paths):
raise ValueError(
"cannot do wild-card matching for paths that are remote URLs "
Expand Down
8 changes: 7 additions & 1 deletion xarray/backends/file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,17 @@ def __init__(

def _make_key(self):
    """Make a key for caching files in the LRU cache.

    The key combines the opener, positional args, mode, keyword args, and
    manager id into a hashable tuple. ``storage_options`` (when present) is a
    dict and therefore unhashable, so it is converted to a sorted tuple of
    items before hashing.

    Returns
    -------
    _HashedSequence
        A hashable wrapper around the key tuple, suitable for the LRU cache.
    """
    kwargs = self._kwargs
    # storage_options is a non-hashable dict, so we implement special logic for hashing
    if self._kwargs.get("storage_options", None) is not None:
        # Copy so the caller's kwargs dict is never mutated.
        kwargs = self._kwargs.copy()
        kwargs["storage_options"] = tuple(sorted(kwargs["storage_options"].items()))

    value = (
        self._opener,
        self._args,
        # "w" truncates on first open; subsequent cached opens must append.
        "a" if self._mode == "w" else self._mode,
        # NOTE: the stale pre-change line hashing self._kwargs directly was
        # removed here — it would re-introduce the unhashable dict.
        tuple(sorted(kwargs.items())),
        self._manager_id,
    )
    return _HashedSequence(value)
Expand Down
32 changes: 32 additions & 0 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,24 @@ def _h5netcdf_create_group(dataset, name):
return dataset.create_group(name)


def _h5netcdf_opener(filename, mode, storage_options=None, **kwargs):
    """Open *filename* with ``h5netcdf.File``, resolving remote URLs via fsspec.

    Parameters
    ----------
    filename : str or file-like
        Path, remote URL, or already-open file-like object.
    mode : str
        File mode, e.g. ``"r"`` or ``"a"``; a plain ``"r"`` is mapped to
        ``"rb"`` for the fsspec byte-level open.
    storage_options : dict, optional
        Extra options forwarded to ``fsspec.get_fs_token_paths``.
    **kwargs
        Remaining keyword arguments passed through to ``h5netcdf.File``.

    Returns
    -------
    h5netcdf.File
        The opened h5netcdf file object.
    """
    import h5netcdf

    # Only go through fsspec for string URLs and when no explicit h5py driver
    # was requested (a driver such as "ros3" handles remote access itself).
    # Use .get() so a missing "driver" key does not raise KeyError.
    if (
        isinstance(filename, str)
        and is_remote_uri(filename)
        and kwargs.get("driver") is None
    ):
        import fsspec

        mode_ = "rb" if mode == "r" else mode
        fs, _, _ = fsspec.get_fs_token_paths(
            filename, mode=mode_, storage_options=storage_options
        )
        # Replace the URL with an open fsspec file object for h5netcdf.
        filename = fs.open(filename, mode=mode_)
    return h5netcdf.File(filename, mode=mode, **kwargs)


class H5NetCDFStore(WritableCFDataStore):
"""Store for reading and writing data via h5netcdf"""

Expand Down Expand Up @@ -142,6 +160,7 @@ def open(
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
storage_options=None,
):
import h5netcdf

Expand All @@ -160,6 +179,17 @@ def open(
if format not in [None, "NETCDF4"]:
raise ValueError("invalid format for h5netcdf backend")

# get open fsspec-handle first
from xarray.backends.common import _find_absolute_paths

if storage_options is not None:
filename = _find_absolute_paths(
filename,
engine="h5netcdf",
mode=mode,
backend_kwargs=dict(storage_options=storage_options),
)

kwargs = {
"invalid_netcdf": invalid_netcdf,
"decode_vlen_strings": decode_vlen_strings,
Expand Down Expand Up @@ -395,6 +425,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
decode_vlen_strings=True,
driver=None,
driver_kwds=None,
storage_options=None,
) -> Dataset:
filename_or_obj = _normalize_path(filename_or_obj)
store = H5NetCDFStore.open(
Expand All @@ -407,6 +438,7 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
decode_vlen_strings=decode_vlen_strings,
driver=driver,
driver_kwds=driver_kwds,
storage_options=storage_options,
)

store_entrypoint = StoreBackendEntrypoint()
Expand Down
21 changes: 21 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -3014,6 +3014,27 @@ def test_zarr_storage_options() -> None:
assert_identical(ds, ds_a)


@requires_h5netcdf
@requires_fsspec
def test_h5netcdf_storage_options() -> None:
    """Open multiple h5netcdf files through ``file://`` URLs with storage_options."""
    with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (path_a, path_b):
        expected_parts = []
        for path in (path_a, path_b):
            part = create_test_data()
            part.to_netcdf(path, engine="h5netcdf")
            expected_parts.append(part)

        urls = [f"file://{f}" for f in (path_a, path_b)]
        actual = xr.open_mfdataset(
            urls,
            engine="h5netcdf",
            concat_dim="time",
            combine="nested",
            storage_options={"skip_instance_cache": False},
        )
        assert_identical(xr.concat(expected_parts, dim="time"), actual)


@requires_scipy
class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only):
engine: T_NetcdfEngine = "scipy"
Expand Down