zarr-developers · sharkinsspatial · Apr 19, 2024 · Apr 19, 2024 · Apr 19, 2024 · Apr 22, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,7 @@ test = [
     "pytest",
     "scipy",
     "pooch",
+    "h5netcdf",
 ]
 
 

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
@@ -0,0 +1,206 @@
+from typing import List, Mapping, Optional
+
+import fsspec
+import h5py
+import numpy as np
+import xarray as xr
+
+from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.zarr import ZArray
+
+
+def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
+    """
+    Build the ChunkManifest describing where an HDF5 dataset's bytes are stored.
+
+    Parameters
+    ----------
+    path : str
+        Path of the HDF5 container file; recorded in every chunk entry.
+    dataset : h5py.Dataset
+        HDF5 dataset for which to create a ChunkManifest.
+
+    Returns
+    -------
+    ChunkManifest
+        A manifest with one entry per stored chunk, keyed by the
+        dot-separated chunk grid indices (e.g. ``"0.1"``).
+
+    Raises
+    ------
+    ValueError
+        If an unchunked dataset reports no offset in the file, or a chunked
+        dataset reports zero chunks.
+    """
+    dataset_id = dataset.id
+    chunk_shape = dataset.chunks
+
+    if chunk_shape is None:
+        # Unchunked dataset: the whole array is described by a single entry.
+        byte_offset = dataset_id.get_offset()
+        if byte_offset is None:
+            raise ValueError("Dataset has no space allocated in the file")
+        # A rank-0 dataset still gets one index component, i.e. the key "0".
+        ndim = len(dataset.shape) or 1
+        single_key = ".".join(["0"] * ndim)
+        whole_array = ChunkEntry(
+            path=path,
+            offset=byte_offset,
+            length=dataset_id.get_storage_size(),
+        )
+        return ChunkManifest(entries={single_key: whole_array})
+
+    num_chunks = dataset_id.get_num_chunks()
+    if num_chunks == 0:
+        raise ValueError("The dataset is chunked but contains no chunks")
+
+    entries = {}
+
+    def record(chunk_info):
+        # Element offset of the chunk divided by the chunk extent gives the
+        # chunk's position on the chunk grid along each axis.
+        grid_index = [
+            start // extent
+            for start, extent in zip(chunk_info.chunk_offset, chunk_shape)
+        ]
+        entries[".".join(str(i) for i in grid_index)] = ChunkEntry(
+            path=path,
+            offset=chunk_info.byte_offset,
+            length=chunk_info.size,
+        )
+
+    # Use the callback-based chunk iterator when this h5py build exposes it,
+    # otherwise fall back to querying each chunk by index.
+    if callable(getattr(dataset_id, "chunk_iter", None)):
+        dataset_id.chunk_iter(record)
+    else:
+        for index in range(num_chunks):
+            record(dataset_id.get_chunk_info(index))
+
+    return ChunkManifest(entries=entries)
+
+
+def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
+    """
+    Get a list of dimension scale names attached to input HDF5 dataset.
+
+    This is required by the xarray package to work with Zarr arrays. Only
+    one dimension scale per dataset dimension is allowed. If the dataset is
+    itself a dimension scale, it is used as its own dimension.
+
+    Parameters
+    ----------
+    dataset : h5py.Dataset
+        HDF5 dataset.
+
+    Returns
+    -------
+    list
+        One name per dataset dimension: the HDF5 path name (without the
+        leading ``/``) of the attached dimension scale, or a
+        ``phony_dim_<n>`` placeholder when no scale is attached. Empty for
+        a scalar (rank-0) dataset.
+
+    Raises
+    ------
+    ValueError
+        If more than one dimension scale is attached to a dimension of a
+        dataset that is not itself a dimension scale.
+    """
+    dims: List[str] = []
+    for n in range(len(dataset.shape)):
+        num_scales = len(dataset.dims[n])
+        if num_scales == 1:
+            # Strip the leading "/" from the scale's HDF5 path name.
+            dims.append(dataset.dims[n][0].name[1:])
+        elif h5py.h5ds.is_scale(dataset.id):
+            dims.append(dataset.name[1:])
+        elif num_scales > 1:
+            raise ValueError(
+                f"{dataset.name}: {num_scales} "
+                f"dimension scales attached to dimension #{n}"
+            )
+        else:
+            # Some HDF5 files do not have dimension scales.
+            # If this is the case, `num_scales` will be 0.
+            # In this case, we mimic netCDF4 and assign phony dimension names.
+            # See https://github.com/fsspec/kerchunk/issues/41
+            dims.append(f"phony_dim_{n}")
+    # Returned unconditionally so that a rank-0 dataset yields [] rather
+    # than None.
+    return dims
+
+
+def _extract_attrs(dataset: h5py.Dataset) -> dict:
+    """
+    Extract attributes from an HDF5 dataset.
+
+    Attributes used internally by HDF5 dimension scales and netCDF4 are
+    dropped, and the remaining values are converted to plain Python
+    objects so they can be JSON encoded.
+
+    Parameters
+    ----------
+    dataset : h5py.Dataset
+        An HDF5 dataset.
+
+    Returns
+    -------
+    dict
+        Mapping of attribute name to converted value. Empty when the
+        dataset has no attributes or all of them are skipped.
+    """
+    _HIDDEN_ATTRS = {
+        "REFERENCE_LIST",
+        "CLASS",
+        "DIMENSION_LIST",
+        "NAME",
+        "_Netcdf4Dimid",
+        "_Netcdf4Coordinates",
+        "_nc3_strict",
+        "_NCProperties",
+    }
+    attrs = {}
+    for n, v in dataset.attrs.items():
+        if n in _HIDDEN_ATTRS:
+            continue
+        # Fix some attribute values to avoid JSON encoding exceptions...
+        if isinstance(v, bytes):
+            # An empty byte string is replaced by a single space.
+            v = v.decode("utf-8") or " "
+        elif isinstance(v, (np.ndarray, np.number, np.bool_)):
+            if v.dtype.kind == "S":
+                v = v.astype(str)
+            if n == "_FillValue":
+                # A numpy-typed ``_FillValue`` is not copied into the attrs.
+                continue
+            elif v.size == 1:
+                # Unwrap single-element values down to a Python scalar.
+                v = v.flatten()[0]
+                if isinstance(v, (np.ndarray, np.number, np.bool_)):
+                    v = v.tolist()
+            else:
+                v = v.tolist()
+        elif isinstance(v, h5py._hl.base.Empty):
+            v = ""
+        if v == "DIMENSION_SCALE":
+            continue
+
+        attrs[n] = v
+    # Return only after every attribute has been visited; returning from
+    # inside the loop would yield just the first kept attribute, or None
+    # when nothing is kept.
+    return attrs
+
+
+def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
+    """
+    Wrap an HDF5 dataset as an xarray Variable backed by a ManifestArray.
+
+    Parameters
+    ----------
+    path : str
+        Path of the HDF5 container file, passed on to the chunk manifest.
+    dataset : h5py.Dataset
+        HDF5 dataset to describe.
+
+    Returns
+    -------
+    xr.Variable
+        Variable whose data is a ManifestArray built from the dataset's
+        array metadata and chunk manifest, with the dataset's dimension
+        names and extracted attributes.
+    """
+    # This chunk determination logic mirrors zarr-python's create
+    # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
+    chunk_shape = dataset.chunks or dataset.shape
+    array_metadata = ZArray(
+        chunks=chunk_shape,
+        compressor=dataset.compression,
+        dtype=dataset.dtype,
+        fill_value=dataset.fillvalue,
+        filters=None,
+        order="C",
+        shape=dataset.shape,
+        zarr_format=2,
+    )
+    return xr.Variable(
+        data=ManifestArray(
+            zarray=array_metadata,
+            chunkmanifest=_dataset_chunk_manifest(path, dataset),
+        ),
+        dims=_dataset_dims(dataset),
+        attrs=_extract_attrs(dataset),
+    )
+
+
+def virtual_vars_from_hdf(
+    path: str,
+    drop_variables: Optional[List[str]] = None,
+) -> Mapping[str, xr.Variable]:
+    """
+    Build virtual xarray Variables for the top-level datasets of an HDF5 file.
+
+    Parameters
+    ----------
+    path : str
+        Path or URL of the HDF5 file; resolved to a filesystem with fsspec.
+    drop_variables : list of str, optional
+        Names of top-level members to skip. Defaults to skipping nothing.
+
+    Returns
+    -------
+    Mapping[str, xr.Variable]
+        One Variable per top-level dataset that was not dropped, keyed by
+        dataset name.
+
+    Raises
+    ------
+    NotImplementedError
+        If a top-level member that was not dropped is not a dataset
+        (e.g. a group).
+    """
+    if drop_variables is None:
+        drop_variables = []
+    fs, _ = fsspec.core.url_to_fs(path)
+    variables = {}
+    # Both the fsspec file object and the HDF5 handle are closed on exit,
+    # including when NotImplementedError is raised part-way through.
+    with fs.open(path, "rb") as open_file, h5py.File(open_file, mode="r") as f:
+        for key in f.keys():
+            if key in drop_variables:
+                continue
+            member = f[key]
+            if not isinstance(member, h5py.Dataset):
+                raise NotImplementedError("Nested groups are not yet supported")
+            variables[key] = _dataset_to_variable(path, member)
+
+    return variables
diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
@@ -0,0 +1,119 @@
+import h5py
+import numpy as np
+import pytest
+import xarray as xr
+
+
+@pytest.fixture
+def empty_chunks_netcdf4_file(tmpdir):
+    """Write a Dataset with an empty ``data`` variable via h5netcdf; return the path."""
+    filepath = f"{tmpdir}/empty_chunks.nc"
+    xr.Dataset({"data": []}).to_netcdf(filepath, engine="h5netcdf")
+    return filepath
+
+
+@pytest.fixture
+def empty_dataset_netcdf4_file(tmpdir):
+    """Create an HDF5 file with a zero-length ``data`` dataset; return the path."""
+    filepath = f"{tmpdir}/empty_dataset.nc"
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset("data", shape=(0,), dtype="f")
+    return filepath
+
+
+@pytest.fixture
+def no_chunks_netcdf4_file(tmpdir):
+    """Create an HDF5 file with an unchunked 10x10 ``data`` dataset; return the path."""
+    filepath = f"{tmpdir}/no_chunks.nc"
+    data = np.random.random((10, 10))
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data, chunks=None)
+    return filepath
+
+
+@pytest.fixture
+def chunked_netcdf4_file(tmpdir):
+    """Create an HDF5 file with a 100x100 ``data`` dataset in 50x50 chunks; return the path."""
+    filepath = f"{tmpdir}/chunks.nc"
+    data = np.random.random((100, 100))
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data, chunks=(50, 50))
+    return filepath
+
+
+@pytest.fixture
+def single_dimension_scale_netcdf4_file(tmpdir):
+    """Create an HDF5 file where ``data`` has one scale, ``x``, on dimension 0; return the path."""
+    filepath = f"{tmpdir}/single_dimension_scale.nc"
+    data = [1, 2]
+    x = [0, 1]
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data)
+        f.create_dataset(name="x", data=x)
+        f["x"].make_scale()
+        f["data"].dims[0].attach_scale(f["x"])
+    return filepath
+
+
+@pytest.fixture
+def is_scale_netcdf4_file(tmpdir):
+    """Create an HDF5 file whose ``data`` dataset is itself a dimension scale; return the path."""
+    filepath = f"{tmpdir}/is_scale.nc"
+    data = [1, 2]
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data)
+        f["data"].make_scale()
+    return filepath
+
+
+@pytest.fixture
+def multiple_dimension_scales_netcdf4_file(tmpdir):
+    """Create an HDF5 file where ``data`` has two scales on dimension 0; return the path."""
+    filepath = f"{tmpdir}/multiple_dimension_scales.nc"
+    data = [1, 2]
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data)
+        f.create_dataset(name="x", data=[0, 1])
+        f.create_dataset(name="y", data=[0, 1])
+        f["x"].make_scale()
+        f["y"].make_scale()
+        # Both scales go on the same dimension of ``data``.
+        f["data"].dims[0].attach_scale(f["x"])
+        f["data"].dims[0].attach_scale(f["y"])
+    return filepath
+
+
+@pytest.fixture
+def chunked_dimensions_netcdf4_file(tmpdir):
+    """Create an HDF5 file with chunked ``data`` and ``x``/``y`` attached to its dimensions; return the path."""
+    filepath = f"{tmpdir}/chunks_dimension.nc"
+    data = np.random.random((100, 100))
+    x = np.random.random((100))
+    y = np.random.random((100))
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data, chunks=(50, 50))
+        f.create_dataset(name="x", data=x)
+        f.create_dataset(name="y", data=y)
+        f["data"].dims[0].attach_scale(f["x"])
+        f["data"].dims[1].attach_scale(f["y"])
+    return filepath
+
+
+@pytest.fixture
+def string_attribute_netcdf4_file(tmpdir):
+    """Create an HDF5 file whose ``data`` dataset has one string attribute; return the path."""
+    filepath = f"{tmpdir}/attributes.nc"
+    data = np.random.random((10, 10))
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data, chunks=None)
+        f["data"].attrs["attribute_name"] = "attribute_name"
+    return filepath
+
+
+@pytest.fixture
+def group_netcdf4_file(tmpdir):
+    """Create an HDF5 file containing only an empty group named ``group``; return the path."""
+    filepath = f"{tmpdir}/group.nc"
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_group("group")
+    return filepath
+
+
+@pytest.fixture
+def multiple_datasets_netcdf4_file(tmpdir):
+    """Create an HDF5 file with two unchunked datasets, ``data`` and ``data2``; return the path."""
+    filepath = f"{tmpdir}/multiple_datasets.nc"
+    data = np.random.random((10, 10))
+    # Close the file before handing out the path so its contents are flushed.
+    with h5py.File(filepath, "w") as f:
+        f.create_dataset(name="data", data=data, chunks=None)
+        f.create_dataset(name="data2", data=data, chunks=None)
+    return filepath