
Add a function to produce a ManifestStore from HDF5 files #516


Merged · 12 commits · Apr 4, 2025
171 changes: 140 additions & 31 deletions virtualizarr/readers/hdf/hdf.py
@@ -4,6 +4,7 @@
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Hashable,
    Iterable,
@@ -23,6 +24,8 @@
    ChunkEntry,
    ChunkManifest,
    ManifestArray,
    ManifestGroup,
    ManifestStore,
)
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.manifests.utils import create_v3_array_metadata
@@ -41,6 +44,7 @@
if TYPE_CHECKING:
    from h5py import Dataset as H5Dataset
    from h5py import Group as H5Group
    from obstore.store import ObjectStore

FillValueType = Union[
    int,
@@ -58,6 +62,111 @@


class HDFVirtualBackend(VirtualBackend):
    @staticmethod
    def _construct_manifest_array(
Comment on lines +65 to +66
Member:

Every staticmethod here aside from open_virtual_dataset may as well be an actual function; you'll need to do that anyway when the reader is later refactored into a single function.

Member:

But you could leave that to do in #498.

Collaborator:

👍 That is the plan. I think @maxrjones' intent here was to make a very small change set to demonstrate the feasibility of using ManifestStore, and that we would refactor the structure in a PR for #498.

        path: str,
        dataset: H5Dataset,
        group: str,
    ) -> ManifestArray:
        """
        Construct a ManifestArray from an h5py dataset.

        Parameters
        ----------
        path : str
            The path of the hdf5 file.
        dataset : h5py.Dataset
            An h5py dataset.
        group : str
            Name of the group containing this h5py.Dataset.

        Returns
        -------
        ManifestArray
        """
        chunks = dataset.chunks if dataset.chunks else dataset.shape
        codecs = codecs_from_dataset(dataset)
        attrs = HDFVirtualBackend._extract_attrs(dataset)
        dtype = dataset.dtype

        codec_configs = [
            numcodec_config_to_configurable(codec.get_config()) for codec in codecs
        ]

        fill_value = dataset.fillvalue.item()
        dims = tuple(HDFVirtualBackend._dataset_dims(dataset, group=group))
        metadata = create_v3_array_metadata(
            shape=dataset.shape,
            data_type=dtype,
            chunk_shape=chunks,
            fill_value=fill_value,
            codecs=codec_configs,
            dimension_names=dims,
            attributes=attrs,
        )

        manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
        return ManifestArray(metadata=metadata, chunkmanifest=manifest)
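To make the refactor discussed in the review thread above concrete: each of these staticmethods could become a plain module-level function, with call sites migrating before the class is dismantled in #498. A purely illustrative interim sketch (the wrapper name is hypothetical and not part of this PR):

```python
from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.hdf import HDFVirtualBackend


# Hypothetical interim wrapper: exposes the staticmethod as a plain function
# so call sites can migrate ahead of the #498 restructuring.
def construct_manifest_array(path, dataset, group) -> ManifestArray:
    return HDFVirtualBackend._construct_manifest_array(
        path=path, dataset=dataset, group=group
    )
```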

    @staticmethod
    def _construct_manifest_group(
        store: ObjectStore,
        filepath: str,
        *,
        group: str | None = None,
        drop_variables: Optional[List[str]] = None,
    ) -> ManifestGroup:
        """
        Construct a ManifestGroup from a group within an HDF file.
        """
        from virtualizarr.utils import ObstoreReader

        if drop_variables is None:
            drop_variables = []

        reader = ObstoreReader(store=store, path=filepath)
        f = h5py.File(reader, mode="r")

        if group is not None and group != "":
            g = f[group]
            group_name = group
            if not isinstance(g, h5py.Group):
                raise ValueError("The provided group is not an HDF group")
        else:
            g = f["/"]
            group_name = "/"

        manifest_dict = {}
        non_coordinate_dimension_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
            group=g
        )
        drop_variables = list(set(drop_variables + non_coordinate_dimension_vars))
        attrs: dict[str, Any] = {}
        for key in g.keys():
            if key not in drop_variables:
                if isinstance(g[key], h5py.Dataset):
                    variable = HDFVirtualBackend._construct_manifest_array(
                        path=filepath,
                        dataset=g[key],
                        group=group_name,
                    )
                    if variable is not None:
                        manifest_dict[key] = variable
        return ManifestGroup(arrays=manifest_dict, attributes=attrs)

    @staticmethod
    def _create_manifest_store(
        filepath: str,
        *,
        prefix: str,
        store: ObjectStore,
        group: str | None = None,
    ) -> ManifestStore:
        # Create a group containing dataset level metadata and all the manifest arrays
        manifest_group = HDFVirtualBackend._construct_manifest_group(
            store=store, filepath=filepath, group=group
        )
        # Convert to a manifest store
        return ManifestStore(stores={prefix: store}, group=manifest_group)
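A minimal sketch of how this internal entry point is exercised, mirroring the new roundtrip test below (the file path is hypothetical; assumes obstore's LocalStore):

```python
import xarray as xr
from obstore.store import LocalStore

from virtualizarr.readers.hdf import HDFVirtualBackend

# Build a ManifestStore for a local NetCDF4/HDF5 file, then open it lazily
# through xarray's zarr engine, which reads from the ManifestStore directly.
store = HDFVirtualBackend._create_manifest_store(
    filepath="/tmp/example.nc",  # hypothetical local file
    store=LocalStore(),
    prefix="file://",
)
ds = xr.open_dataset(store, engine="zarr", consolidated=False, zarr_format=3)
```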

    @staticmethod
    def open_virtual_dataset(
        filepath: str,
@@ -119,7 +228,7 @@ def open_virtual_dataset(
    def _dataset_chunk_manifest(
        path: str,
        dataset: H5Dataset,
-    ) -> Optional[ChunkManifest]:
+    ) -> ChunkManifest:
        """
        Generate ChunkManifest for HDF5 dataset.

@@ -138,7 +247,7 @@ def _dataset_chunk_manifest(
        dsid = dataset.id
        if dataset.chunks is None:
            if dsid.get_offset() is None:
-                return None
+                chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
            else:
                key_list = [0] * (len(dataset.shape) or 1)
                key = ".".join(map(str, key_list))
@@ -149,42 +258,42 @@
                chunk_key = ChunkKey(key)
                chunk_entries = {chunk_key: chunk_entry}
                chunk_manifest = ChunkManifest(entries=chunk_entries)
-            return chunk_manifest
        else:
            num_chunks = dsid.get_num_chunks()
            if num_chunks == 0:
-                raise ValueError("The dataset is chunked but contains no chunks")
-            shape = tuple(
-                math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)
-            )
-            paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
-            offsets = np.empty(shape, dtype=np.uint64)
-            lengths = np.empty(shape, dtype=np.uint64)
-
-            def get_key(blob):
-                return tuple(
-                    [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
-                )
+                chunk_manifest = ChunkManifest(entries={}, shape=dataset.shape)
+            else:
+                shape = tuple(
+                    math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)
+                )
+                paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
+                offsets = np.empty(shape, dtype=np.uint64)
+                lengths = np.empty(shape, dtype=np.uint64)
+
+                def get_key(blob):
+                    return tuple(
+                        [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+                    )

-            def add_chunk_info(blob):
-                key = get_key(blob)
-                paths[key] = path
-                offsets[key] = blob.byte_offset
-                lengths[key] = blob.size
+                def add_chunk_info(blob):
+                    key = get_key(blob)
+                    paths[key] = path
+                    offsets[key] = blob.byte_offset
+                    lengths[key] = blob.size

-            has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
-            if has_chunk_iter:
-                dsid.chunk_iter(add_chunk_info)
-            else:
-                for index in range(num_chunks):
-                    add_chunk_info(dsid.get_chunk_info(index))
+                has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+                if has_chunk_iter:
+                    dsid.chunk_iter(add_chunk_info)
+                else:
+                    for index in range(num_chunks):
+                        add_chunk_info(dsid.get_chunk_info(index))

-            chunk_manifest = ChunkManifest.from_arrays(
-                paths=paths,  # type: ignore
-                offsets=offsets,
-                lengths=lengths,
-            )
-            return chunk_manifest
+                chunk_manifest = ChunkManifest.from_arrays(
+                    paths=paths,  # type: ignore
+                    offsets=offsets,
+                    lengths=lengths,
+                )
+        return chunk_manifest

    @staticmethod
    def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
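An aside on the chunk-indexing arithmetic in _dataset_chunk_manifest above: the chunk-grid shape rounds up with math.ceil so partial edge chunks get a slot, and get_key floor-divides a chunk's element offset by the chunk shape to find its grid position. A small worked sketch with toy numbers:

```python
import math

dataset_shape = (100, 85)  # toy dataset: 100 x 85 elements
chunk_shape = (10, 10)     # stored as 10 x 10 chunks

# Grid shape rounds up, so the ragged final column still gets a slot: (10, 9)
grid_shape = tuple(math.ceil(a / b) for a, b in zip(dataset_shape, chunk_shape))

# A chunk whose data begins at element offset (30, 50) sits at grid key (3, 5)
chunk_offset = (30, 50)
key = tuple(a // b for a, b in zip(chunk_offset, chunk_shape))

print(grid_shape, key)  # (10, 9) (3, 5)
```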
17 changes: 8 additions & 9 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -16,19 +16,18 @@ class TestDatasetChunkManifest:
    def test_empty_chunks(self, empty_chunks_hdf5_file):
        f = h5py.File(empty_chunks_hdf5_file)
        ds = f["data"]
-        with pytest.raises(ValueError, match="chunked but contains no chunks"):
-            HDFVirtualBackend._dataset_chunk_manifest(
-                path=empty_chunks_hdf5_file, dataset=ds
-            )
+        manifest = HDFVirtualBackend._dataset_chunk_manifest(
+            path=empty_chunks_hdf5_file, dataset=ds
+        )
+        assert manifest.shape_chunk_grid == (0,)

+    @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
    def test_empty_dataset(self, empty_dataset_hdf5_file):
        f = h5py.File(empty_dataset_hdf5_file)
        ds = f["data"]
-        with pytest.raises(ValueError, match="no space allocated in the file"):
-            HDFVirtualBackend._dataset_chunk_manifest(
-                path=empty_dataset_hdf5_file, dataset=ds
-            )
+        manifest = HDFVirtualBackend._dataset_chunk_manifest(
+            path=empty_dataset_hdf5_file, dataset=ds
+        )
Comment on lines +27 to +29
Member:

Generally it's better to write tests that use the higher-level public API instead of directly calling internals if you can help it, but that's just a nit.

Collaborator:

👍 I think the tests will be easier to structure when we move away from the VirtualBackend base to something more functional, and the returned ManifestStore will be simpler to introspect and reason about than a complete xr.Dataset.

+        assert manifest.shape_chunk_grid == (0,)

    def test_no_chunking(self, no_chunks_hdf5_file):
        f = h5py.File(no_chunks_hdf5_file)
@@ -0,0 +1,40 @@
import numpy as np
import pytest
import xarray as xr

from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
    requires_hdf5plugin,
    requires_obstore,
)


@pytest.fixture(name="basic_ds")
def basic_ds():
    x = np.arange(100)
    y = np.arange(100)
    temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
    ds = xr.Dataset(
        {"temperature": (["x", "y"], temperature)},
        coords={"x": np.arange(100), "y": np.arange(100)},
    )
    return ds


@requires_hdf5plugin
@requires_obstore
class TestHDFManifestStore:
    def test_roundtrip_simple_virtualdataset(self, tmpdir, basic_ds):
        """Roundtrip a dataset to/from NetCDF with the HDF reader and ManifestStore."""
        from obstore.store import LocalStore

        filepath = f"{tmpdir}/basic_ds_roundtrip.nc"
        basic_ds.to_netcdf(filepath, engine="h5netcdf")
        store = HDFVirtualBackend._create_manifest_store(
            filepath=filepath, store=LocalStore(), prefix="file://"
        )
        roundtripped_ds = xr.open_dataset(
            store, engine="zarr", consolidated=False, zarr_format=3
        )
        xr.testing.assert_allclose(basic_ds, roundtripped_ds)
21 changes: 20 additions & 1 deletion virtualizarr/utils.py
@@ -2,6 +2,7 @@

import importlib
import io
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union

from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec
@@ -12,14 +13,32 @@
if TYPE_CHECKING:
    import fsspec.core
    import fsspec.spec
    from obstore import ReadableFile
    from obstore.store import ObjectStore

    # See pangeo_forge_recipes.storage
    OpenFileType = Union[
        fsspec.core.OpenFile, fsspec.spec.AbstractBufferedFile, io.IOBase
    ]


-from dataclasses import dataclass, field
+class ObstoreReader:
+    _reader: ReadableFile
+
+    def __init__(self, store: ObjectStore, path: str) -> None:
+        import obstore as obs
+
+        self._reader = obs.open_reader(store, path)
+
+    def read(self, size: int, /) -> bytes:
+        return self._reader.read(size).to_bytes()
+
+    def seek(self, offset: int, whence: int = 0, /):
+        # TODO: Check on default for whence
+        return self._reader.seek(offset, whence)
+
+    def tell(self) -> int:
+        return self._reader.tell()
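ObstoreReader exposes exactly the file-like trio h5py needs (read, seek, tell), which is why _construct_manifest_group can hand it straight to h5py.File. A minimal usage sketch, assuming a local file (the path is hypothetical):

```python
import h5py
from obstore.store import LocalStore

from virtualizarr.utils import ObstoreReader

# Any obstore ObjectStore works here; LocalStore reads from the local filesystem.
reader = ObstoreReader(store=LocalStore(), path="/tmp/example.nc")  # hypothetical file
f = h5py.File(reader, mode="r")
print(list(f.keys()))
```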


@dataclass