Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Changelog.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
Version 1.0.0
-------------

**2025-11-12**

* Added consolidated metadata functionality by `Ezequiel Cimadevilla <https://github.com/zequihg50>`_ in https://github.com/NCAS-CMS/pyfive/pull/145

Version 0.9.0
-------------

Expand Down
14 changes: 9 additions & 5 deletions pyfive/h5d.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,19 @@ def __init__(self, dataobject, noindex=False, pseudo_chunking_size_MB=4):
# No file descriptor => Not Posix
self.posix = False
self.__fh = fh
self.pseudo_chunking_size = pseudo_chunking_size_MB*1024*1024
self.pseudo_chunking_size = pseudo_chunking_size_MB * 1024 * 1024
try:
# maybe this is an S3File instance?
self._filename = getattr(fh,'path')
self._filename = getattr(fh, 'path')
except:
# maybe a remote https file opened as bytes?
# failing that, maybe a memory file, return as None
self._filename = getattr(fh,'full_name','None')
self._filename = getattr(fh, 'full_name', 'None')
else:
# Has a file descriptor => Posix
self.posix = True
self._filename = fh.name
self.pseudo_chunking_size = 0
self.pseudo_chunking_size = 0

self.filter_pipeline = dataobject.filter_pipeline
self.shape = dataobject.shape
Expand Down Expand Up @@ -284,7 +284,11 @@ def first_chunk(self):

"""
self.__chunk_init_check()
return self.get_chunk_info(0).byte_offset
min_offset = None
for k in self._index:
if min_offset is None or self._index[k].byte_offset < min_offset:
min_offset = self._index[k].byte_offset
return min_offset

#### The following method can be used to set pseudo chunking size after the
#### file has been closed and before data transactions. This is pyfive specific
Expand Down
60 changes: 39 additions & 21 deletions pyfive/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from pyfive.h5py import Datatype



class Group(Mapping):
"""
An HDF5 Group which may hold attributes, datasets, or other groups.
Expand Down Expand Up @@ -64,7 +63,6 @@ def __getitem__(self, y):
"""
return self.__getitem_lazy_control(y, noindex=False)


def get_lazy_view(self, y):
"""
This instantiates the object y, and if it is a
Expand All @@ -81,7 +79,6 @@ def get_lazy_view(self, y):

return self.__getitem_lazy_control(y, noindex=True)


def __getitem_lazy_control(self, y, noindex):
"""
This is the routine which actually does the get item
Expand Down Expand Up @@ -130,7 +127,7 @@ def __getitem_lazy_control(self, y, noindex):
if additional_obj != '.':
raise KeyError('%s is a dataset, not a group' % (obj_name))
return Dataset(obj_name, DatasetID(dataobjs, noindex=noindex), self)

try:
# if true, this may well raise a NotImplementedError, if so, we need
# to warn the user, who may be able to use other parts of the data.
Expand Down Expand Up @@ -263,14 +260,35 @@ def __init__(self, filename, mode='r'):
self.userblock_size = 0
super(File, self).__init__('/', dataobjects, self)

@property
def consolidated_metadata(self):
    """Return True when chunk-index metadata is consolidated.

    "Consolidated" here means that, for every chunked dataset directly in
    the root group, every chunk-index B-tree node lies at a byte offset
    strictly before the first data chunk anywhere in the file, so a reader
    can fetch all indexing metadata with one contiguous front-of-file read.

    Returns
    -------
    bool
        True if all B-tree nodes precede the first chunk, and also True
        when the file contains no chunked datasets (vacuously consolidated).
    """
    max_btree = None  # highest byte offset of any B-tree node seen
    min_chunk = None  # lowest byte offset of any data chunk seen

    # NOTE(review): only direct members of the root group are inspected;
    # datasets nested inside sub-groups are not visited. This matches the
    # original behaviour — confirm whether recursion is intended.
    for member in self:
        # Index once and reuse: each __getitem__ builds a fresh object
        # (and its chunk index), so repeated lookups are expensive.
        obj = self[member]
        if not isinstance(obj, Dataset):
            continue
        if obj.id.layout_class != 2:  # 2 == chunked storage layout
            continue
        btree_end = obj.id.btree_range[1]
        if max_btree is None or btree_end > max_btree:
            max_btree = btree_end
        first = obj.id.first_chunk
        if min_chunk is None or first < min_chunk:
            min_chunk = first

    if max_btree is None or min_chunk is None:
        # No chunked datasets found: nothing contradicts consolidation.
        return True
    return max_btree < min_chunk

def __repr__(self):
return '<HDF5 file "%s" (mode r)>' % (os.path.basename(self.filename))

def _get_object_by_address(self, obj_addr):
""" Return the object pointed to by a given address. """
if self._dataobjects.offset == obj_addr:
return self

queue = deque([(self.name.rstrip('/'), self)])
while queue:
base, grp = queue.popleft()
Expand All @@ -288,6 +306,7 @@ def close(self):
""" Close the file. """
if self._close:
self._fh.close()

__del__ = close

def __enter__(self):
Expand Down Expand Up @@ -340,7 +359,6 @@ class Dataset(object):
Group instance containing this dataset.

"""


def __init__(self, name, datasetid, parent):
""" initalize. """
Expand All @@ -349,15 +367,14 @@ def __init__(self, name, datasetid, parent):
self.name = name
self._attrs = None
self._astype = None
self.id=datasetid

self.id = datasetid
""" This is the DatasetID instance which provides the actual data access methods. """

#horrible kludge for now,
#https://github.com/NCAS-CMS/pyfive/issues/13#issuecomment-2557121461
#we hide stuff we need here
# horrible kludge for now,
# https://github.com/NCAS-CMS/pyfive/issues/13#issuecomment-2557121461
# we hide stuff we need here
self._dataobjects = self.id._meta


def __repr__(self):
info = (os.path.basename(self.name), self.shape, self.dtype)
Expand Down Expand Up @@ -392,16 +409,15 @@ def astype(self, dtype):
def len(self):
""" Return the size of the first axis. """
return self.shape[0]

def iter_chunks(self, *args):
return self.id.iter_chunks(args)


@property
def shape(self):
""" shape attribute. """
return self.id.shape

@property
def maxshape(self):
""" maxshape attribute. (None for unlimited dimensions) """
Expand Down Expand Up @@ -473,15 +489,17 @@ def dims(self):
def attrs(self):
""" attrs attribute. """
return self.id._meta.attributes



class DimensionManager(Sequence):
""" Represents a collection of dimensions associated with a dataset. """

def __init__(self, dset):
ndim = len(dset.shape)
dim_list = [[]]*ndim
dim_list = [[]] * ndim
if 'DIMENSION_LIST' in dset.attrs:
dim_list = dset.attrs['DIMENSION_LIST']
dim_labels = [b'']*ndim
dim_labels = [b''] * ndim
if 'DIMENSION_LABELS' in dset.attrs:
dim_labels = dset.attrs['DIMENSION_LABELS']
self._dims = [
Expand Down Expand Up @@ -521,8 +539,9 @@ class AstypeContext(object):
"""
Context manager which allows changing the type read from a dataset.
"""
#FIXME:ENUM should this allow a conversion from enum base types to values using dictionary?
#Probably not, as it would be additional functionality to the h5py interface???

# FIXME:ENUM should this allow a conversion from enum base types to values using dictionary?
# Probably not, as it would be additional functionality to the h5py interface???

def __init__(self, dset, dtype):
self._dset = dset
Expand All @@ -533,4 +552,3 @@ def __enter__(self):

def __exit__(self, *args):
self._dset._astype = None

53 changes: 53 additions & 0 deletions tests/test_consolidated_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import numpy as np
import pytest

import pyfive
import h5py


def test_consolidated_metadata(name, name_consolidated, data, vname):
    """Check File.consolidated_metadata on files written both ways.

    Separate asserts (rather than one `A and B` expression) so a failure
    pinpoints which condition broke.
    """
    # Non-consolidated file: at least one B-tree node sits after the
    # first physical chunk, so the property must report False.
    with pyfive.File(name) as hfile:
        assert hfile[vname].id.btree_range[1] > hfile[vname].id.first_chunk
        assert not hfile.consolidated_metadata

    # Consolidated file: every B-tree node precedes the first chunk.
    with pyfive.File(name_consolidated) as hfile:
        assert hfile[vname].id.btree_range[1] < hfile[vname].id.first_chunk
        assert hfile.consolidated_metadata


@pytest.fixture(scope='module')
def data():
    """Reference array: one value per cell of a (365, 721, 1440) float32 grid."""
    shape = (365, 721, 1440)
    nvals = shape[0] * shape[1] * shape[2]
    return np.arange(nvals, dtype="f4").reshape(shape)


@pytest.fixture(scope='module')
def vname():
    """Name of the dataset created inside each test HDF5 file."""
    return "a"


@pytest.fixture(scope='module')
def name(data, vname, modular_tmp_path):
    """Create an HDF5 file whose chunk B-tree is NOT consolidated.

    Returns the path to a chunked, gzip+shuffle dataset written in two
    out-of-order slabs so that the first logical chunk (0, 0, 0) is not
    the first physical chunk (lowest byte offset) in the file.
    """
    # Use `path` for the local so we don't shadow this fixture's own name.
    path = modular_tmp_path / 'non-consolidated-metadata.hdf5'

    with h5py.File(path, 'w') as hfile:
        hfile.create_dataset(vname, dtype="float32", shape=data.shape,
                             chunks=(1, 721, 1440),
                             compression="gzip", shuffle=True)
        # Write the tail before the head: this way the first logical
        # chunk (0, 0, 0) will not be the first physical chunk.
        # Index by the vname fixture (was hardcoded "a") so the fixture
        # stays correct if vname ever changes.
        hfile[vname][250:] = data[250:]
        hfile[vname][:250] = data[:250]

    return path


@pytest.fixture(scope='module')
def name_consolidated(data, vname, modular_tmp_path):
    """Create an HDF5 file whose chunk B-tree IS consolidated.

    Returns the path to a file written with a large meta_block_size,
    which reserves an up-front block for metadata — presumably placing
    all B-tree nodes before any data chunk (see h5py File docs).
    The data is written in the same out-of-order slabs as the
    non-consolidated fixture so only the metadata placement differs.
    """
    path = modular_tmp_path / 'consolidated-metadata.hdf5'

    with h5py.File(path, 'w', meta_block_size=2 ** 20) as hfile:
        hfile.create_dataset(vname, dtype="float32", shape=data.shape,
                             chunks=(1, 721, 1440),
                             compression="gzip", shuffle=True)
        # Write the tail before the head: this way the first logical
        # chunk (0, 0, 0) will not be the first physical chunk.
        # Index by the vname fixture (was hardcoded "a") for consistency.
        hfile[vname][250:] = data[250:]
        hfile[vname][:250] = data[:250]

    return path
Loading