Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions pyfive/dataobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID
from pyfive.h5d import DatasetID
from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub
from pyfive.h5py import Empty

# these constants happen to have the same value...
UNLIMITED_SIZE = UNDEFINED_ADDRESS
Expand Down Expand Up @@ -230,11 +231,17 @@ def _parse_attribute_msg(self, buffer, offset):

# Read the dataspace information
shape, maxshape = determine_data_shape(buffer, offset)
items = int(np.prod(shape))
offset += _padded_size(attr_dict['dataspace_size'], padding_multiple)

# Read the value(s)
value = self._attr_value(dtype, buffer, items, offset)
# detect Empty/NULL dataspace
if shape is None:
value = Empty(dtype=dtype)
else:
items = int(np.prod(shape))

offset += _padded_size(attr_dict['dataspace_size'], padding_multiple)

# Read the value(s)
value = self._attr_value(dtype, buffer, items, offset)

if shape == ():
value = value[0]
Expand All @@ -250,6 +257,12 @@ def _parse_attribute_msg(self, buffer, offset):

def _attr_value(self, dtype, buf, count, offset):
""" Retrieve an HDF5 attribute value from a buffer. """

# first handle ENUMERATION, we just extract the dtype
if isinstance(dtype, tuple):
if dtype[0] == "ENUMERATION":
dtype = np.dtype(dtype[1], metadata={'enum': dtype[2]})

if isinstance(dtype, tuple):
dtype_class = dtype[0]
if dtype_class == 'VLEN_STRING':
Expand All @@ -272,8 +285,6 @@ def _attr_value(self, dtype, buf, count, offset):
vlen, vlen_data = self._vlen_size_and_data(buf, offset)
value[i] = self._attr_value(base_dtype, vlen_data, vlen, 0)
offset += 16
elif dtype_class == 'ENUMERATION':
return np.dtype(dtype[1],metadata={'enum':dtype[2]})
else:
raise NotImplementedError
else:
Expand Down Expand Up @@ -719,6 +730,9 @@ def determine_data_shape(buf, offset):
elif version == 2:
header = _unpack_struct_from(DATASPACE_MSG_HEADER_V2, buf, offset)
assert header['version'] == 2
# check for Empty aka NULL dataspace in V2 and return early
if header["type"] == 2:
return None, None
offset += DATASPACE_MSG_HEADER_V2_SIZE
else:
raise InvalidHDF5File('unknown dataspace message version')
Expand Down
3 changes: 2 additions & 1 deletion pyfive/h5py.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ class Empty:
Proxy object to represent empty/null dataspaces (a.k.a H5S_NULL).
This can have an associated dtype, but has no shape or data. This is not
the same as an array with shape (0,). This class is provided for compatibility
with the H5Py API to support h5netcdf. It is not used by pyfive.
with the H5Py API to support h5netcdf. In pyfive this is used to wrap
attributes associated with null dataspaces.
"""
shape = None
size = None
Expand Down
191 changes: 106 additions & 85 deletions tests/make_attr_datatypes_file.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,111 @@
#! /usr/bin/env python

""" Create a HDF5 file with all the supported attribute types. """
import sys
import h5py
import numpy as np


# Write one attribute per supported datatype to the root group of
# 'attr_datatypes.hdf5'.  The context manager guarantees the file is flushed
# and closed even if one of the attribute writes below raises.
with h5py.File('attr_datatypes.hdf5', 'w') as f:
    attrs = f.attrs

    # integers
    attrs.create('int08_little', -123, dtype='<i1')
    attrs.create('int16_little', -123, dtype='<i2')
    attrs.create('int32_little', -123, dtype='<i4')
    attrs.create('int64_little', -123, dtype='<i8')

    attrs.create('uint08_little', 130, dtype='<u1')
    attrs.create('uint16_little', 32770, dtype='<u2')
    attrs.create('uint32_little', 2147483650, dtype='<u4')
    attrs.create('uint64_little', 9223372036854775810, dtype='<u8')

    attrs.create('int08_big', -123, dtype='>i1')
    attrs.create('int16_big', -123, dtype='>i2')
    attrs.create('int32_big', -123, dtype='>i4')
    attrs.create('int64_big', -123, dtype='>i8')

    attrs.create('uint08_big', 130, dtype='>u1')
    attrs.create('uint16_big', 32770, dtype='>u2')
    attrs.create('uint32_big', 2147483650, dtype='>u4')
    attrs.create('uint64_big', 9223372036854775810, dtype='>u8')

    # floating point
    attrs.create('float32_little', 123, dtype='<f4')
    attrs.create('float64_little', 123, dtype='<f8')

    attrs.create('float32_big', 123, dtype='>f4')
    attrs.create('float64_big', 123, dtype='>f8')

    # fixed-length strings
    attrs.create('string_one', b'H', dtype='|S1')
    attrs.create('string_two', b'Hi', dtype='|S2')

    # variable length strings
    attrs['vlen_string'] = b'Hello'

    # variable length unicode
    attrs['vlen_unicode'] = u'Hello' + chr(0x00A7)

    # arrayed numeric types
    attrs.create('int32_array', [-123, 45], dtype='<i4')
    attrs.create('uint64_array', [12, 34], dtype='>u8')
    attrs.create('float32_array', [123, 456], dtype='<f4')

    # arrayed variable length strings
    attrs['vlen_str_array'] = [b'Hello', b'World!']

    # variable length sequences
    # NOTE: the builtin ``object`` is used because the ``np.object`` alias was
    # removed from NumPy (1.24) and raises AttributeError there.
    val = np.empty((2, ), dtype=object)
    val[0] = np.array([-1, 2], dtype='<i4')
    val[1] = np.array([3, 4, 5], dtype='<i4')
    dt = h5py.special_dtype(vlen=np.dtype('<i4'))
    attrs.create('vlen_int32', val, dtype=dt)

    val = np.empty((3, ), dtype=object)
    val[0] = np.array([1, 2], dtype='>u8')
    val[1] = np.array([3, 4, 5], dtype='>u8')
    val[2] = np.array([42], dtype='>u8')
    dt = h5py.special_dtype(vlen=np.dtype('>u8'))
    attrs.create('vlen_uint64', val, dtype=dt)

    val = np.empty((3, ), dtype=object)
    val[0] = np.array([0, ], dtype='<f4')
    val[1] = np.array([1, 2, 3], dtype='<f4')
    val[2] = np.array([4, 5], dtype='<f4')
    dt = h5py.special_dtype(vlen=np.dtype('<f4'))
    attrs.create('vlen_float32', val, dtype=dt)

    # TODO more complex datatypes
    # complex H5T_COMPOUND
    attrs.create('complex64_little', 123+456.j, dtype='<c8')
    attrs.create('complex128_little', 123+456.j, dtype='<c16')

    # The "_big" attributes must really be big-endian; they were previously
    # written with the little-endian '<' dtypes.
    attrs.create('complex64_big', 123+456.j, dtype='>c8')
    attrs.create('complex128_big', 123+456.j, dtype='>c16')

    # TODO: booleans (H5T_ENUM) are not written yet, e.g.
    #     attrs.create('bool', True, dtype=np.bool_)
from pathlib import Path


def create_file(path):
    """Create an HDF5 file with all the supported attribute types.

    Every attribute is attached to the root group of the file.

    Parameters
    ----------
    path : str or path-like
        Location of the file to write.  It is passed to ``h5py.File`` and
        opened in ``'w'`` mode.
    """
    with h5py.File(path, 'w') as f:
        attrs = f.attrs

        # integers
        attrs.create('int08_little', -123, dtype='<i1')
        attrs.create('int16_little', -123, dtype='<i2')
        attrs.create('int32_little', -123, dtype='<i4')
        attrs.create('int64_little', -123, dtype='<i8')

        attrs.create('uint08_little', 130, dtype='<u1')
        attrs.create('uint16_little', 32770, dtype='<u2')
        attrs.create('uint32_little', 2147483650, dtype='<u4')
        attrs.create('uint64_little', 9223372036854775810, dtype='<u8')

        attrs.create('int08_big', -123, dtype='>i1')
        attrs.create('int16_big', -123, dtype='>i2')
        attrs.create('int32_big', -123, dtype='>i4')
        attrs.create('int64_big', -123, dtype='>i8')

        attrs.create('uint08_big', 130, dtype='>u1')
        attrs.create('uint16_big', 32770, dtype='>u2')
        attrs.create('uint32_big', 2147483650, dtype='>u4')
        attrs.create('uint64_big', 9223372036854775810, dtype='>u8')

        # floating point
        attrs.create('float32_little', 123, dtype='<f4')
        attrs.create('float64_little', 123, dtype='<f8')

        attrs.create('float32_big', 123, dtype='>f4')
        attrs.create('float64_big', 123, dtype='>f8')

        # fixed-length strings
        attrs.create('string_one', b'H', dtype='|S1')
        attrs.create('string_two', b'Hi', dtype='|S2')

        # variable length strings
        attrs['vlen_string'] = b'Hello'

        # variable length unicode
        attrs['vlen_unicode'] = u'Hello' + chr(0x00A7)

        # arrayed numeric types
        attrs.create('int32_array', [-123, 45], dtype='<i4')
        attrs.create('uint64_array', [12, 34], dtype='>u8')
        attrs.create('float32_array', [123, 456], dtype='<f4')

        # arrayed strings: once as a NumPy bytes array and once as a plain
        # list of bytes objects
        # see https://github.com/NCAS-CMS/pyfive/pull/102#discussion_r2393563713
        attrs['vlen_str_array'] = np.array([b'Hello', b'World!'], dtype="|S")
        attrs['vlen_str_array1'] = [b'Hello', b'World!']

        # variable length sequences
        val = np.empty((2, ), dtype=object)
        val[0] = np.array([-1, 2], dtype='<i4')
        val[1] = np.array([3, 4, 5], dtype='<i4')
        dt = h5py.special_dtype(vlen=np.dtype('<i4'))
        attrs.create('vlen_int32', val, dtype=dt)

        val = np.empty((3, ), dtype=object)
        val[0] = np.array([1, 2], dtype='>u8')
        val[1] = np.array([3, 4, 5], dtype='>u8')
        val[2] = np.array([42], dtype='>u8')
        dt = h5py.special_dtype(vlen=np.dtype('>u8'))
        attrs.create('vlen_uint64', val, dtype=dt)

        val = np.empty((3, ), dtype=object)
        val[0] = np.array([0, ], dtype='<f4')
        val[1] = np.array([1, 2, 3], dtype='<f4')
        val[2] = np.array([4, 5], dtype='<f4')
        dt = h5py.special_dtype(vlen=np.dtype('<f4'))
        attrs.create('vlen_float32', val, dtype=dt)

        # TODO more complex datatypes
        # complex H5T_COMPOUND
        attrs.create('complex64_little', 123+456.j, dtype='<c8')
        attrs.create('complex128_little', 123+456.j, dtype='<c16')

        # The "_big" attributes must really be big-endian; they were
        # previously written with the little-endian '<' dtypes.
        attrs.create('complex64_big', 123+456.j, dtype='>c8')
        attrs.create('complex128_big', 123+456.j, dtype='>c16')

        # TODO: booleans (H5T_ENUM) are not written yet, e.g.
        #     attrs.create('bool', True, dtype=np.bool_)

        # H5T_ENUM
        # Define an enum dtype
        enum_dtype = h5py.special_dtype(
            enum=(np.int32, {'one': 1, 'two': 2, 'three': 3})
        )
        # Create an attribute with that enum dtype
        attrs.create('enum', 2, dtype=enum_dtype)

        # empty string with NULL dataspace
        # see https://github.com/NCAS-CMS/pyfive/issues/100
        attrs.create('empty_string', h5py.Empty(dtype=np.dtype('|S1')))


if __name__ == "__main__":
    # Write next to this script unless an output path is given as the first
    # command-line argument.
    default_path = Path(__file__).parent / "attr_datatypes.hdf5"
    if len(sys.argv) > 1:
        filepath = Path(sys.argv[1])
    else:
        filepath = default_path
    create_file(filepath)
49 changes: 49 additions & 0 deletions tests/test_attr_datatypes.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
""" Unit tests for pyfive. """
import os
import sys
import subprocess

import numpy as np
from numpy.testing import assert_array_equal
import pytest

import pyfive

DIRNAME = os.path.dirname(__file__)
ATTR_DATATYPES_HDF5_FILE = os.path.join(DIRNAME, 'attr_datatypes.hdf5')
MAKE_ATTR_DATATYPES_SCRIPT = os.path.join(DIRNAME, 'make_attr_datatypes_file.py')
ATTR_DATATYPES_HDF5_FILE_2 = os.path.join(DIRNAME, 'attr_datatypes_2.hdf5')


@pytest.fixture(scope="module")
def attr_datatypes_hdf5(tmp_path_factory):
    """Generate a fresh attribute-datatypes HDF5 file for this module.

    The generator script is run in a subprocess with the current interpreter
    and told to write into a temporary directory.  The path of the generated
    file is returned as a string.
    """
    target = tmp_path_factory.mktemp("attr_datatypes") / "attr_datatypes.hdf5"
    command = [sys.executable, MAKE_ATTR_DATATYPES_SCRIPT, str(target)]
    # check=True makes a failing generator script fail the fixture.
    subprocess.run(command, check=True)
    return str(target)


def test_numeric_scalar_attr_datatypes():

with pyfive.File(ATTR_DATATYPES_HDF5_FILE) as hfile:
Expand Down Expand Up @@ -85,6 +97,24 @@ def test_numeric_array_attr_datatypes():
assert hfile.attrs['vlen_str_array'].dtype == np.dtype('S6')


def test_string_array_attr_datatypes(attr_datatypes_hdf5):
    """Arrayed string attributes come back as bytes or str as written."""
    with pyfive.File(attr_datatypes_hdf5) as hfile:
        file_attrs = hfile.attrs

        # attribute written from a NumPy bytes array
        bytes_array = file_attrs['vlen_str_array']
        assert bytes_array[0] == b'Hello'
        assert bytes_array[1] == b'World!'
        assert bytes_array.dtype == np.dtype('S6')

        # attribute written from a plain list of bytes objects
        str_array = file_attrs['vlen_str_array1']
        assert str_array[0] == 'Hello'
        assert str_array[1] == 'World!'
        assert str_array.dtype == np.dtype('O')
        assert str_array.dtype.metadata == {'h5py_encoding': 'utf-8'}


def test_vlen_sequence_attr_datatypes():

with pyfive.File(ATTR_DATATYPES_HDF5_FILE) as hfile:
Expand All @@ -107,6 +137,25 @@ def test_vlen_sequence_attr_datatypes():
assert_array_equal(vlen_attr[2], [4, 5])


def test_enum_attr_datatypes(attr_datatypes_hdf5):
    """An enum attribute keeps its value and compares equal to the h5py enum dtype."""
    import h5py

    expected_dtype = h5py.special_dtype(
        enum=(np.int32, {'one': 1, 'two': 2, 'three': 3})
    )
    with pyfive.File(attr_datatypes_hdf5) as hfile:
        enum_value = hfile.attrs['enum']
        assert enum_value == 2
        assert enum_value.dtype == expected_dtype


def test_empty_string_datatypes(attr_datatypes_hdf5):
    """An attribute with a NULL dataspace is read back as ``pyfive.Empty``."""
    expected_dtype = np.dtype('|S1')
    with pyfive.File(attr_datatypes_hdf5) as hfile:
        empty_attr = hfile.attrs['empty_string']
        assert empty_attr == pyfive.Empty(dtype=expected_dtype)
        assert empty_attr.dtype == expected_dtype


def test_attributes_2():

ascii = "ascii"
Expand Down
38 changes: 38 additions & 0 deletions tests/test_buffer_issue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import os

import pyfive
import s3fs


def _load_nc_file(ncvar):
    """
    Open the remote issue file and return one of its variables.

    Plain helper (not a pytest fixture): the file is opened anonymously from
    the S3 object store through ``s3fs`` and wrapped in ``pyfive.File``.

    ``ncvar`` is the name of the variable to look up in the file; the object
    obtained from ``pyfive.File(...)[ncvar]`` is returned.
    """
    issue_file = "da193a_25_6hr_t_pt_cordex__198807-198807.nc"
    storage_options = {
        'anon': True,
        'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"},  # final proxy
    }
    # Object-store keys always use "/" as separator; os.path.join would
    # insert a backslash on Windows and produce a non-existent key.
    test_file_uri = "/".join(("esmvaltool-zarr", issue_file))
    fs = s3fs.S3FileSystem(**storage_options)
    # NOTE(review): the handle is not closed here; presumably the returned
    # object still reads from it lazily -- confirm before adding a close().
    s3file = fs.open(test_file_uri, 'rb')
    nc = pyfive.File(s3file)
    return nc[ncvar]


def test_buffer_issue():
    """
    Loading a variable whose attribute contains no data must not fail.

    This happens when DATASPACE is NULL and DATA is empty.
    """
    variable = 'm01s30i111'
    print(
        "File with issue da193a_25_6hr_t_pt_cordex__198807-198807.nc",
        "Variable m01s30i111",
        sep="\n",
    )
    # The test passes as long as opening the variable does not raise.
    _load_nc_file(variable)
Loading