Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 38 additions & 43 deletions pyfive/dataobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from collections import OrderedDict
import struct
import warnings
from io import UnsupportedOperation

import numpy as np

Expand All @@ -19,10 +18,11 @@
from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders
from pyfive.btree import BTreeV2AttrCreationOrder, BTreeV2AttrNames
from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER, LZF_FILTER
from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID
from pyfive.h5d import DatasetID
from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID, _decode_array, dtype_replace_refs_with_object
from pyfive.p5t import P5Type, P5StringType, P5CompoundType, P5VlenStringType, P5ReferenceType, P5SequenceType, P5EnumType
from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub
from pyfive.h5py import Empty
from pyfive.h5d import DatasetID

# these constants happen to have the same value...
UNLIMITED_SIZE = UNDEFINED_ADDRESS
Expand Down Expand Up @@ -218,30 +218,28 @@ def _parse_attribute_msg(self, buffer, offset):

# Read the datatype information
try:
dtype = DatatypeMessage(buffer, offset).dtype
ptype = DatatypeMessage(buffer, offset).ptype
except NotImplementedError:
if name == 'REFERENCE_LIST':
pass #suppress this one, no one actually cares about these as far as I know
else:
warnings.warn(
f"Attribute {name} type not implemented, set to None."
)
warnings.warn(
f"Attribute {name} type not implemented, set to None."
)
return name, None

offset += _padded_size(attr_dict['datatype_size'], padding_multiple)

# Read the dataspace information
shape, maxshape = determine_data_shape(buffer, offset)

# detect Empty/NULL dataspace
if shape is None:
value = Empty(dtype=dtype)
value = Empty(dtype=ptype.dtype)
else:
items = int(np.prod(shape))

offset += _padded_size(attr_dict['dataspace_size'], padding_multiple)

# Read the value(s)
value = self._attr_value(dtype, buffer, items, offset)
value = self._attr_value(ptype, buffer, items, offset)

if shape == ():
value = value[0]
Expand All @@ -255,42 +253,44 @@ def _parse_attribute_msg(self, buffer, offset):

return name, value

def _attr_value(self, dtype, buf, count, offset):
def _attr_value(self, ptype, buf, count, offset):
""" Retrieve an HDF5 attribute value from a buffer. """

# first handle ENUMERATION, we just extract the dtype
if isinstance(dtype, tuple):
if dtype[0] == "ENUMERATION":
dtype = np.dtype(dtype[1], metadata={'enum': dtype[2]})
elif dtype[0] == "COMPOUND":
dtype = np.dtype(dtype[1])

if isinstance(dtype, tuple):
dtype_class = dtype[0]
if dtype_class == 'VLEN_STRING':
fdtype = np.dtype('O', metadata={'h5py_encoding': 'utf-8'})
else:
fdtype= np.dtype('O')
# numpy storage dtype
dtype = ptype.dtype

value = np.empty(count, dtype=fdtype)
if isinstance(ptype, (P5SequenceType, P5ReferenceType, P5VlenStringType)):
# todo: check, if this can be done in the P5Type
# small hack to get Reference output ptype right
if isinstance(ptype, P5ReferenceType):
dtype = dtype_replace_refs_with_object(dtype)

value = np.empty(count, dtype=dtype)
for i in range(count):
if dtype_class == 'VLEN_STRING':
if isinstance(ptype, P5StringType):
_, vlen_data = self._vlen_size_and_data(buf, offset)
value[i] = vlen_data.decode('utf-8')
offset += 16
elif dtype_class == 'REFERENCE':
elif isinstance(ptype, P5ReferenceType):
address, = struct.unpack_from('<Q', buf, offset=offset)
value[i] = Reference(address)
offset += 8
elif dtype_class == "VLEN_SEQUENCE":
base_dtype = dtype[1]
elif isinstance(ptype, P5SequenceType):
vlen, vlen_data = self._vlen_size_and_data(buf, offset)
value[i] = self._attr_value(base_dtype, vlen_data, vlen, 0)
value[i] = self._attr_value(ptype.base_dtype, vlen_data, vlen, 0)
offset += 16
else:
raise NotImplementedError
else:
value = np.frombuffer(buf, dtype=dtype, count=count, offset=offset)
if not ptype.is_atomic:
# todo: check for Enum etc types
value = value.view(dtype)
if isinstance(ptype, P5CompoundType):
new_dtype = dtype_replace_refs_with_object(ptype.dtype)
new_array = np.empty(value.shape, dtype=new_dtype)
new_array[:] = value
value = _decode_array(value, new_array)
return value

def _vlen_size_and_data(self, buf, offset):
Expand Down Expand Up @@ -358,27 +358,22 @@ def fillvalue(self):
size = 0

if size:
if isinstance(self.dtype, tuple):
if self.dtype[0] == 'VLEN_STRING':
fillvalue = self._attr_value(self.dtype, self.msg_data, 1, offset)[0]
elif self.dtype[0] in ['ENUMERATION']:
fillvalue = 0
else:
raise ValueError(f'Unrecognised dtype [{self.dtype}]')
if isinstance(self.ptype, P5VlenStringType):
fillvalue = self._attr_value(self.ptype, self.msg_data, 1, offset)[0]
else:
payload = self.msg_data[offset:offset+size]
fillvalue = np.frombuffer(payload, self.dtype, count=1)[0]
fillvalue = np.frombuffer(payload, self.ptype.dtype, count=1)[0]
else:
fillvalue = 0
return fillvalue

@property
def dtype(self):
def ptype(self):
""" Datatype of the dataset. """
msg = self.find_msg_type(DATATYPE_MSG_TYPE)[0]
msg_offset = msg['offset_to_message']
dtype = DatatypeMessage(self.msg_data, msg_offset).dtype
return dtype
ptype = DatatypeMessage(self.msg_data, msg_offset).ptype
return ptype

@property
def chunks(self):
Expand Down
103 changes: 41 additions & 62 deletions pyfive/datatype_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
from .core import _padded_size, _structure_size, _unpack_struct_from
from .core import InvalidHDF5File

from .p5t import P5Type, P5CompoundType, P5CompoundField, P5FixedStringType, P5VlenStringType, P5SequenceType, P5EnumType, P5OpaqueType, P5FloatType, P5ReferenceType, P5StringType, P5IntegerType

import numpy as np
import warnings


class DatatypeMessage(object):
""" Representation of a HDF5 Datatype Message. """
Expand All @@ -15,7 +17,7 @@ class DatatypeMessage(object):
def __init__(self, buf, offset):
self.buf = buf
self.offset = offset
self.dtype = self.determine_dtype()
self.ptype = self.determine_dtype()

def determine_dtype(self):
""" Return the dtype (often numpy-like) for the datatype message. """
Expand All @@ -39,17 +41,13 @@ def determine_dtype(self):
elif datatype_class == DATATYPE_COMPOUND:
return self._determine_dtype_compound(datatype_msg)
elif datatype_class == DATATYPE_REFERENCE:
return ('REFERENCE', datatype_msg['size'])
return P5ReferenceType(datatype_msg['size'], f"V{datatype_msg['size']}")
elif datatype_class == DATATYPE_ENUMERATED:
return self._determine_dtype_enum(datatype_msg)
elif datatype_class == DATATYPE_ARRAY:
raise NotImplementedError("Array datatype class not supported.")
elif datatype_class == DATATYPE_VARIABLE_LENGTH:
vlen_type = self._determine_dtype_vlen(datatype_msg)
if vlen_type[0] == 'VLEN_SEQUENCE':
base_type = self.determine_dtype()
vlen_type = ('VLEN_SEQUENCE', base_type)
return vlen_type
return self._determine_dtype_vlen(datatype_msg)
raise InvalidHDF5File('Invalid datatype class %i' % (datatype_class))

def _determine_dtype_fixed_point(self, datatype_msg):
Expand All @@ -75,7 +73,7 @@ def _determine_dtype_fixed_point(self, datatype_msg):
# not read, assumed to be IEEE standard format
self.offset += 4

return byte_order_char + dtype_char + str(length_in_bytes)
return P5IntegerType(byte_order_char + dtype_char + str(length_in_bytes))

def _determine_dtype_floating_point(self, datatype_msg):
""" Return the NumPy dtype for a floating point class. """
Expand All @@ -96,70 +94,53 @@ def _determine_dtype_floating_point(self, datatype_msg):
# not read, assumed to be IEEE standard format
self.offset += 12

return byte_order_char + dtype_char + str(length_in_bytes)

return P5FloatType(byte_order_char + dtype_char + str(length_in_bytes))

@staticmethod
def _determine_dtype_string(datatype_msg):
""" Return the NumPy dtype for a string class. """
return 'S' + str(datatype_msg['size'])
return P5FixedStringType(datatype_msg['size'])

def _determine_dtype_compound(self, datatype_msg):
""" Return the dtype of a compound class if supported. """
bit_field_0 = datatype_msg['class_bit_field_0']
bit_field_1 = datatype_msg['class_bit_field_1']
n_comp = bit_field_0 + (bit_field_1 << 4)
version = datatype_msg['class_and_version'] >> 4

# read in the members of the compound datatype
# read in the fields of the compound datatype
# at the moment we need to skip two bytes which I do
#
members = []
fields = []
for _ in range(n_comp):
null_location = self.buf.index(b'\x00', self.offset)
name_size = _padded_size(null_location - self.offset + 1, 8)
# we read with padding and without
name_size = null_location - self.offset + 1 if version == 3 else _padded_size(
null_location - self.offset + 1, 8)
name = self.buf[self.offset:self.offset+name_size]
name = name.strip(b'\x00').decode('utf-8')
self.offset += name_size

prop_desc = _unpack_struct_from(
COMPOUND_PROP_DESC_V1, self.buf, self.offset)
self.offset += COMPOUND_PROP_DESC_V1_SIZE
# handle different message type versions
if version == 1:
prop_desc = _unpack_struct_from(
COMPOUND_PROP_DESC_V1, self.buf, self.offset)
self.offset += COMPOUND_PROP_DESC_V1_SIZE
elif version == 3:
# according HDF5 manual
# https://support.hdfgroup.org/documentation/hdf5/latest/_f_m_t4.html#subsec_fmt4_intro_doc
offset_len = max(1, (datatype_msg["size"] - 1).bit_length() + 7 >> 3)
fmt = {1: 'B', 2: 'H', 4: 'I', 8: 'Q'}[offset_len]
offset_struct = OrderedDict((('offset', fmt),))
prop_desc = _unpack_struct_from(offset_struct, self.buf, self.offset)
self.offset += offset_len

comp_dtype = self.determine_dtype()
members.append((name, prop_desc, comp_dtype))

# determine if the compound dtype is complex64/complex128
if len(members) == 2:
name1, prop1, dtype1 = members[0]
name2, prop2, dtype2 = members[1]
names_valid = (name1 == 'r' and name2 == 'i')
complex_dtype_map = {
'>f4': '>c8',
'<f4': '<c8',
'>f8': '>c16',
'<f8': '<c16',
}
dtypes_valid = (dtype1 == dtype2) and dtype1 in complex_dtype_map
half = datatype_msg['size'] // 2
offsets_valid = (prop1['offset'] == 0 and prop2['offset'] == half)
props_valid = (
prop1['dimensionality'] == 0 and
prop1['permutation'] == 0 and
prop1['dim_size_1'] == 0 and
prop1['dim_size_2'] == 0 and
prop1['dim_size_3'] == 0 and
prop1['dim_size_4'] == 0 and
prop2['dimensionality'] == 0 and
prop2['permutation'] == 0 and
prop2['dim_size_1'] == 0 and
prop2['dim_size_2'] == 0 and
prop2['dim_size_3'] == 0 and
prop2['dim_size_4'] == 0
)
if names_valid and dtypes_valid and offsets_valid and props_valid:
return "COMPOUND", complex_dtype_map[dtype1]

raise NotImplementedError("Compound dtype not supported.")
if not isinstance(comp_dtype, P5Type):
raise TypeError(f"Field {name} is not an H5Type instance")
fields.append(P5CompoundField(name=name, offset=prop_desc["offset"], ptype=comp_dtype))

return P5CompoundType(fields=fields, size=datatype_msg["size"])


def _determine_dtype_opaque(self, datatype_msg):
""" Return the dtype information for an opaque class. """
Expand All @@ -177,17 +158,15 @@ def _determine_dtype_opaque(self, datatype_msg):
if tag == '':
tag = None

return ('OPAQUE', tag, size)
return P5OpaqueType(tag, size)

@staticmethod
def _determine_dtype_vlen(datatype_msg):
def _determine_dtype_vlen(self, datatype_msg):
""" Return the dtype information for a variable length class. """
vlen_type = datatype_msg['class_bit_field_0'] & 0x01
if vlen_type != 1:
return ('VLEN_SEQUENCE', 0, 0)
padding_type = datatype_msg['class_bit_field_0'] >> 4 # bits 4-7
return P5SequenceType(base_dtype=self.determine_dtype())
character_set = datatype_msg['class_bit_field_1'] & 0x01
return ('VLEN_STRING', padding_type, character_set)
return P5VlenStringType(character_set=character_set)

def _determine_dtype_enum(self,datatype_msg):
""" Return the basetype and the underlying enum dictionary """
Expand All @@ -197,8 +176,8 @@ def _determine_dtype_enum(self,datatype_msg):
num_members = enum_msg['number_of_members']
value_size = enum_msg['size']
enum_keys = []
dtype = DatatypeMessage(self.buf,self.offset).dtype
self.offset+=12
dtype = DatatypeMessage(self.buf, self.offset).ptype.dtype
self.offset+=12
# An extra 4 bytes are read as part of establishing the data type
# FIXME:ENUM Need to be sure that some other base type in the future
# wouldn't silently need more bytes and screw this all up. Should
Expand All @@ -216,7 +195,7 @@ def _determine_dtype_enum(self,datatype_msg):
nbytes = value_size*num_members
values = np.frombuffer(self.buf[self.offset:], dtype=dtype, count=num_members)
enum_dict = {k:v for k,v in zip(enum_keys,values)}
return 'ENUMERATION', dtype, enum_dict
return P5EnumType(dtype, enum_dict)


# IV.A.2.d The Datatype Message
Expand Down
Loading
Loading