NCAS-CMS · valeriupredoi · Oct 27, 2025 · Oct 7, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py
@@ -5,7 +5,6 @@
 from collections import OrderedDict
 import struct
 import warnings
-from io import UnsupportedOperation
 
 import numpy as np
 
@@ -19,10 +18,11 @@
 from pyfive.btree import BTreeV2GroupNames, BTreeV2GroupOrders
 from pyfive.btree import BTreeV2AttrCreationOrder, BTreeV2AttrNames
 from pyfive.btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER, LZF_FILTER
-from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID
-from pyfive.h5d import DatasetID
+from pyfive.misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap, GLOBAL_HEAP_ID, _decode_array, dtype_replace_refs_with_object
+from pyfive.p5t import P5Type, P5StringType, P5CompoundType, P5VlenStringType, P5ReferenceType, P5SequenceType, P5EnumType
 from pyfive.indexing import OrthogonalIndexer, ZarrArrayStub
 from pyfive.h5py import Empty
+from pyfive.h5d import DatasetID
 
 # these constants happen to have the same value...
 UNLIMITED_SIZE = UNDEFINED_ADDRESS
@@ -218,30 +218,28 @@ def _parse_attribute_msg(self, buffer, offset):
 
         # Read the datatype information
         try:
-            dtype = DatatypeMessage(buffer, offset).dtype
+            ptype = DatatypeMessage(buffer, offset).ptype
         except NotImplementedError:
-            if name == 'REFERENCE_LIST':
-                pass #suppress this one, no one actually cares about these as far as I know
-            else:
-                warnings.warn(
-                    f"Attribute {name} type not implemented, set to None."
-                )
+            warnings.warn(
+                f"Attribute {name} type not implemented, set to None."
+            )
             return name, None
+
         offset += _padded_size(attr_dict['datatype_size'], padding_multiple)
 
         # Read the dataspace information
         shape, maxshape = determine_data_shape(buffer, offset)
 
         # detect Empty/NULL dataspace
         if shape is None:
-            value = Empty(dtype=dtype)
+            value = Empty(dtype=ptype.dtype)
         else:
             items = int(np.prod(shape))
 
             offset += _padded_size(attr_dict['dataspace_size'], padding_multiple)
 
             # Read the value(s)
-            value = self._attr_value(dtype, buffer, items, offset)
+            value = self._attr_value(ptype, buffer, items, offset)
 
         if shape == ():
             value = value[0]
@@ -255,42 +253,44 @@ def _parse_attribute_msg(self, buffer, offset):
 
         return name, value
 
-    def _attr_value(self, dtype, buf, count, offset):
+    def _attr_value(self, ptype, buf, count, offset):
         """ Retrieve an HDF5 attribute value from a buffer. """
 
-        # first handle ENUMERATION, we just extract the dtype
-        if isinstance(dtype, tuple):
-            if dtype[0] == "ENUMERATION":
-                dtype = np.dtype(dtype[1], metadata={'enum': dtype[2]})
-            elif dtype[0] == "COMPOUND":
-                dtype = np.dtype(dtype[1])
-
-        if isinstance(dtype, tuple):
-            dtype_class = dtype[0]
-            if dtype_class == 'VLEN_STRING':
-                fdtype = np.dtype('O', metadata={'h5py_encoding': 'utf-8'})
-            else:
-                fdtype=  np.dtype('O')
+        # numpy storage dtype
+        dtype = ptype.dtype
 
-            value = np.empty(count, dtype=fdtype)
+        if isinstance(ptype, (P5SequenceType, P5ReferenceType, P5VlenStringType)):
+            # TODO: check whether this can be done in the P5Type
+            # small hack to get the Reference output dtype right
+            if isinstance(ptype, P5ReferenceType):
+                dtype = dtype_replace_refs_with_object(dtype)
+
+            value = np.empty(count, dtype=dtype)
             for i in range(count):
-                if dtype_class == 'VLEN_STRING':
+                if isinstance(ptype, P5StringType):
                     _, vlen_data = self._vlen_size_and_data(buf, offset)
                     value[i] = vlen_data.decode('utf-8')
                     offset += 16
-                elif dtype_class == 'REFERENCE':
+                elif isinstance(ptype, P5ReferenceType):
                     address, = struct.unpack_from('<Q', buf, offset=offset)
                     value[i] = Reference(address)
                     offset += 8
-                elif dtype_class == "VLEN_SEQUENCE":
-                    base_dtype = dtype[1]
+                elif isinstance(ptype, P5SequenceType):
                     vlen, vlen_data = self._vlen_size_and_data(buf, offset)
-                    value[i] = self._attr_value(base_dtype, vlen_data, vlen, 0)
+                    value[i] = self._attr_value(ptype.base_dtype, vlen_data, vlen, 0)
                     offset += 16
                 else:
                     raise NotImplementedError
         else:
             value = np.frombuffer(buf, dtype=dtype, count=count, offset=offset)
+            if not ptype.is_atomic:
+                # todo: check for Enum etc types
+                value = value.view(dtype)
+                if isinstance(ptype, P5CompoundType):
+                    new_dtype = dtype_replace_refs_with_object(ptype.dtype)
+                    new_array = np.empty(value.shape, dtype=new_dtype)
+                    new_array[:] = value
+                    value = _decode_array(value, new_array)
         return value
 
     def _vlen_size_and_data(self, buf, offset):
@@ -358,27 +358,22 @@ def fillvalue(self):
             size = 0
 
         if size:
-            if isinstance(self.dtype, tuple):
-                if self.dtype[0] == 'VLEN_STRING':
-                    fillvalue = self._attr_value(self.dtype, self.msg_data, 1, offset)[0]
-                elif self.dtype[0] in ['ENUMERATION']:
-                    fillvalue = 0
-                else:
-                    raise ValueError(f'Unrecognised dtype [{self.dtype}]')
+            if isinstance(self.ptype, P5VlenStringType):
+                fillvalue = self._attr_value(self.ptype, self.msg_data, 1, offset)[0]
             else:
                 payload = self.msg_data[offset:offset+size]
-                fillvalue = np.frombuffer(payload, self.dtype, count=1)[0]
+                fillvalue = np.frombuffer(payload, self.ptype.dtype, count=1)[0]
         else:
             fillvalue = 0
         return fillvalue
 
     @property
-    def dtype(self):
+    def ptype(self):
         """ Datatype of the dataset. """
         msg = self.find_msg_type(DATATYPE_MSG_TYPE)[0]
         msg_offset = msg['offset_to_message']
-        dtype = DatatypeMessage(self.msg_data, msg_offset).dtype
-        return dtype
+        ptype = DatatypeMessage(self.msg_data, msg_offset).ptype
+        return ptype
 
     @property
     def chunks(self):

diff --git a/pyfive/datatype_msg.py b/pyfive/datatype_msg.py
@@ -5,8 +5,10 @@
 from .core import _padded_size, _structure_size, _unpack_struct_from
 from .core import InvalidHDF5File
 
+from .p5t import P5Type, P5CompoundType, P5CompoundField, P5FixedStringType, P5VlenStringType, P5SequenceType, P5EnumType, P5OpaqueType, P5FloatType, P5ReferenceType, P5StringType, P5IntegerType
+
 import numpy as np
-import warnings
+
 
 class DatatypeMessage(object):
     """ Representation of a HDF5 Datatype Message. """
@@ -15,7 +17,7 @@ class DatatypeMessage(object):
     def __init__(self, buf, offset):
         self.buf = buf
         self.offset = offset
-        self.dtype = self.determine_dtype()
+        self.ptype = self.determine_dtype()
 
     def determine_dtype(self):
         """ Return the dtype (often numpy-like) for the datatype message.  """
@@ -39,17 +41,13 @@ def determine_dtype(self):
         elif datatype_class == DATATYPE_COMPOUND:
             return self._determine_dtype_compound(datatype_msg)
         elif datatype_class == DATATYPE_REFERENCE:
-            return ('REFERENCE', datatype_msg['size'])
+            return P5ReferenceType(datatype_msg['size'], f"V{datatype_msg['size']}")
         elif datatype_class == DATATYPE_ENUMERATED:
             return self._determine_dtype_enum(datatype_msg)
         elif datatype_class == DATATYPE_ARRAY:
             raise NotImplementedError("Array datatype class not supported.")
         elif datatype_class == DATATYPE_VARIABLE_LENGTH:
-            vlen_type = self._determine_dtype_vlen(datatype_msg)
-            if vlen_type[0] == 'VLEN_SEQUENCE':
-                base_type = self.determine_dtype()
-                vlen_type = ('VLEN_SEQUENCE', base_type)
-            return vlen_type
+            return self._determine_dtype_vlen(datatype_msg)
         raise InvalidHDF5File('Invalid datatype class %i' % (datatype_class))
 
     def _determine_dtype_fixed_point(self, datatype_msg):
@@ -75,7 +73,7 @@ def _determine_dtype_fixed_point(self, datatype_msg):
         # not read, assumed to be IEEE standard format
         self.offset += 4
 
-        return byte_order_char + dtype_char + str(length_in_bytes)
+        return P5IntegerType(byte_order_char + dtype_char + str(length_in_bytes))
 
     def _determine_dtype_floating_point(self, datatype_msg):
         """ Return the NumPy dtype for a floating point class. """
@@ -96,70 +94,53 @@ def _determine_dtype_floating_point(self, datatype_msg):
         # not read, assumed to be IEEE standard format
         self.offset += 12
 
-        return byte_order_char + dtype_char + str(length_in_bytes)
-
+        return P5FloatType(byte_order_char + dtype_char + str(length_in_bytes))
 
     @staticmethod
     def _determine_dtype_string(datatype_msg):
         """ Return the NumPy dtype for a string class. """
-        return 'S' + str(datatype_msg['size'])
+        return P5FixedStringType(datatype_msg['size'])
 
     def _determine_dtype_compound(self, datatype_msg):
         """ Return the dtype of a compound class if supported. """
         bit_field_0 = datatype_msg['class_bit_field_0']
         bit_field_1 = datatype_msg['class_bit_field_1']
         n_comp = bit_field_0 + (bit_field_1 << 4)
+        version = datatype_msg['class_and_version'] >> 4
 
-        # read in the members of the compound datatype
+        # read in the fields of the compound datatype
         # at the moment we need to skip two bytes which I do
-        # 
-        members = []
+        fields = []
         for _ in range(n_comp):
             null_location = self.buf.index(b'\x00', self.offset)
-            name_size = _padded_size(null_location - self.offset + 1, 8)
+            # version 3 stores the null-terminated name unpadded; other versions pad it to a multiple of 8 bytes
+            name_size = null_location - self.offset + 1 if version == 3 else _padded_size(
+                null_location - self.offset + 1, 8)
             name = self.buf[self.offset:self.offset+name_size]
             name = name.strip(b'\x00').decode('utf-8')
             self.offset += name_size
 
-            prop_desc = _unpack_struct_from(
-                COMPOUND_PROP_DESC_V1, self.buf, self.offset)
-            self.offset += COMPOUND_PROP_DESC_V1_SIZE
+            # handle different message type versions
+            if version == 1:
+                prop_desc = _unpack_struct_from(
+                    COMPOUND_PROP_DESC_V1, self.buf, self.offset)
+                self.offset += COMPOUND_PROP_DESC_V1_SIZE
+            elif version == 3:
+                # according to the HDF5 manual, the width of the offset field depends on the datatype size
+                # https://support.hdfgroup.org/documentation/hdf5/latest/_f_m_t4.html#subsec_fmt4_intro_doc
+                offset_len = max(1, (datatype_msg["size"] - 1).bit_length() + 7 >> 3)
+                fmt = {1: 'B', 2: 'H', 4: 'I', 8: 'Q'}[offset_len]
+                offset_struct = OrderedDict((('offset', fmt),))
+                prop_desc = _unpack_struct_from(offset_struct, self.buf, self.offset)
+                self.offset += offset_len
 
             comp_dtype = self.determine_dtype()
-            members.append((name, prop_desc, comp_dtype))
-
-        # determine if the compound dtype is complex64/complex128
-        if len(members) == 2:
-            name1, prop1, dtype1 = members[0]
-            name2, prop2, dtype2 = members[1]
-            names_valid = (name1 == 'r' and name2 == 'i')
-            complex_dtype_map = {
-                '>f4': '>c8',
-                '<f4': '<c8',
-                '>f8': '>c16',
-                '<f8': '<c16',
-            }
-            dtypes_valid = (dtype1 == dtype2) and dtype1 in complex_dtype_map
-            half = datatype_msg['size'] // 2
-            offsets_valid = (prop1['offset'] == 0 and prop2['offset'] == half)
-            props_valid = (
-                prop1['dimensionality'] == 0 and
-                prop1['permutation'] == 0 and
-                prop1['dim_size_1'] == 0 and
-                prop1['dim_size_2'] == 0 and
-                prop1['dim_size_3'] == 0 and
-                prop1['dim_size_4'] == 0 and
-                prop2['dimensionality'] == 0 and
-                prop2['permutation'] == 0 and
-                prop2['dim_size_1'] == 0 and
-                prop2['dim_size_2'] == 0 and
-                prop2['dim_size_3'] == 0 and
-                prop2['dim_size_4'] == 0
-            )
-            if names_valid and dtypes_valid and offsets_valid and props_valid:
-                return "COMPOUND", complex_dtype_map[dtype1]
-
-        raise NotImplementedError("Compound dtype not supported.")
+            if not isinstance(comp_dtype, P5Type):
+                raise TypeError(f"Field {name} is not an H5Type instance")
+            fields.append(P5CompoundField(name=name, offset=prop_desc["offset"], ptype=comp_dtype))
+
+        return P5CompoundType(fields=fields, size=datatype_msg["size"])
+
 
     def _determine_dtype_opaque(self, datatype_msg):
         """ Return the dtype information for an opaque class. """
@@ -177,17 +158,15 @@ def _determine_dtype_opaque(self, datatype_msg):
         if tag == '':
             tag = None  
 
-        return ('OPAQUE', tag, size)
+        return P5OpaqueType(tag, size)
 
-    @staticmethod
-    def _determine_dtype_vlen(datatype_msg):
+    def _determine_dtype_vlen(self, datatype_msg):
         """ Return the dtype information for a variable length class. """
         vlen_type = datatype_msg['class_bit_field_0'] & 0x01
         if vlen_type != 1:
-            return ('VLEN_SEQUENCE', 0, 0)
-        padding_type = datatype_msg['class_bit_field_0'] >> 4  # bits 4-7
+            return P5SequenceType(base_dtype=self.determine_dtype())
         character_set = datatype_msg['class_bit_field_1'] & 0x01
-        return ('VLEN_STRING', padding_type, character_set)
+        return P5VlenStringType(character_set=character_set)
 
     def _determine_dtype_enum(self,datatype_msg):
         """ Return the basetype and the underlying enum dictionary """
@@ -197,8 +176,8 @@ def _determine_dtype_enum(self,datatype_msg):
         num_members = enum_msg['number_of_members']
         value_size = enum_msg['size']
         enum_keys = []
-        dtype = DatatypeMessage(self.buf,self.offset).dtype
-        self.offset+=12   
+        dtype = DatatypeMessage(self.buf, self.offset).ptype.dtype
+        self.offset+=12
         # An extra 4 bytes are read as part of establishing the data type
         # FIXME:ENUM Need to be sure that some other base type in the future
         # wouldn't silently need more bytes and screw this all up. Should 
@@ -216,7 +195,7 @@ def _determine_dtype_enum(self,datatype_msg):
         nbytes = value_size*num_members
         values = np.frombuffer(self.buf[self.offset:], dtype=dtype, count=num_members)
         enum_dict = {k:v for k,v in zip(enum_keys,values)}
-        return 'ENUMERATION', dtype, enum_dict
+        return P5EnumType(dtype, enum_dict)
 
 
 # IV.A.2.d The Datatype Message