From ef8aa9c6e30590a28fef5414ac817e2656764675 Mon Sep 17 00:00:00 2001
From: ayushnag <35325113+ayushnag@users.noreply.github.com>
Date: Sat, 3 Aug 2024 11:41:09 +0530
Subject: [PATCH] modularize dmrpp.py

---
 virtualizarr/readers/dmrpp.py                 | 392 +++++++++++++-----
 virtualizarr/tests/test_readers/test_dmrpp.py |  50 +--
 virtualizarr/xarray.py                        |  19 +-
 3 files changed, 316 insertions(+), 145 deletions(-)

diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py
index 1ff66c8b..0d6453f5 100644
--- a/virtualizarr/readers/dmrpp.py
+++ b/virtualizarr/readers/dmrpp.py
@@ -16,6 +16,7 @@ class DMRParser:
     """
     Parses a DMR++ file and creates a virtual xr.Dataset.
     Handles groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes.
+    Highly modular to allow support for older dmrpp schema versions
     """

     # DAP and DMRPP XML namespaces
@@ -35,13 +36,13 @@ class DMRParser:
         "UInt32": "uint32",
         "Int64": "int64",
         "UInt64": "uint64",
-        "Url": "str",
+        "Url": "object",
         "Float32": "float32",
         "Float64": "float64",
-        "String": "str",
+        "String": "object",
     }
-    # Default zlib compression value (-1 means default, currently level 6 is default)
-    _default_zlib_value = -1
+    # Default zlib compression value
+    _default_zlib_value = 6
     # Encoding keys that should be cast to float
     _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
@@ -82,7 +83,7 @@ def parse_dataset(self, group=None) -> xr.Dataset:
             group = os.path.normpath(group).removeprefix(
                 "/"
             )  # ensure group is in form "a/b/c"
-        if self.data_filepath.endswith(".h5"):
+        if self._is_hdf5(self.root):
             return self._parse_hdf5_dataset(self.root, group)
         if self.data_filepath.endswith(".nc"):
             return self._parse_netcdf4_dataset(self.root, group)
@@ -103,6 +104,7 @@ def _parse_netcdf4_dataset(
         group : str
             The group to parse. If None, and no groups are present, the dataset is parsed.
             If None and groups are present, the first group is parsed.
+
         Returns
         -------
         xr.Dataset
@@ -132,6 +134,11 @@ def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]:
         Split the input element into several ET.Elements by netcdf4 group
         E.g. {"left": <ET.Element>, "right": <ET.Element>}

+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
         Returns
         -------
         dict[str, ET.Element]
@@ -144,7 +151,17 @@ def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]:
             all_groups[os.path.normpath(group_tag.attrib["name"])] = group_tag
         return all_groups

-    def _parse_hdf5_dataset(self, root: ET.Element, group: str) -> xr.Dataset:
+    def _is_hdf5(self, root: ET.Element) -> bool:
+        """Check if the DMR file is HDF5 based."""
+        if root.find(".//dap:Attribute[@name='fullnamepath']", self._ns) is not None:
+            return True
+        if root.find("./dap:Attribute[@name='HDF5_GLOBAL']", self._ns) is not None:
+            return True
+        return False
+
+    def _parse_hdf5_dataset(
+        self, root: ET.Element, group: Optional[str] = None
+    ) -> xr.Dataset:
         """
         Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group.
         Set root to the given group.
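The new _is_hdf5 probe replaces the earlier file-extension check, so the HDF5 parsing path is now chosen from the XML itself. Below is a minimal, illustrative sketch of that lookup on a trimmed, hypothetical DMR++ snippet; the namespace URI and attribute layout are assumptions about typical DAP4/Hyrax output, not taken from this patch.

    import xml.etree.ElementTree as ET

    # Hypothetical, trimmed DMR++ document for illustration only.
    dmr_xml = """\
    <Dataset xmlns="http://xml.opendap.org/ns/DAP/4.0#" name="example.h5">
      <Float32 name="temperature">
        <Dim name="/time"/>
        <Attribute name="fullnamepath" type="String">
          <Value>/gt1r/heights/temperature</Value>
        </Attribute>
      </Float32>
    </Dataset>
    """

    ns = {"dap": "http://xml.opendap.org/ns/DAP/4.0#"}  # assumed DAP4 namespace
    root = ET.fromstring(dmr_xml)

    # Same style of lookup as DMRParser._is_hdf5: HDF5-produced dmrpp files carry
    # per-variable 'fullnamepath' attributes or an HDF5_GLOBAL container attribute.
    is_hdf5 = (
        root.find(".//dap:Attribute[@name='fullnamepath']", ns) is not None
        or root.find("./dap:Attribute[@name='HDF5_GLOBAL']", ns) is not None
    )
    print(is_hdf5)  # True for this snippet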
@@ -163,25 +180,43 @@ def _parse_hdf5_dataset(self, root: ET.Element, group: str) -> xr.Dataset:
         xr.Dataset
         """
         all_groups = self._split_hdf5(root=root)
+        if len(all_groups) == 0:
+            raise ValueError("No groups found in HDF based DMR file")
+        if group is None:
+            # pick a random group if no group is specified
+            group = next(iter(all_groups))
+        attrs = {}
+        for attr_tag in root.iterfind("dap:Attribute", self._ns):
+            if attr_tag.attrib["type"] != "Container":
+                attrs.update(self._parse_attribute(attr_tag))
         if group in all_groups:
             # replace aliased variable names with original names: gt1r_heights -> heights
-            orignames = {}
-            vars_tags: list[ET.Element] = []
-            for dap_dtype in self._dap_np_dtype:
-                vars_tags += all_groups[group].findall(f"dap:{dap_dtype}", self._ns)
-            for var_tag in vars_tags:
-                origname_tag = var_tag.find(
-                    "./dap:Attribute[@name='origname']/dap:Value", self._ns
-                )
-                if origname_tag is not None and origname_tag.text is not None:
-                    orignames[var_tag.attrib["name"]] = origname_tag.text
-            return self._parse_dataset(all_groups[group]).rename(orignames)
+            orignames = self.find_original_names(all_groups[group])
+            vds = self._parse_dataset(all_groups[group])
+            # Only one group so found attrs are global attrs
+            if len(all_groups) == 1:
+                vds.attrs.update(attrs)
+            return vds.rename(orignames)
         raise ValueError(f"Group {group} not found in HDF5 DMR file")

+    def find_original_names(self, root: ET.Element) -> dict[str, str]:
+        orignames: dict[str, str] = {}
+        vars_tags: list[ET.Element] = []
+        for dap_dtype in self._dap_np_dtype:
+            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
+        for var_tag in vars_tags:
+            origname_tag = var_tag.find(
+                "./dap:Attribute[@name='origname']/dap:Value", self._ns
+            )
+            if origname_tag is not None and origname_tag.text is not None:
+                orignames[var_tag.attrib["name"]] = origname_tag.text
+        return orignames
+
     def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]:
         """
         Split the input element into several ET.Elements by HDF5 group
-        E.g. {"gtr1/heights": <ET.Element>, "gtr1/temperatures": <ET.Element>}
+        E.g. {"gtr1/heights": <ET.Element>, "gtr1/temperatures": <ET.Element>}. Builds up new elements
+        each with dimensions, variables, and attributes.
Parameters ---------- @@ -192,7 +227,8 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: ------- dict[str, ET.Element] """ - all_groups: dict[str, ET.Element] = defaultdict( + # Add all variable, dimension, and attribute tags to their respective groups + groups_roots: dict[str, ET.Element] = defaultdict( lambda: ET.Element(root.tag, root.attrib) ) group_dims: dict[str, set[str]] = defaultdict( @@ -209,80 +245,182 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: if fullname_tag is not None and fullname_tag.text is not None: # '/gt1r/heights/ph_id_pulse' -> 'gt1r/heights' group_name = os.path.dirname(fullname_tag.text).removeprefix("/") - all_groups[group_name].append(var_tag) - for dim_tag in var_tag.iterfind("dap:Dim", self._ns): - group_dims[group_name].add(dim_tag.attrib["name"].removeprefix("/")) + groups_roots[group_name].append(var_tag) + dim_tags = var_tag.findall("dap:Dim", self._ns) + dims = self._parse_multi_dims(dim_tags) + group_dims[group_name].update(dims.keys()) # Dimensions for dim_tag in root.iterfind("dap:Dimension", self._ns): - for group_name, dims in group_dims.items(): - if dim_tag.attrib["name"] in dims: - all_groups[group_name].append(dim_tag) + for g, d in group_dims.items(): + if dim_tag.attrib["name"] in d: + groups_roots[g].append(dim_tag) # Attributes - for attr_tag in root.iterfind("dap:Attribute", self._ns): - fullname_tag = attr_tag.find( - "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns - ) - if fullname_tag is not None and fullname_tag.text is not None: - group_name = fullname_tag.text - # Add all attributes to the new dataset; fullnamepath is generally excluded - if group_name in all_groups: - all_groups[group_name].extend( - [a for a in attr_tag if a.attrib["name"] != "fullnamepath"] - ) - return all_groups + container_attr_tag = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._ns) + if container_attr_tag is None: + attrs_tags = root.findall("dap:Attribute", self._ns) + for attr_tag in attrs_tags: + fullname_tag = attr_tag.find( + "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns + ) + if fullname_tag is not None and fullname_tag.text is not None: + group_name = os.path.dirname(fullname_tag.text).removeprefix("/") + # Add all attributes to the new dataset + groups_roots[group_name].extend(attr_tag) + else: + groups_roots[next(iter(groups_roots))].extend(container_attr_tag) + return groups_roots def _parse_dataset(self, root: ET.Element) -> xr.Dataset: """ Parse the dataset using the root element of the DMR file. + Parameters + ---------- + root : ET.Element + The root element of the DMR file. 
+
         Returns
         -------
         xr.Dataset
         """
         # Dimension names and sizes
-        dataset_dims: dict[str, int] = {}
-        for dim_tag in root.iterfind("dap:Dimension", self._ns):
-            dataset_dims[dim_tag.attrib["name"]] = int(dim_tag.attrib["size"])
+        dim_tags = root.findall("dap:Dimension", self._ns)
+        dataset_dims = self._parse_multi_dims(dim_tags)
         # Data variables and coordinates
-        vars_tags: list[ET.Element] = []
-        for dap_dtype in self._dap_np_dtype:
-            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
-        # Coordinate names (using Map tags and coordinates attribute)
-        coord_names: set[str] = set()
-        for var_tag in vars_tags:
-            coord_tag = var_tag.find(
-                "./dap:Attribute[@name='coordinates']/dap:Value", self._ns
-            )
-            if coord_tag is not None and coord_tag.text is not None:
-                coord_names.update(coord_tag.text.split(" "))
-            for map_tag in var_tag.iterfind("dap:Map", self._ns):
-                coord_names.add(map_tag.attrib["name"].removeprefix("/"))
+        coord_names = self._find_coord_names(root)
         # if no coord_names are found or coords don't include dims, dims are used as coords
         if len(coord_names) == 0 or len(coord_names) < len(dataset_dims):
             coord_names = set(dataset_dims.keys())
         # Seperate and parse coords + data variables
-        coords: dict[str, xr.Variable] = {}
+        coord_vars: dict[str, xr.Variable] = {}
         data_vars: dict[str, xr.Variable] = {}
-        for var_tag in vars_tags:
+        for var_tag in self._find_var_tags(root):
+            variable = self._parse_variable(var_tag, dataset_dims)
             if var_tag.attrib["name"] in coord_names:
-                coords[var_tag.attrib["name"]] = self._parse_variable(
-                    var_tag, dataset_dims
-                )
+                coord_vars[var_tag.attrib["name"]] = variable
             else:
-                data_vars[var_tag.attrib["name"]] = self._parse_variable(
-                    var_tag, dataset_dims
-                )
+                data_vars[var_tag.attrib["name"]] = variable
         # Attributes
         attrs: dict[str, str] = {}
         for attr_tag in self.root.iterfind("dap:Attribute", self._ns):
-            if attr_tag.attrib["type"] != "Container":  # container = nested attributes
-                attrs.update(self._parse_attribute(attr_tag))
+            attrs.update(self._parse_attribute(attr_tag))
         return xr.Dataset(
             data_vars=data_vars,
-            coords=xr.Coordinates(coords=coords, indexes={}),
+            coords=xr.Coordinates(coords=coord_vars, indexes={}),
             attrs=attrs,
         )

+    def _find_var_tags(self, root: ET.Element) -> list[ET.Element]:
+        """
+        Find all variable tags in the DMR file. Also known as array tags.
+        Tags are labeled with the DAP data type. E.g. <Float32>, <Int16>, <String>
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        list[ET.Element]
+        """
+        vars_tags: list[ET.Element] = []
+        for dap_dtype in self._dap_np_dtype:
+            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
+        return vars_tags
+
+    def _find_coord_names(self, root: ET.Element) -> set[str]:
+        """
+        Find the name of all coordinates in root. Checks inside all variables and global attributes.
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        set[str] : The set of unique coordinate names.
+        """
+        # Check for coordinate names within each variable attributes
+        coord_names: set[str] = set()
+        for var_tag in self._find_var_tags(root):
+            coord_tag = var_tag.find(
+                "./dap:Attribute[@name='coordinates']/dap:Value", self._ns
+            )
+            if coord_tag is not None and coord_tag.text is not None:
+                coord_names.update(coord_tag.text.split(" "))
+            for map_tag in var_tag.iterfind("dap:Map", self._ns):
+                coord_names.add(map_tag.attrib["name"].removeprefix("/"))
+        # Check for coordinate names in a global attribute
+        coord_tag = var_tag.find("./dap:Attribute[@name='coordinates']", self._ns)
+        if coord_tag is not None and coord_tag.text is not None:
+            coord_names.update(coord_tag.text.split(" "))
+        return coord_names
+
+    def _parse_dim(self, root: ET.Element) -> dict[str, int | None]:
+        """
+        Parse single <Dim> or <Dimension> tag
+
+        If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim": 300}
+        If the tag has no size attribute, it is an unlimited dimension. E.g. <Dim name="time"/> --> {"time": None}
+        If the tag has both name and size attributes, it is a regular dimension. E.g. <Dim name="lat" size="1447"/> --> {"lat": 1447}
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element Dim/Dimension tag
+
+        Returns
+        -------
+        dict
+            E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim": 300}, {"time": None, "lat": None, "lon": None}
+        """
+        if "name" not in root.attrib and "size" in root.attrib:
+            return {"phony_dim": int(root.attrib["size"])}
+        if "name" in root.attrib and "size" not in root.attrib:
+            return {os.path.basename(root.attrib["name"]): None}
+        if "name" in root.attrib and "size" in root.attrib:
+            return {os.path.basename(root.attrib["name"]): int(root.attrib["size"])}
+        raise ValueError("Not enough information to parse Dim/Dimension tag")
+
+    def _parse_multi_dims(
+        self, dim_tags: list[ET.Element], global_dims: dict[str, int] = {}
+    ) -> dict:
+        """
+        Parse multiple <Dim> or <Dimension> tags. Generally <Dim> tags are found within dmrpp variable tags.
+
+        Returns best possible matching of {dimension: shape} present in the list and global_dims.
+        E.g. tags=(Dim("lat", None), Dim("lon", None)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"lat": 100, "lon": 100}
+
+        E.g. tags=(Dim("time", None), Dim("", 200)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"time": 5, "phony_dim_1": 200}
+
+        This function is often used to fill in missing sizes from the global_dims. E.g. Variable tags may contain
+        only dimension names and not sizes. If the {name: size} matching is known from the global_dims, it is used
+        to fill in the missing sizes.
+
+        Parameters
+        ----------
+        dim_tags : list[ET.Element]
+            A list of ElementTree Elements representing dimensions in the DMR file.
+
+        global_dims : dict
+            A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895}
+
+        Returns
+        -------
+        dict
+            E.g. {"time": 1, "lat": 1447, "lon": 2895}
+        """
+        dims: dict[str, int | None] = {}
+        for dim_tag in dim_tags:
+            dim: dict[str, int | None] = self._parse_dim(dim_tag)
+            if "phony_dim" in dim:
+                dims["phony_dim_" + str(len(dims))] = dim["phony_dim"]
+            else:
+                dims.update(dim)
+        for name, size in list(dims.items()):
+            if name in global_dims and size is None:
+                dims[name] = global_dims[name]
+        return dims
+
     def _parse_variable(
         self, var_tag: ET.Element, dataset_dims: dict[str, int]
     ) -> xr.Variable:
@@ -296,52 +434,41 @@ def _parse_variable(
         dataset_dims : dict
             A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895}
-            Must contain at least all the dimensions used by the variable.
+ Must contain at least all the dimensions used by the variable. Necessary since the variable + metadata only contains the dimension names and not the sizes. Returns ------- xr.Variable """ # Dimension names - dim_names: list[str] = [] - for dim_tag in var_tag.iterfind("dap:Dim", self._ns): - dim_names.append(os.path.basename(dim_tag.attrib["name"])) + dim_tags = var_tag.findall("dap:Dim", self._ns) + dim_shapes = self._parse_multi_dims(dim_tags, dataset_dims) # convert DAP dtype to numpy dtype dtype = np.dtype( self._dap_np_dtype[var_tag.tag.removeprefix("{" + self._ns["dap"] + "}")] ) # Chunks and Filters filters = None - fill_value = np.nan - shape = tuple([dataset_dims[d] for d in dim_names]) + shape = tuple(dim_shapes.values()) chunks_shape = shape chunks_tag = var_tag.find("dmr:chunks", self._ns) if chunks_tag is not None: # Chunks - chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns) - if chunk_dim_tag is not None and chunk_dim_tag.text is not None: - # 1 1447 2895 -> (1, 1447, 2895) - chunks_shape = tuple(map(int, chunk_dim_tag.text.split())) + found_chunk_dims = self._parse_chunks_dimensions(chunks_tag) + chunks_shape = found_chunk_dims if found_chunk_dims is not None else shape chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) # Filters - if "compressionType" in chunks_tag.attrib: - filters = [] - # shuffle deflate --> ["shuffle", "deflate"] - compression_types = chunks_tag.attrib["compressionType"].split(" ") - for c in compression_types: - if c == "shuffle": - filters.append({"id": "shuffle", "elementsize": dtype.itemsize}) - elif c == "deflate": - filters.append( - {"id": "zlib", "level": self._default_zlib_value} - ) + filters = self._parse_filters(chunks_tag, dtype) # Attributes attrs: dict[str, Any] = {} for attr_tag in var_tag.iterfind("dap:Attribute", self._ns): attrs.update(self._parse_attribute(attr_tag)) - if "_FillValue" in attrs and attrs["_FillValue"] != "*": - fill_value = attrs["_FillValue"] - attrs.pop("_FillValue", None) + # Remove attributes only used for parsing logic + fill_value = attrs.pop("_FillValue", 0.0) + attrs.pop("fullnamepath", None) + attrs.pop("origname", None) + attrs.pop("coordinates", None) # create ManifestArray and ZArray zarray = ZArray( chunks=chunks_shape, @@ -353,11 +480,13 @@ def _parse_variable( ) marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest) encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs} - return xr.Variable(dims=dim_names, data=marr, attrs=attrs, encoding=encoding) + return xr.Variable( + dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding + ) - def _parse_attribute(self, attr_tag: ET.Element) -> dict: + def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: """ - Parse an attribute from a DMR attr tag. + Parse an attribute from a DMR attr tag. Converts the attribute value to a native python type. 
         Parameters
         ----------
         attr_tag : ET.Element
@@ -368,16 +497,87 @@
         -------
         dict
         """
-        attr = {}
+        attr: dict[str, Any] = {}
         values = []
+        if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container":
+            return attr
         dtype = np.dtype(self._dap_np_dtype[attr_tag.attrib["type"]])
         # if multiple Value tags are present, store as "key": "[v1, v2, ...]"
         for value_tag in attr_tag:
             # cast attribute to native python type using dmr provided dtype
-            values.append(dtype.type(value_tag.text).item())
+            val = (
+                dtype.type(value_tag.text).item()
+                if dtype != np.object_
+                else value_tag.text
+            )
+            if val == "*":
+                val = np.nan
+            values.append(val)
         attr[attr_tag.attrib["name"]] = values[0] if len(values) == 1 else values
         return attr

+    def _parse_filters(
+        self, chunks_tag: ET.Element, dtype: np.dtype
+    ) -> list[dict] | None:
+        """
+        Parse filters from a DMR chunks tag.
+
+        Parameters
+        ----------
+        chunks_tag : ET.Element
+            An ElementTree Element with a <dmrpp:chunks> tag.
+
+        dtype : np.dtype
+            The numpy dtype of the variable.
+
+        Returns
+        -------
+        list[dict] | None
+            E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}]
+        """
+        if "compressionType" in chunks_tag.attrib:
+            filters: list[dict] = []
+            # shuffle deflate --> ["shuffle", "deflate"]
+            compression_types = chunks_tag.attrib["compressionType"].split(" ")
+            for c in compression_types:
+                if c == "shuffle":
+                    filters.append({"id": "shuffle", "elementsize": dtype.itemsize})
+                elif c == "deflate":
+                    filters.append(
+                        {
+                            "id": "zlib",
+                            "level": int(
+                                chunks_tag.attrib.get(
+                                    "deflateLevel", self._default_zlib_value
+                                )
+                            ),
+                        }
+                    )
+            return filters
+        return None
+
+    def _parse_chunks_dimensions(
+        self, chunks_tag: ET.Element
+    ) -> tuple[int, ...] | None:
+        """
+        Parse the chunk dimensions from a DMR chunks tag. Returns None if no chunk dimensions are found.
+
+        Parameters
+        ----------
+        chunks_tag : ET.Element
+            An ElementTree Element with a <dmrpp:chunks> tag.
+
+        Returns
+        -------
+        tuple[int, ...] | None
+
+        """
+        chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns)
+        if chunk_dim_tag is not None and chunk_dim_tag.text is not None:
+            # 1 1447 2895 -> (1, 1447, 2895)
+            return tuple(map(int, chunk_dim_tag.text.split()))
+        return None
+
     def _parse_chunks(
         self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...]
) -> ChunkManifest: @@ -397,8 +597,10 @@ def _parse_chunks( ChunkManifest """ chunkmanifest: dict[ChunkKey, object] = {} - default_num: list[int] = [0 for i in range(len(chunks_shape))] - chunk_key_template = ".".join(["{}" for i in range(len(chunks_shape))]) + default_num: list[int] = ( + [0 for i in range(len(chunks_shape))] if chunks_shape else [0] + ) + chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) for chunk_tag in chunks_tag.iterfind("dmr:chunk", self._ns): chunk_num = default_num if "chunkPositionInArray" in chunk_tag.attrib: diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py index b6d8d240..d2b19d60 100644 --- a/virtualizarr/tests/test_readers/test_dmrpp.py +++ b/virtualizarr/tests/test_readers/test_dmrpp.py @@ -2,53 +2,21 @@ import xarray as xr from virtualizarr import open_virtual_dataset -from virtualizarr.kerchunk import FileType +from virtualizarr.tests import network urls = [ ( - "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20240713090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc.dmrpp", - "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20240713090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", "netcdf4", + "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5", + "dmrpp", + "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", ) ] -def match_zlib_level(result: xr.Dataset, expected: xr.Dataset): - # Fix the zlib level in the result to match the expected - # Many dmrpp's currently do not have the zlib level in the metadata (so default of -1 is used) - # Usage of the virtual Dataset is not affected, but the comparison will fail - # Remove once NASA dmrpps are updated with new dmrpp version: https://github.com/OPENDAP/bes/issues/954 - for x in result.variables: - if result[x].data.zarray.dict()["filters"] is not None: - e_filters = [ - z - for z in expected[x].data.zarray.dict()["filters"] - if "id" in z and z["id"] == "zlib" - ][0] - r_filters = [ - z - for z in result[x].data.zarray.dict()["filters"] - if "id" in z and z["id"] == "zlib" - ][0] - r_filters["level"] = e_filters["level"] - - -@pytest.mark.parametrize("dmrpp_url, data_url, data_type", urls) -def test_dmrpp_reader(dmrpp_url: str, data_url: str, data_type: str): - import earthaccess - - fs = earthaccess.get_fsspec_https_session() - result = open_virtual_dataset( - dmrpp_url, - indexes={}, - filetype=FileType("dmrpp"), - reader_options={"storage_options": fs.storage_options}, - ) - expected = open_virtual_dataset( - data_url, - indexes={}, - filetype=FileType(data_type), - reader_options={"storage_options": fs.storage_options}, - ) - match_zlib_level(result, expected) +@network +@pytest.mark.parametrize("data_type, data_url, dmrpp_type, dmrpp_url", urls) +def test_dmrpp_reader(data_type, data_url, dmrpp_type, dmrpp_url): + result = open_virtual_dataset(dmrpp_url, indexes={}, filetype=dmrpp_type) + expected = open_virtual_dataset(data_url, indexes={}) xr.testing.assert_identical(result, expected) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 3a493a7c..3b3cb7fd 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -43,9 +43,7 @@ def open_virtual_dataset( cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, 
virtual_array_class=ManifestArray, - reader_options: Optional[dict] = { - "storage_options": {"key": "", "secret": "", "anon": True} - }, + reader_options: Optional[dict] = None, ) -> xr.Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -60,7 +58,7 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'dmrpp'}. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. If not provided will attempt to automatically infer the correct filetype from header bytes. drop_variables: list[str], default is None Variables in the file to drop before returning. @@ -343,13 +341,16 @@ def variable_from_kerchunk_refs( arr_refs = kerchunk.extract_array_refs(refs, var_name) chunk_dict, zarray, zattrs = kerchunk.parse_array_refs(arr_refs) - - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs dims = zattrs.pop("_ARRAY_DIMENSIONS") - - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + if chunk_dict: + manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + else: + # This means we encountered a scalar variable of dimension 0, + # very likely that it actually has no numeric value and its only purpose + # is to communicate dataset attributes. + varr = zarray.fill_value return xr.Variable(data=varr, dims=dims, attrs=zattrs)
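With reader_options now defaulting to None, callers that previously relied on the implicit anonymous-S3 storage options have to pass them explicitly. Below is a minimal usage sketch of both call styles; it follows the updated test in passing the filetype as a string, the bucket path is hypothetical, and the storage_options keys follow fsspec conventions.

    from virtualizarr import open_virtual_dataset

    # New default: no storage options are injected for remote reads.
    vds = open_virtual_dataset(
        "s3://some-bucket/granule.nc.dmrpp", indexes={}, filetype="dmrpp"
    )

    # Equivalent of the old default, now spelled out by the caller.
    vds = open_virtual_dataset(
        "s3://some-bucket/granule.nc.dmrpp",
        indexes={},
        filetype="dmrpp",
        reader_options={"storage_options": {"anon": True}},
    )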