From ef8aa9c6e30590a28fef5414ac817e2656764675 Mon Sep 17 00:00:00 2001
From: ayushnag <35325113+ayushnag@users.noreply.github.com>
Date: Sat, 3 Aug 2024 11:41:09 +0530
Subject: [PATCH] modularize dmrpp.py

---
 virtualizarr/readers/dmrpp.py                 | 392 +++++++++++++-----
 virtualizarr/tests/test_readers/test_dmrpp.py |  50 +--
 virtualizarr/xarray.py                        |  19 +-
 3 files changed, 316 insertions(+), 145 deletions(-)

diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py
index 1ff66c8b..0d6453f5 100644
--- a/virtualizarr/readers/dmrpp.py
+++ b/virtualizarr/readers/dmrpp.py
@@ -16,6 +16,7 @@ class DMRParser:
     """
     Parses a DMR++ file and creates a virtual xr.Dataset.
     Handles groups, dimensions, coordinates, data variables, encoding, chunk manifests, and attributes.
+    Highly modular to allow support for older dmrpp schema versions
     """

     # DAP and DMRPP XML namespaces
@@ -35,13 +36,13 @@ class DMRParser:
         "UInt32": "uint32",
         "Int64": "int64",
         "UInt64": "uint64",
-        "Url": "str",
+        "Url": "object",
         "Float32": "float32",
         "Float64": "float64",
-        "String": "str",
+        "String": "object",
     }
-    # Default zlib compression value (-1 means default, currently level 6 is default)
-    _default_zlib_value = -1
+    # Default zlib compression value
+    _default_zlib_value = 6
     # Encoding keys that should be cast to float
     _encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
@@ -82,7 +83,7 @@ def parse_dataset(self, group=None) -> xr.Dataset:
             group = os.path.normpath(group).removeprefix(
                 "/"
             )  # ensure group is in form "a/b/c"
-        if self.data_filepath.endswith(".h5"):
+        if self._is_hdf5(self.root):
             return self._parse_hdf5_dataset(self.root, group)
         if self.data_filepath.endswith(".nc"):
             return self._parse_netcdf4_dataset(self.root, group)
@@ -103,6 +104,7 @@ def _parse_netcdf4_dataset(
         group : str
             The group to parse. If None, and no groups are present, the dataset is parsed.
             If None and groups are present, the first group is parsed.
+
         Returns
         -------
         xr.Dataset
@@ -132,6 +134,11 @@ def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]:
         Split the input element into several ET.Elements by netcdf4 group
         E.g. {"left": <ET.Element>, "right": <ET.Element>}

+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
         Returns
         -------
         dict[str, ET.Element]
@@ -144,7 +151,17 @@ def _split_netcdf4(self, root: ET.Element) -> dict[str, ET.Element]:
             all_groups[os.path.normpath(group_tag.attrib["name"])] = group_tag
         return all_groups

-    def _parse_hdf5_dataset(self, root: ET.Element, group: str) -> xr.Dataset:
+    def _is_hdf5(self, root: ET.Element) -> bool:
+        """Check if the DMR file is HDF5 based."""
+        if root.find(".//dap:Attribute[@name='fullnamepath']", self._ns) is not None:
+            return True
+        if root.find("./dap:Attribute[@name='HDF5_GLOBAL']", self._ns) is not None:
+            return True
+        return False
+
+    def _parse_hdf5_dataset(
+        self, root: ET.Element, group: Optional[str] = None
+    ) -> xr.Dataset:
         """
         Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group.
         Set root to the given group.
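The new _is_hdf5 probe replaces the earlier file-extension check, so the HDF5 parsing path is now chosen from the XML itself. Below is a minimal, illustrative sketch of that lookup on a trimmed, hypothetical DMR++ snippet; the namespace URI and attribute layout are assumptions about typical DAP4/Hyrax output, not taken from this patch.

    import xml.etree.ElementTree as ET

    # Hypothetical, trimmed DMR++ document for illustration only.
    dmr_xml = """\
    <Dataset xmlns="http://xml.opendap.org/ns/DAP/4.0#" name="example.h5">
      <Float32 name="temperature">
        <Dim name="/time"/>
        <Attribute name="fullnamepath" type="String">
          <Value>/gt1r/heights/temperature</Value>
        </Attribute>
      </Float32>
    </Dataset>
    """

    ns = {"dap": "http://xml.opendap.org/ns/DAP/4.0#"}  # assumed DAP4 namespace
    root = ET.fromstring(dmr_xml)

    # Same style of lookup as DMRParser._is_hdf5: HDF5-produced dmrpp files carry
    # per-variable 'fullnamepath' attributes or an HDF5_GLOBAL container attribute.
    is_hdf5 = (
        root.find(".//dap:Attribute[@name='fullnamepath']", ns) is not None
        or root.find("./dap:Attribute[@name='HDF5_GLOBAL']", ns) is not None
    )
    print(is_hdf5)  # True for this snippet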
@@ -163,25 +180,43 @@ def _parse_hdf5_dataset(self, root: ET.Element, group: str) -> xr.Dataset:
         xr.Dataset
         """
         all_groups = self._split_hdf5(root=root)
+        if len(all_groups) == 0:
+            raise ValueError("No groups found in HDF based DMR file")
+        if group is None:
+            # pick a random group if no group is specified
+            group = next(iter(all_groups))
+        attrs = {}
+        for attr_tag in root.iterfind("dap:Attribute", self._ns):
+            if attr_tag.attrib["type"] != "Container":
+                attrs.update(self._parse_attribute(attr_tag))
         if group in all_groups:
             # replace aliased variable names with original names: gt1r_heights -> heights
-            orignames = {}
-            vars_tags: list[ET.Element] = []
-            for dap_dtype in self._dap_np_dtype:
-                vars_tags += all_groups[group].findall(f"dap:{dap_dtype}", self._ns)
-            for var_tag in vars_tags:
-                origname_tag = var_tag.find(
-                    "./dap:Attribute[@name='origname']/dap:Value", self._ns
-                )
-                if origname_tag is not None and origname_tag.text is not None:
-                    orignames[var_tag.attrib["name"]] = origname_tag.text
-            return self._parse_dataset(all_groups[group]).rename(orignames)
+            orignames = self.find_original_names(all_groups[group])
+            vds = self._parse_dataset(all_groups[group])
+            # Only one group so found attrs are global attrs
+            if len(all_groups) == 1:
+                vds.attrs.update(attrs)
+            return vds.rename(orignames)
         raise ValueError(f"Group {group} not found in HDF5 DMR file")

+    def find_original_names(self, root: ET.Element) -> dict[str, str]:
+        orignames: dict[str, str] = {}
+        vars_tags: list[ET.Element] = []
+        for dap_dtype in self._dap_np_dtype:
+            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
+        for var_tag in vars_tags:
+            origname_tag = var_tag.find(
+                "./dap:Attribute[@name='origname']/dap:Value", self._ns
+            )
+            if origname_tag is not None and origname_tag.text is not None:
+                orignames[var_tag.attrib["name"]] = origname_tag.text
+        return orignames
+
     def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]:
         """
         Split the input element into several ET.Elements by HDF5 group
-        E.g. {"gtr1/heights": <ET.Element>, "gtr1/temperatures": <ET.Element>}
+        E.g. {"gtr1/heights": <ET.Element>, "gtr1/temperatures": <ET.Element>}. Builds up new elements
+        each with dimensions, variables, and attributes.
Parameters ---------- @@ -192,7 +227,8 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: ------- dict[str, ET.Element] """ - all_groups: dict[str, ET.Element] = defaultdict( + # Add all variable, dimension, and attribute tags to their respective groups + groups_roots: dict[str, ET.Element] = defaultdict( lambda: ET.Element(root.tag, root.attrib) ) group_dims: dict[str, set[str]] = defaultdict( @@ -209,80 +245,182 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]: if fullname_tag is not None and fullname_tag.text is not None: # '/gt1r/heights/ph_id_pulse' -> 'gt1r/heights' group_name = os.path.dirname(fullname_tag.text).removeprefix("/") - all_groups[group_name].append(var_tag) - for dim_tag in var_tag.iterfind("dap:Dim", self._ns): - group_dims[group_name].add(dim_tag.attrib["name"].removeprefix("/")) + groups_roots[group_name].append(var_tag) + dim_tags = var_tag.findall("dap:Dim", self._ns) + dims = self._parse_multi_dims(dim_tags) + group_dims[group_name].update(dims.keys()) # Dimensions for dim_tag in root.iterfind("dap:Dimension", self._ns): - for group_name, dims in group_dims.items(): - if dim_tag.attrib["name"] in dims: - all_groups[group_name].append(dim_tag) + for g, d in group_dims.items(): + if dim_tag.attrib["name"] in d: + groups_roots[g].append(dim_tag) # Attributes - for attr_tag in root.iterfind("dap:Attribute", self._ns): - fullname_tag = attr_tag.find( - "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns - ) - if fullname_tag is not None and fullname_tag.text is not None: - group_name = fullname_tag.text - # Add all attributes to the new dataset; fullnamepath is generally excluded - if group_name in all_groups: - all_groups[group_name].extend( - [a for a in attr_tag if a.attrib["name"] != "fullnamepath"] - ) - return all_groups + container_attr_tag = root.find("dap:Attribute[@name='HDF5_GLOBAL']", self._ns) + if container_attr_tag is None: + attrs_tags = root.findall("dap:Attribute", self._ns) + for attr_tag in attrs_tags: + fullname_tag = attr_tag.find( + "./dap:Attribute[@name='fullnamepath']/dap:Value", self._ns + ) + if fullname_tag is not None and fullname_tag.text is not None: + group_name = os.path.dirname(fullname_tag.text).removeprefix("/") + # Add all attributes to the new dataset + groups_roots[group_name].extend(attr_tag) + else: + groups_roots[next(iter(groups_roots))].extend(container_attr_tag) + return groups_roots def _parse_dataset(self, root: ET.Element) -> xr.Dataset: """ Parse the dataset using the root element of the DMR file. + Parameters + ---------- + root : ET.Element + The root element of the DMR file. 
+
         Returns
         -------
         xr.Dataset
         """
         # Dimension names and sizes
-        dataset_dims: dict[str, int] = {}
-        for dim_tag in root.iterfind("dap:Dimension", self._ns):
-            dataset_dims[dim_tag.attrib["name"]] = int(dim_tag.attrib["size"])
+        dim_tags = root.findall("dap:Dimension", self._ns)
+        dataset_dims = self._parse_multi_dims(dim_tags)
         # Data variables and coordinates
-        vars_tags: list[ET.Element] = []
-        for dap_dtype in self._dap_np_dtype:
-            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
-        # Coordinate names (using Map tags and coordinates attribute)
-        coord_names: set[str] = set()
-        for var_tag in vars_tags:
-            coord_tag = var_tag.find(
-                "./dap:Attribute[@name='coordinates']/dap:Value", self._ns
-            )
-            if coord_tag is not None and coord_tag.text is not None:
-                coord_names.update(coord_tag.text.split(" "))
-            for map_tag in var_tag.iterfind("dap:Map", self._ns):
-                coord_names.add(map_tag.attrib["name"].removeprefix("/"))
+        coord_names = self._find_coord_names(root)
         # if no coord_names are found or coords don't include dims, dims are used as coords
         if len(coord_names) == 0 or len(coord_names) < len(dataset_dims):
             coord_names = set(dataset_dims.keys())
         # Seperate and parse coords + data variables
-        coords: dict[str, xr.Variable] = {}
+        coord_vars: dict[str, xr.Variable] = {}
         data_vars: dict[str, xr.Variable] = {}
-        for var_tag in vars_tags:
+        for var_tag in self._find_var_tags(root):
+            variable = self._parse_variable(var_tag, dataset_dims)
             if var_tag.attrib["name"] in coord_names:
-                coords[var_tag.attrib["name"]] = self._parse_variable(
-                    var_tag, dataset_dims
-                )
+                coord_vars[var_tag.attrib["name"]] = variable
             else:
-                data_vars[var_tag.attrib["name"]] = self._parse_variable(
-                    var_tag, dataset_dims
-                )
+                data_vars[var_tag.attrib["name"]] = variable
         # Attributes
         attrs: dict[str, str] = {}
         for attr_tag in self.root.iterfind("dap:Attribute", self._ns):
-            if attr_tag.attrib["type"] != "Container":  # container = nested attributes
-                attrs.update(self._parse_attribute(attr_tag))
+            attrs.update(self._parse_attribute(attr_tag))
         return xr.Dataset(
             data_vars=data_vars,
-            coords=xr.Coordinates(coords=coords, indexes={}),
+            coords=xr.Coordinates(coords=coord_vars, indexes={}),
             attrs=attrs,
         )

+    def _find_var_tags(self, root: ET.Element) -> list[ET.Element]:
+        """
+        Find all variable tags in the DMR file. Also known as array tags.
+        Tags are labeled with the DAP data type. E.g. <Float32>, <Int16>, <String>
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        list[ET.Element]
+        """
+        vars_tags: list[ET.Element] = []
+        for dap_dtype in self._dap_np_dtype:
+            vars_tags += root.findall(f"dap:{dap_dtype}", self._ns)
+        return vars_tags
+
+    def _find_coord_names(self, root: ET.Element) -> set[str]:
+        """
+        Find the name of all coordinates in root. Checks inside all variables and global attributes.
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element of the DMR file.
+
+        Returns
+        -------
+        set[str] : The set of unique coordinate names.
+        """
+        # Check for coordinate names within each variable attributes
+        coord_names: set[str] = set()
+        for var_tag in self._find_var_tags(root):
+            coord_tag = var_tag.find(
+                "./dap:Attribute[@name='coordinates']/dap:Value", self._ns
+            )
+            if coord_tag is not None and coord_tag.text is not None:
+                coord_names.update(coord_tag.text.split(" "))
+            for map_tag in var_tag.iterfind("dap:Map", self._ns):
+                coord_names.add(map_tag.attrib["name"].removeprefix("/"))
+        # Check for coordinate names in a global attribute
+        coord_tag = var_tag.find("./dap:Attribute[@name='coordinates']", self._ns)
+        if coord_tag is not None and coord_tag.text is not None:
+            coord_names.update(coord_tag.text.split(" "))
+        return coord_names
+
+    def _parse_dim(self, root: ET.Element) -> dict[str, int | None]:
+        """
+        Parse single <Dim> or <Dimension> tag
+
+        If the tag has no name attribute, it is a phony dimension. E.g. <Dim size="300"/> --> {"phony_dim": 300}
+        If the tag has no size attribute, it is an unlimited dimension. E.g. <Dim name="time"/> --> {"time": None}
+        If the tag has both name and size attributes, it is a regular dimension. E.g. <Dim name="lat" size="1447"/> --> {"lat": 1447}
+
+        Parameters
+        ----------
+        root : ET.Element
+            The root element Dim/Dimension tag
+
+        Returns
+        -------
+        dict
+            E.g. {"time": 1, "lat": 1447, "lon": 2895}, {"phony_dim": 300}, {"time": None, "lat": None, "lon": None}
+        """
+        if "name" not in root.attrib and "size" in root.attrib:
+            return {"phony_dim": int(root.attrib["size"])}
+        if "name" in root.attrib and "size" not in root.attrib:
+            return {os.path.basename(root.attrib["name"]): None}
+        if "name" in root.attrib and "size" in root.attrib:
+            return {os.path.basename(root.attrib["name"]): int(root.attrib["size"])}
+        raise ValueError("Not enough information to parse Dim/Dimension tag")
+
+    def _parse_multi_dims(
+        self, dim_tags: list[ET.Element], global_dims: dict[str, int] = {}
+    ) -> dict:
+        """
+        Parse multiple <Dim> or <Dimension> tags. Generally <Dim> tags are found within dmrpp variable tags.
+
+        Returns best possible matching of {dimension: shape} present in the list and global_dims.
+        E.g. tags=(Dim("lat", None), Dim("lon", None)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"lat": 100, "lon": 100}
+
+        E.g. tags=(Dim("time", None), Dim("", 200)) and global_dims={"lat": 100, "lon": 100, "time": 5} --> {"time": 5, "phony_dim_1": 200}
+
+        This function is often used to fill in missing sizes from the global_dims. E.g. Variable tags may contain
+        only dimension names and not sizes. If the {name: size} matching is known from the global_dims, it is used
+        to fill in the missing sizes.
+
+        Parameters
+        ----------
+        dim_tags : list[ET.Element]
+            A list of ElementTree Elements representing dimensions in the DMR file.
+
+        global_dims : dict
+            A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895}
+
+        Returns
+        -------
+        dict
+            E.g. {"time": 1, "lat": 1447, "lon": 2895}
+        """
+        dims: dict[str, int | None] = {}
+        for dim_tag in dim_tags:
+            dim: dict[str, int | None] = self._parse_dim(dim_tag)
+            if "phony_dim" in dim:
+                dims["phony_dim_" + str(len(dims))] = dim["phony_dim"]
+            else:
+                dims.update(dim)
+        for name, size in list(dims.items()):
+            if name in global_dims and size is None:
+                dims[name] = global_dims[name]
+        return dims
+
     def _parse_variable(
         self, var_tag: ET.Element, dataset_dims: dict[str, int]
     ) -> xr.Variable:
@@ -296,52 +434,41 @@ def _parse_variable(
         dataset_dims : dict
             A dictionary of dimension names and sizes. E.g. {"time": 1, "lat": 1447, "lon": 2895}
-            Must contain at least all the dimensions used by the variable.
+ Must contain at least all the dimensions used by the variable. Necessary since the variable + metadata only contains the dimension names and not the sizes. Returns ------- xr.Variable """ # Dimension names - dim_names: list[str] = [] - for dim_tag in var_tag.iterfind("dap:Dim", self._ns): - dim_names.append(os.path.basename(dim_tag.attrib["name"])) + dim_tags = var_tag.findall("dap:Dim", self._ns) + dim_shapes = self._parse_multi_dims(dim_tags, dataset_dims) # convert DAP dtype to numpy dtype dtype = np.dtype( self._dap_np_dtype[var_tag.tag.removeprefix("{" + self._ns["dap"] + "}")] ) # Chunks and Filters filters = None - fill_value = np.nan - shape = tuple([dataset_dims[d] for d in dim_names]) + shape = tuple(dim_shapes.values()) chunks_shape = shape chunks_tag = var_tag.find("dmr:chunks", self._ns) if chunks_tag is not None: # Chunks - chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns) - if chunk_dim_tag is not None and chunk_dim_tag.text is not None: - # 1 1447 2895 -> (1, 1447, 2895) - chunks_shape = tuple(map(int, chunk_dim_tag.text.split())) + found_chunk_dims = self._parse_chunks_dimensions(chunks_tag) + chunks_shape = found_chunk_dims if found_chunk_dims is not None else shape chunkmanifest = self._parse_chunks(chunks_tag, chunks_shape) # Filters - if "compressionType" in chunks_tag.attrib: - filters = [] - # shuffle deflate --> ["shuffle", "deflate"] - compression_types = chunks_tag.attrib["compressionType"].split(" ") - for c in compression_types: - if c == "shuffle": - filters.append({"id": "shuffle", "elementsize": dtype.itemsize}) - elif c == "deflate": - filters.append( - {"id": "zlib", "level": self._default_zlib_value} - ) + filters = self._parse_filters(chunks_tag, dtype) # Attributes attrs: dict[str, Any] = {} for attr_tag in var_tag.iterfind("dap:Attribute", self._ns): attrs.update(self._parse_attribute(attr_tag)) - if "_FillValue" in attrs and attrs["_FillValue"] != "*": - fill_value = attrs["_FillValue"] - attrs.pop("_FillValue", None) + # Remove attributes only used for parsing logic + fill_value = attrs.pop("_FillValue", 0.0) + attrs.pop("fullnamepath", None) + attrs.pop("origname", None) + attrs.pop("coordinates", None) # create ManifestArray and ZArray zarray = ZArray( chunks=chunks_shape, @@ -353,11 +480,13 @@ def _parse_variable( ) marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest) encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs} - return xr.Variable(dims=dim_names, data=marr, attrs=attrs, encoding=encoding) + return xr.Variable( + dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding + ) - def _parse_attribute(self, attr_tag: ET.Element) -> dict: + def _parse_attribute(self, attr_tag: ET.Element) -> dict[str, Any]: """ - Parse an attribute from a DMR attr tag. + Parse an attribute from a DMR attr tag. Converts the attribute value to a native python type. 
         Parameters
         ----------
         attr_tag : ET.Element
@@ -368,16 +497,87 @@
         -------
         dict
         """
-        attr = {}
+        attr: dict[str, Any] = {}
         values = []
+        if "type" in attr_tag.attrib and attr_tag.attrib["type"] == "Container":
+            return attr
         dtype = np.dtype(self._dap_np_dtype[attr_tag.attrib["type"]])
         # if multiple Value tags are present, store as "key": "[v1, v2, ...]"
         for value_tag in attr_tag:
             # cast attribute to native python type using dmr provided dtype
-            values.append(dtype.type(value_tag.text).item())
+            val = (
+                dtype.type(value_tag.text).item()
+                if dtype != np.object_
+                else value_tag.text
+            )
+            if val == "*":
+                val = np.nan
+            values.append(val)
         attr[attr_tag.attrib["name"]] = values[0] if len(values) == 1 else values
         return attr

+    def _parse_filters(
+        self, chunks_tag: ET.Element, dtype: np.dtype
+    ) -> list[dict] | None:
+        """
+        Parse filters from a DMR chunks tag.
+
+        Parameters
+        ----------
+        chunks_tag : ET.Element
+            An ElementTree Element with a <dmrpp:chunks> tag.
+
+        dtype : np.dtype
+            The numpy dtype of the variable.
+
+        Returns
+        -------
+        list[dict] | None
+            E.g. [{"id": "shuffle", "elementsize": 4}, {"id": "zlib", "level": 4}]
+        """
+        if "compressionType" in chunks_tag.attrib:
+            filters: list[dict] = []
+            # shuffle deflate --> ["shuffle", "deflate"]
+            compression_types = chunks_tag.attrib["compressionType"].split(" ")
+            for c in compression_types:
+                if c == "shuffle":
+                    filters.append({"id": "shuffle", "elementsize": dtype.itemsize})
+                elif c == "deflate":
+                    filters.append(
+                        {
+                            "id": "zlib",
+                            "level": int(
+                                chunks_tag.attrib.get(
+                                    "deflateLevel", self._default_zlib_value
+                                )
+                            ),
+                        }
+                    )
+            return filters
+        return None
+
+    def _parse_chunks_dimensions(
+        self, chunks_tag: ET.Element
+    ) -> tuple[int, ...] | None:
+        """
+        Parse the chunk dimensions from a DMR chunks tag. Returns None if no chunk dimensions are found.
+
+        Parameters
+        ----------
+        chunks_tag : ET.Element
+            An ElementTree Element with a <dmrpp:chunks> tag.
+
+        Returns
+        -------
+        tuple[int, ...] | None
+
+        """
+        chunk_dim_tag = chunks_tag.find("dmr:chunkDimensionSizes", self._ns)
+        if chunk_dim_tag is not None and chunk_dim_tag.text is not None:
+            # 1 1447 2895 -> (1, 1447, 2895)
+            return tuple(map(int, chunk_dim_tag.text.split()))
+        return None
+
     def _parse_chunks(
         self, chunks_tag: ET.Element, chunks_shape: tuple[int, ...]
) -> ChunkManifest: @@ -397,8 +597,10 @@ def _parse_chunks( ChunkManifest """ chunkmanifest: dict[ChunkKey, object] = {} - default_num: list[int] = [0 for i in range(len(chunks_shape))] - chunk_key_template = ".".join(["{}" for i in range(len(chunks_shape))]) + default_num: list[int] = ( + [0 for i in range(len(chunks_shape))] if chunks_shape else [0] + ) + chunk_key_template = ".".join(["{}" for i in range(len(default_num))]) for chunk_tag in chunks_tag.iterfind("dmr:chunk", self._ns): chunk_num = default_num if "chunkPositionInArray" in chunk_tag.attrib: diff --git a/virtualizarr/tests/test_readers/test_dmrpp.py b/virtualizarr/tests/test_readers/test_dmrpp.py index b6d8d240..d2b19d60 100644 --- a/virtualizarr/tests/test_readers/test_dmrpp.py +++ b/virtualizarr/tests/test_readers/test_dmrpp.py @@ -2,53 +2,21 @@ import xarray as xr from virtualizarr import open_virtual_dataset -from virtualizarr.kerchunk import FileType +from virtualizarr.tests import network urls = [ ( - "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20240713090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc.dmrpp", - "https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20240713090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc", "netcdf4", + "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5", + "dmrpp", + "https://github.com/OPENDAP/bes/raw/3e518f6dc2f625b0b83cfb6e6fd5275e4d6dcef1/modules/dmrpp_module/data/dmrpp/chunked_threeD.h5.dmrpp", ) ] -def match_zlib_level(result: xr.Dataset, expected: xr.Dataset): - # Fix the zlib level in the result to match the expected - # Many dmrpp's currently do not have the zlib level in the metadata (so default of -1 is used) - # Usage of the virtual Dataset is not affected, but the comparison will fail - # Remove once NASA dmrpps are updated with new dmrpp version: https://github.com/OPENDAP/bes/issues/954 - for x in result.variables: - if result[x].data.zarray.dict()["filters"] is not None: - e_filters = [ - z - for z in expected[x].data.zarray.dict()["filters"] - if "id" in z and z["id"] == "zlib" - ][0] - r_filters = [ - z - for z in result[x].data.zarray.dict()["filters"] - if "id" in z and z["id"] == "zlib" - ][0] - r_filters["level"] = e_filters["level"] - - -@pytest.mark.parametrize("dmrpp_url, data_url, data_type", urls) -def test_dmrpp_reader(dmrpp_url: str, data_url: str, data_type: str): - import earthaccess - - fs = earthaccess.get_fsspec_https_session() - result = open_virtual_dataset( - dmrpp_url, - indexes={}, - filetype=FileType("dmrpp"), - reader_options={"storage_options": fs.storage_options}, - ) - expected = open_virtual_dataset( - data_url, - indexes={}, - filetype=FileType(data_type), - reader_options={"storage_options": fs.storage_options}, - ) - match_zlib_level(result, expected) +@network +@pytest.mark.parametrize("data_type, data_url, dmrpp_type, dmrpp_url", urls) +def test_dmrpp_reader(data_type, data_url, dmrpp_type, dmrpp_url): + result = open_virtual_dataset(dmrpp_url, indexes={}, filetype=dmrpp_type) + expected = open_virtual_dataset(data_url, indexes={}) xr.testing.assert_identical(result, expected) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 3a493a7c..3b3cb7fd 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -43,9 +43,7 @@ def open_virtual_dataset( cftime_variables: Iterable[str] | None = None, indexes: Mapping[str, Index] | None = None, 
virtual_array_class=ManifestArray, - reader_options: Optional[dict] = { - "storage_options": {"key": "", "secret": "", "anon": True} - }, + reader_options: Optional[dict] = None, ) -> xr.Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -60,7 +58,7 @@ def open_virtual_dataset( File path to open as a set of virtualized zarr arrays. filetype : FileType, default None Type of file to be opened. Used to determine which kerchunk file format backend to use. - Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'dmrpp'}. + Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}. If not provided will attempt to automatically infer the correct filetype from header bytes. drop_variables: list[str], default is None Variables in the file to drop before returning. @@ -343,13 +341,16 @@ def variable_from_kerchunk_refs( arr_refs = kerchunk.extract_array_refs(refs, var_name) chunk_dict, zarray, zattrs = kerchunk.parse_array_refs(arr_refs) - - manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) - # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs dims = zattrs.pop("_ARRAY_DIMENSIONS") - - varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + if chunk_dict: + manifest = ChunkManifest._from_kerchunk_chunk_dict(chunk_dict) + varr = virtual_array_class(zarray=zarray, chunkmanifest=manifest) + else: + # This means we encountered a scalar variable of dimension 0, + # very likely that it actually has no numeric value and its only purpose + # is to communicate dataset attributes. + varr = zarray.fill_value return xr.Variable(data=varr, dims=dims, attrs=zattrs)
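With reader_options now defaulting to None, callers that previously relied on the implicit anonymous-S3 storage options have to pass them explicitly. Below is a minimal usage sketch of both call styles; it follows the updated test in passing the filetype as a string, the bucket path is hypothetical, and the storage_options keys follow fsspec conventions.

    from virtualizarr import open_virtual_dataset

    # New default: no storage options are injected for remote reads.
    vds = open_virtual_dataset(
        "s3://some-bucket/granule.nc.dmrpp", indexes={}, filetype="dmrpp"
    )

    # Equivalent of the old default, now spelled out by the caller.
    vds = open_virtual_dataset(
        "s3://some-bucket/granule.nc.dmrpp",
        indexes={},
        filetype="dmrpp",
        reader_options={"storage_options": {"anon": True}},
    )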