Skip to content

Commit

Permalink
adding reader_options kwargs to open_virtual_dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
norlandrhagen committed Mar 29, 2024
1 parent 2c5be3f commit 4c6cb63
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 6 deletions.
17 changes: 11 additions & 6 deletions virtualizarr/kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@


def read_kerchunk_references_from_file(
filepath: str, filetype: Optional[str]
filepath: str, filetype: Optional[str],
reader_options: Optional[dict] = {'storage_options': {'anon': True}}


) -> KerchunkStoreRefs:
"""
Read a single legacy file and return kerchunk references to its contents.
Expand All @@ -32,31 +35,33 @@ def read_kerchunk_references_from_file(
filetype : str, default: None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
If not provided will attempt to automatically infer the correct filetype from the filepath's extension.
reader_options: dict, default {'storage_options': {'anon': True}}
Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
so ensure reader_options match selected Kerchunk reader arguments.
"""

if filetype is None:
filetype = _automatically_determine_filetype(filepath)

if filetype.lower() == "netcdf3":
from kerchunk.netCDF3 import NetCDF3ToZarr
refs = NetCDF3ToZarr(filepath).translate()
refs = NetCDF3ToZarr(filepath, **reader_options).translate()

Check warning on line 48 in virtualizarr/kerchunk.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/kerchunk.py#L48

Added line #L48 was not covered by tests

elif filetype.lower() == "netcdf4":
from kerchunk.hdf import SingleHdf5ToZarr

refs = SingleHdf5ToZarr(filepath).translate()
refs = SingleHdf5ToZarr(filepath, **reader_options).translate()

Check warning on line 52 in virtualizarr/kerchunk.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/kerchunk.py#L52

Added line #L52 was not covered by tests
elif filetype == "grib":
# TODO Grib files should be handled as a DataTree object
# see https://github.com/TomNicholas/VirtualiZarr/issues/11
raise NotImplementedError(f"Unsupported file type: {filetype}")
elif filetype.lower() == "tiff":
from kerchunk.tiff import tiff_to_zarr

refs = tiff_to_zarr(filepath)
refs = tiff_to_zarr(filepath, **reader_options)

Check warning on line 60 in virtualizarr/kerchunk.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/kerchunk.py#L60

Added line #L60 was not covered by tests
elif filetype.lower() == "fits":
from kerchunk.fits import process_file

refs = process_file(filepath)
refs = process_file(filepath, **reader_options)

Check warning on line 64 in virtualizarr/kerchunk.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/kerchunk.py#L64

Added line #L64 was not covered by tests
else:
raise NotImplementedError(f"Unsupported file type: {filetype}")

Expand Down
10 changes: 10 additions & 0 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import xarray as xr
from xarray.core.indexes import Index
import pytest

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
Expand Down Expand Up @@ -268,3 +269,12 @@ def test_combine_by_coords(self, netcdf4_files):
)

assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing


@pytest.mark.xfail(
    reason="currently should xfail for None filetype and None indexes.",
    run=False,
)
@pytest.mark.parametrize(
    "filetype", ["netcdf4", None], ids=["netcdf4 filetype", "None filetype"]
)
@pytest.mark.parametrize(
    "indexes", [None, {}], ids=["None index", "empty dict index"]
)
def test_anon_read_s3(filetype, indexes):
    """Open a NetCDF file addressed by an ``s3://`` URL as a virtual dataset.

    The call passes ``reader_options={'storage_options': {'anon': True}}`` so
    the reader is asked for anonymous access. The test is parametrized over an
    explicit ``"netcdf4"`` filetype versus ``None``, and over ``indexes`` being
    ``None`` versus an empty dict.

    The ``xfail`` marker uses ``run=False``, so pytest reports every
    parametrization as xfailed without executing the body.
    """
    # Guard inside the test: a module-level ``pytest.importorskip`` would skip
    # every test in this file whenever s3fs is not installed, not just this one.
    pytest.importorskip("s3fs")

    fpath = "s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/CESM2/historical/r4i1p1f1/pr/pr_day_CESM2_historical_r4i1p1f1_gn_2010.nc"
    vds = open_virtual_dataset(
        fpath,
        filetype=filetype,
        indexes=indexes,
        reader_options={"storage_options": {"anon": True}},
    )
    assert vds

Check warning on line 280 in virtualizarr/tests/test_xarray.py

View check run for this annotation

Codecov / codecov/patch

virtualizarr/tests/test_xarray.py#L275-L280

Added lines #L275 - L280 were not covered by tests
5 changes: 5 additions & 0 deletions virtualizarr/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def open_virtual_dataset(
drop_variables: Optional[List[str]] = None,
indexes: Optional[Mapping[str, Index]] = None,
virtual_array_class=ManifestArray,
reader_options: Optional[dict] = {'storage_options': {'anon': True}}
) -> xr.Dataset:
"""
Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
Expand All @@ -48,12 +49,16 @@ def open_virtual_dataset(
virtual_array_class
Virtual array class to use to represent the references to the chunks in each on-disk array.
Currently can only be ManifestArray, but once VirtualZarrArray is implemented the default should be changed to that.
reader_options: dict, default {'storage_options': {'anon': True}}
Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
so ensure reader_options match selected Kerchunk reader arguments.
"""

# this is the only place we actually always need to use kerchunk directly
vds_refs = kerchunk.read_kerchunk_references_from_file(
filepath=filepath,
filetype=filetype,
reader_options=reader_options,
)

if indexes is None:
Expand Down

0 comments on commit 4c6cb63

Please sign in to comment.