diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml
index 7d0f29c0960..22076f88854 100644
--- a/ci/requirements/min-all-deps.yml
+++ b/ci/requirements/min-all-deps.yml
@@ -16,11 +16,11 @@ dependencies:
   - dask-core=2022.7
   - distributed=2022.7
   - flox=0.5
-  - h5netcdf=1.0
+  - h5netcdf=1.1
   # h5py and hdf5 tend to cause conflicts
   # for e.g. hdf5 1.12 conflicts with h5py=3.1
   # prioritize bumping other packages instead
-  - h5py=3.6
+  - h5py=3.7
   - hdf5=1.12
   - hypothesis
   - iris=3.2
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 2fb76cfe8c2..928a2c95cac 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -32,13 +32,14 @@ New Features
   By `Sam Levang `_.
 - Allow the usage of h5py drivers (eg: ros3) via h5netcdf (:pull:`8360`).
   By `Ezequiel Cimadevilla `_.
+- Enable VLEN string fill_values, preserve VLEN string dtypes (:issue:`1647`, :issue:`7652`, :issue:`7868`, :pull:`7869`).
+  By `Kai Mühlbauer `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
 - drop support for `cdms2 `_. Please use
   `xcdat `_ instead (:pull:`8441`).
   By `Justus Magin `_.
-
 - Following pandas, :py:meth:`infer_freq` will return ``"Y"``, ``"YS"``, ``"QE"``,
   ``"ME"``, ``"h"``, ``"min"``, ``"s"``, ``"ms"``, ``"us"``, or ``"ns"`` instead
   of ``"A"``, ``"AS"``, ``"Q"``, ``"M"``, ``"H"``, ``"T"``,
@@ -46,6 +47,8 @@ Breaking changes
   deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`).
   By `Spencer Clark `_.
 - Bump minimum tested pint version to ``>=0.22``. By `Deepak Cherian `_.
+- Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``.
+  By `Kai Mühlbauer `_.
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index a68a44b5f6f..d9385fc68a9 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -271,15 +271,6 @@ def prepare_variable(
         dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)
 
         fillvalue = attrs.pop("_FillValue", None)
-        if dtype is str and fillvalue is not None:
-            raise NotImplementedError(
-                "h5netcdf does not yet support setting a fill value for "
-                "variable-length strings "
-                "(https://github.com/h5netcdf/h5netcdf/issues/37). "
-                f"Either remove '_FillValue' from encoding on variable {name!r} "
-                "or set {'dtype': 'S1'} in encoding to use the fixed width "
-                "NC_CHAR type."
-            )
 
         if dtype is str:
             dtype = h5py.special_dtype(vlen=str)
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
index f21f15bf795..1aee4c1c726 100644
--- a/xarray/backends/netCDF4_.py
+++ b/xarray/backends/netCDF4_.py
@@ -490,16 +490,6 @@ def prepare_variable(
 
         fill_value = attrs.pop("_FillValue", None)
 
-        if datatype is str and fill_value is not None:
-            raise NotImplementedError(
-                "netCDF4 does not yet support setting a fill value for "
-                "variable-length strings "
-                "(https://github.com/Unidata/netcdf4-python/issues/730). "
-                f"Either remove '_FillValue' from encoding on variable {name!r} "
-                "or set {'dtype': 'S1'} in encoding to use the fixed width "
-                "NC_CHAR type."
-            )
-
         encoding = _extract_nc4_variable_encoding(
             variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims
         )
diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index df660f90d9e..487197605e8 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -562,3 +562,15 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
 
     def decode(self):
         raise NotImplementedError()
+
+
+class ObjectVLenStringCoder(VariableCoder):
+    def encode(self):
+        raise NotImplementedError()
+
+    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
+        if variable.dtype == object and variable.encoding.get("dtype", False) == str:
+            variable = variable.astype(variable.encoding["dtype"])
+            return variable
+        else:
+            return variable
diff --git a/xarray/conventions.py b/xarray/conventions.py
index cf207f0c37a..75f816e6cb4 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -265,6 +265,10 @@ def decode_cf_variable(
             var = strings.CharacterArrayCoder().decode(var, name=name)
         var = strings.EncodedStringCoder().decode(var)
 
+    if original_dtype == object:
+        var = variables.ObjectVLenStringCoder().decode(var)
+        original_dtype = var.dtype
+
     if mask_and_scale:
         for coder in [
             variables.UnsignedIntegerCoder(),
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 80b6951dbff..85248b5c40a 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -864,12 +864,13 @@ def test_roundtrip_empty_vlen_string_array(self) -> None:
         assert check_vlen_dtype(original["a"].dtype) == str
         with self.roundtrip(original) as actual:
             assert_identical(original, actual)
-            assert object == actual["a"].dtype
-            assert actual["a"].dtype == original["a"].dtype
-            # only check metadata for capable backends
-            # eg. NETCDF3 based backends do not roundtrip metadata
-            if actual["a"].dtype.metadata is not None:
-                assert check_vlen_dtype(actual["a"].dtype) == str
+            if np.issubdtype(actual["a"].dtype, object):
+                # only check metadata for capable backends
+                # eg. NETCDF3 based backends do not roundtrip metadata
+                if actual["a"].dtype.metadata is not None:
+                    assert check_vlen_dtype(actual["a"].dtype) == str
+            else:
+                assert actual["a"].dtype == np.dtype("<U1")
@@ ... @@ def test_write_groups(self) -> None:
             with self.open(tmp_file, group="data/2") as actual2:
                 assert_identical(data2, actual2)
 
-    def test_encoding_kwarg_vlen_string(self) -> None:
-        for input_strings in [[b"foo", b"bar", b"baz"], ["foo", "bar", "baz"]]:
-            original = Dataset({"x": input_strings})
-            expected = Dataset({"x": ["foo", "bar", "baz"]})
-            kwargs = dict(encoding={"x": {"dtype": str}})
-            with self.roundtrip(original, save_kwargs=kwargs) as actual:
-                assert actual["x"].encoding["dtype"] is str
-                assert_identical(actual, expected)
-
-    def test_roundtrip_string_with_fill_value_vlen(self) -> None:
+    @pytest.mark.parametrize(
+        "input_strings, is_bytes",
+        [
+            ([b"foo", b"bar", b"baz"], True),
+            (["foo", "bar", "baz"], False),
+            (["foó", "bár", "baź"], False),
+        ],
+    )
+    def test_encoding_kwarg_vlen_string(
+        self, input_strings: list[str], is_bytes: bool
+    ) -> None:
+        original = Dataset({"x": input_strings})
+
+        expected_string = ["foo", "bar", "baz"] if is_bytes else input_strings
+        expected = Dataset({"x": expected_string})
+        kwargs = dict(encoding={"x": {"dtype": str}})
+        with self.roundtrip(original, save_kwargs=kwargs) as actual:
+            assert actual["x"].encoding["dtype"] == "<U3"
+            assert_identical(actual, expected)
+
+    @pytest.mark.parametrize("fill_value", [...])
+    def test_roundtrip_string_with_fill_value_vlen(self, fill_value: str) -> None:
         values = np.array(["ab", "cdef", np.nan], dtype=object)
         expected = Dataset({"x": ("t", values)})
 
-        # netCDF4-based backends don't support an explicit fillvalue
-        # for variable length strings yet.
-        # https://github.com/Unidata/netcdf4-python/issues/730
-        # https://github.com/h5netcdf/h5netcdf/issues/37
-        original = Dataset({"x": ("t", values, {}, {"_FillValue": "XXX"})})
-        with pytest.raises(NotImplementedError):
-            with self.roundtrip(original) as actual:
-                assert_identical(expected, actual)
+        original = Dataset({"x": ("t", values, {}, {"_FillValue": fill_value})})
+        with self.roundtrip(original) as actual:
+            assert_identical(expected, actual)
 
         original = Dataset({"x": ("t", values, {}, {"_FillValue": ""})})
-        with pytest.raises(NotImplementedError):
-            with self.roundtrip(original) as actual:
-                assert_identical(expected, actual)
+        with self.roundtrip(original) as actual:
+            assert_identical(expected, actual)
 
     def test_roundtrip_character_array(self) -> None:
         with create_tmp_file() as tmp_file:
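
Note (not part of the patch): a minimal usage sketch of what this change enables, assuming an xarray build containing it plus netCDF4 or h5netcdf >= 1.1 as the backend; the file name, variable name, and fill value below are illustrative, not taken from the PR.

    import numpy as np
    import xarray as xr

    # A variable-length string variable with a missing value.
    ds = xr.Dataset({"x": ("t", np.array(["ab", "cdef", np.nan], dtype=object))})

    # Before this patch, combining a vlen str dtype with a _FillValue raised
    # NotImplementedError in the netCDF4 and h5netcdf backends; now the fill
    # value is written to disk.
    ds.to_netcdf("strings.nc", encoding={"x": {"dtype": str, "_FillValue": "XXX"}})

    with xr.open_dataset("strings.nc") as actual:
        # On read, the fill value is masked back to NaN and the string data
        # round-trips, decoded through the new ObjectVLenStringCoder.
        print(actual["x"].values)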