Skip to content

Commit

Permalink
preserve vlen string dtypes, allow vlen string fill_values (#7869)
Browse files Browse the repository at this point in the history
* preserve vlen string dtypes, allow vlen string fill_values

* update min-deps h5py->3.7, h5netcdf -> 1.1

* add doc/whats-new.rst entry

* add suggestions from review

* remove stale entry
  • Loading branch information
kmuehlbauer authored Nov 17, 2023
1 parent 22ca9ba commit 74901af
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 49 deletions.
4 changes: 2 additions & 2 deletions ci/requirements/min-all-deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ dependencies:
- dask-core=2022.7
- distributed=2022.7
- flox=0.5
- h5netcdf=1.0
- h5netcdf=1.1
# h5py and hdf5 tend to cause conflicts
# for e.g. hdf5 1.12 conflicts with h5py=3.1
# prioritize bumping other packages instead
- h5py=3.6
- h5py=3.7
- hdf5=1.12
- hypothesis
- iris=3.2
Expand Down
5 changes: 4 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,23 @@ New Features
By `Sam Levang <https://github.com/slevang>`_.
- Allow the usage of h5py drivers (eg: ros3) via h5netcdf (:pull:`8360`).
By `Ezequiel Cimadevilla <https://github.com/zequihg50>`_.
- Enable VLEN string fill_values, preserve VLEN string dtypes (:issue:`1647`, :issue:`7652`, :issue:`7868`, :pull:`7869`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Breaking changes
~~~~~~~~~~~~~~~~
- drop support for `cdms2 <https://github.com/CDAT/cdms>`_. Please use
`xcdat <https://github.com/xCDAT/xcdat>`_ instead (:pull:`8441`).
By `Justus Magin <https://github.com/keewis>`_.

- Following pandas, :py:meth:`infer_freq` will return ``"Y"``, ``"YS"``,
``"QE"``, ``"ME"``, ``"h"``, ``"min"``, ``"s"``, ``"ms"``, ``"us"``, or
``"ns"`` instead of ``"A"``, ``"AS"``, ``"Q"``, ``"M"``, ``"H"``, ``"T"``,
``"S"``, ``"L"``, ``"U"``, or ``"N"``. This is to be consistent with the
deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`). By
`Spencer Clark <https://github.com/spencerkclark>`_.
- Bump minimum tested pint version to ``>=0.22``. By `Deepak Cherian <https://github.com/dcherian>`_.
- Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``.
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Deprecations
~~~~~~~~~~~~
Expand Down
9 changes: 0 additions & 9 deletions xarray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,15 +271,6 @@ def prepare_variable(
dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)

fillvalue = attrs.pop("_FillValue", None)
if dtype is str and fillvalue is not None:
raise NotImplementedError(
"h5netcdf does not yet support setting a fill value for "
"variable-length strings "
"(https://github.com/h5netcdf/h5netcdf/issues/37). "
f"Either remove '_FillValue' from encoding on variable {name!r} "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
"NC_CHAR type."
)

if dtype is str:
dtype = h5py.special_dtype(vlen=str)
Expand Down
10 changes: 0 additions & 10 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,16 +490,6 @@ def prepare_variable(

fill_value = attrs.pop("_FillValue", None)

if datatype is str and fill_value is not None:
raise NotImplementedError(
"netCDF4 does not yet support setting a fill value for "
"variable-length strings "
"(https://github.com/Unidata/netcdf4-python/issues/730). "
f"Either remove '_FillValue' from encoding on variable {name!r} "
"or set {'dtype': 'S1'} in encoding to use the fixed width "
"NC_CHAR type."
)

encoding = _extract_nc4_variable_encoding(
variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims
)
Expand Down
12 changes: 12 additions & 0 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,3 +562,15 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:

def decode(self):
raise NotImplementedError()


class ObjectVLenStringCoder(VariableCoder):
def encode(self):
return NotImplementedError

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if variable.dtype == object and variable.encoding.get("dtype", False) == str:
variable = variable.astype(variable.encoding["dtype"])
return variable
else:
return variable
4 changes: 4 additions & 0 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,10 @@ def decode_cf_variable(
var = strings.CharacterArrayCoder().decode(var, name=name)
var = strings.EncodedStringCoder().decode(var)

if original_dtype == object:
var = variables.ObjectVLenStringCoder().decode(var)
original_dtype = var.dtype

if mask_and_scale:
for coder in [
variables.UnsignedIntegerCoder(),
Expand Down
62 changes: 35 additions & 27 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -864,12 +864,13 @@ def test_roundtrip_empty_vlen_string_array(self) -> None:
assert check_vlen_dtype(original["a"].dtype) == str
with self.roundtrip(original) as actual:
assert_identical(original, actual)
assert object == actual["a"].dtype
assert actual["a"].dtype == original["a"].dtype
# only check metadata for capable backends
# eg. NETCDF3 based backends do not roundtrip metadata
if actual["a"].dtype.metadata is not None:
assert check_vlen_dtype(actual["a"].dtype) == str
if np.issubdtype(actual["a"].dtype, object):
# only check metadata for capable backends
# eg. NETCDF3 based backends do not roundtrip metadata
if actual["a"].dtype.metadata is not None:
assert check_vlen_dtype(actual["a"].dtype) == str
else:
assert actual["a"].dtype == np.dtype("<U1")

@pytest.mark.parametrize(
"decoded_fn, encoded_fn",
Expand Down Expand Up @@ -1374,32 +1375,39 @@ def test_write_groups(self) -> None:
with self.open(tmp_file, group="data/2") as actual2:
assert_identical(data2, actual2)

def test_encoding_kwarg_vlen_string(self) -> None:
for input_strings in [[b"foo", b"bar", b"baz"], ["foo", "bar", "baz"]]:
original = Dataset({"x": input_strings})
expected = Dataset({"x": ["foo", "bar", "baz"]})
kwargs = dict(encoding={"x": {"dtype": str}})
with self.roundtrip(original, save_kwargs=kwargs) as actual:
assert actual["x"].encoding["dtype"] is str
assert_identical(actual, expected)

def test_roundtrip_string_with_fill_value_vlen(self) -> None:
@pytest.mark.parametrize(
"input_strings, is_bytes",
[
([b"foo", b"bar", b"baz"], True),
(["foo", "bar", "baz"], False),
(["foó", "bár", "baź"], False),
],
)
def test_encoding_kwarg_vlen_string(
self, input_strings: list[str], is_bytes: bool
) -> None:
original = Dataset({"x": input_strings})

expected_string = ["foo", "bar", "baz"] if is_bytes else input_strings
expected = Dataset({"x": expected_string})
kwargs = dict(encoding={"x": {"dtype": str}})
with self.roundtrip(original, save_kwargs=kwargs) as actual:
assert actual["x"].encoding["dtype"] == "<U3"
assert actual["x"].dtype == "<U3"
assert_identical(actual, expected)

@pytest.mark.parametrize("fill_value", ["XXX", "", "bár"])
def test_roundtrip_string_with_fill_value_vlen(self, fill_value: str) -> None:
values = np.array(["ab", "cdef", np.nan], dtype=object)
expected = Dataset({"x": ("t", values)})

# netCDF4-based backends don't support an explicit fillvalue
# for variable length strings yet.
# https://github.com/Unidata/netcdf4-python/issues/730
# https://github.com/h5netcdf/h5netcdf/issues/37
original = Dataset({"x": ("t", values, {}, {"_FillValue": "XXX"})})
with pytest.raises(NotImplementedError):
with self.roundtrip(original) as actual:
assert_identical(expected, actual)
original = Dataset({"x": ("t", values, {}, {"_FillValue": fill_value})})
with self.roundtrip(original) as actual:
assert_identical(expected, actual)

original = Dataset({"x": ("t", values, {}, {"_FillValue": ""})})
with pytest.raises(NotImplementedError):
with self.roundtrip(original) as actual:
assert_identical(expected, actual)
with self.roundtrip(original) as actual:
assert_identical(expected, actual)

def test_roundtrip_character_array(self) -> None:
with create_tmp_file() as tmp_file:
Expand Down

0 comments on commit 74901af

Please sign in to comment.