numpy 2 compatibility in the netcdf4 and h5netcdf backends #9136

Merged 20 commits on Jul 11, 2024
5 changes: 2 additions & 3 deletions ci/install-upstream-wheels.sh
@@ -13,7 +13,7 @@ $conda remove -y numba numbagg sparse
 # temporarily remove numexpr
 $conda remove -y numexpr
 # temporarily remove backends
-$conda remove -y cf_units hdf5 h5py netcdf4 pydap
+$conda remove -y cf_units pydap
 # forcibly remove packages to avoid artifacts
 $conda remove -y --force \
   numpy \
@@ -37,8 +37,7 @@ python -m pip install \
   numpy \
   scipy \
   matplotlib \
-  pandas \
-  h5py
+  pandas
 # for some reason pandas depends on pyarrow already.
 # Remove once a `pyarrow` version compiled with `numpy>=2.0` is on `conda-forge`
 python -m pip install \
2 changes: 1 addition & 1 deletion ci/requirements/all-but-dask.yml
@@ -22,7 +22,7 @@ dependencies:
   - netcdf4
   - numba
   - numbagg
-  - numpy<2
+  - numpy
   - packaging
   - pandas
   - pint>=0.22
2 changes: 1 addition & 1 deletion ci/requirements/environment-windows.yml
@@ -23,7 +23,7 @@ dependencies:
   - netcdf4
   - numba
   - numbagg
-  - numpy<2
+  - numpy
   - packaging
   - pandas
   # - pint>=0.22
2 changes: 1 addition & 1 deletion ci/requirements/environment.yml
@@ -26,7 +26,7 @@ dependencies:
   - numba
   - numbagg
   - numexpr
-  - numpy<2
+  - numpy
   - opt_einsum
   - packaging
   - pandas
4 changes: 3 additions & 1 deletion doc/whats-new.rst
@@ -45,6 +45,8 @@ Bug fixes
   By `Pontus Lurcock <https://github.com/pont-us>`_.
 - Allow diffing objects with array attributes on variables (:issue:`9153`, :pull:`9169`).
   By `Justus Magin <https://github.com/keewis>`_.
+- ``numpy>=2`` compatibility in the ``netcdf4`` backend (:pull:`9136`).
+  By `Justus Magin <https://github.com/keewis>`_ and `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 - Promote floating-point numeric datetimes before decoding (:issue:`9179`, :pull:`9182`).
   By `Justus Magin <https://github.com/keewis>`_.
 - Fix static typing of tolerance arguments by allowing `str` type (:issue:`8892`, :pull:`9194`).
@@ -61,7 +63,7 @@ Documentation
 - Adds a flow-chart diagram to help users navigate help resources (`Discussion #8990 <https://github.com/pydata/xarray/discussions/8990>`_).
   By `Jessica Scheick <https://github.com/jessicas11>`_.
 - Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`)
-  By `Maximilian Roos <https://github.com/max-sixty>`_
+  By `Maximilian Roos <https://github.com/max-sixty>`_.
 
 
 Internal Changes
16 changes: 10 additions & 6 deletions xarray/coding/variables.py
@@ -516,10 +516,13 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         pop_to(encoding, attrs, "_Unsigned")
-        signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
+        # we need the on-disk type here
+        # try to get it from encoding, falling back to an int with the same precision as data.dtype
+        signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
         if "_FillValue" in attrs:
-            new_fill = signed_dtype.type(attrs["_FillValue"])
-            attrs["_FillValue"] = new_fill
+            new_fill = np.array(attrs["_FillValue"])
+            # use view here to prevent OverflowError
+            attrs["_FillValue"] = new_fill.view(signed_dtype).item()
         data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
 
         return Variable(dims, data, attrs, encoding, fastpath=True)
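For context (an editor's sketch, not part of the diff): under NumPy 2 and NEP 50, constructing a narrow integer scalar from an out-of-range Python integer raises `OverflowError`, whereas a same-itemsize `view` reinterprets the bytes without any range check. A minimal demonstration, assuming `numpy>=2`:

import numpy as np

# The old pattern, signed_dtype.type(value), range-checks the value and
# raises under NumPy 2 when the value does not fit the target dtype:
try:
    np.int8(255)
except OverflowError as err:
    print(err)  # Python integer 255 out of bounds for int8

# The new pattern keeps the value's own dtype and reinterprets the bytes
# with a same-itemsize view: 255 (uint8) becomes -1 (int8) losslessly.
fill = np.uint8(255)
signed_fill = np.array(fill).view(np.int8).item()
print(signed_fill)  # -1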
@@ -535,10 +538,11 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
                 if unsigned == "true":
                     unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
                     transform = partial(np.asarray, dtype=unsigned_dtype)
-                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
                     if "_FillValue" in attrs:
-                        new_fill = unsigned_dtype.type(attrs["_FillValue"])
-                        attrs["_FillValue"] = new_fill
+                        new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
+                        # use view here to prevent OverflowError
+                        attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
+                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
                 elif data.dtype.kind == "u":
                     if unsigned == "false":
                         signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
33 changes: 31 additions & 2 deletions xarray/tests/test_backends.py
@@ -166,7 +166,7 @@
 
 def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
     encoding = {
-        "_FillValue": 255,
+        "_FillValue": np.int8(-1),
         "_Unsigned": "true",
         "dtype": "i1",
         "add_offset": dtype.type(10),
@@ -925,6 +925,35 @@
                 assert decoded.variables[k].dtype == actual.variables[k].dtype
             assert_allclose(decoded, actual, decode_bytes=False)
 
+    @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255)])
+    def test_roundtrip_unsigned(self, fillvalue):
+        # regression/numpy2 test for
+        encoding = {
+            "_FillValue": fillvalue,
+            "_Unsigned": "true",
+            "dtype": "i1",
+        }
+        x = np.array([0, 1, 127, 128, 254, np.nan], dtype=np.float32)
+        decoded = Dataset({"x": ("t", x, {}, encoding)})
+
+        attributes = {
+            "_FillValue": fillvalue,
+            "_Unsigned": "true",
+        }
+        # create signed data corresponding to [0, 1, 127, 128, 254, 255] unsigned
+        sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1")
+        encoded = Dataset({"x": ("t", sb, attributes)})
+
+        with self.roundtrip(decoded) as actual:
+            for k in decoded.variables:
+                assert decoded.variables[k].dtype == actual.variables[k].dtype
+            assert_allclose(decoded, actual, decode_bytes=False)
+
+        with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
+            for k in encoded.variables:
+                assert encoded.variables[k].dtype == actual.variables[k].dtype
+            assert_allclose(encoded, actual, decode_bytes=False)
+
     @staticmethod
     def _create_cf_dataset():
         original = Dataset(
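An editor's aside on the test data above (not part of the diff): netCDF-3 has no unsigned integer types, so under the `_Unsigned = "true"` convention unsigned bytes are stored signed on disk and reinterpreted on decode. The signed bytes in `sb` map onto the unsigned values the test expects:

import numpy as np

sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1")  # on-disk (signed) bytes
print(sb.view("u1"))  # [  0   1 127 128 254 255]; 255 is the fill value,
                      # which decodes to the np.nan in the float32 `x` above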
@@ -4285,7 +4314,7 @@
     def test_roundtrip_numpy_datetime_data(self) -> None:
         # Override method in DatasetIOBase - remove not applicable
         # save_kwargs
-        times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
+        times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"], unit="ns")
         expected = Dataset({"t": ("t", times), "t0": times[0]})
         with self.roundtrip(expected) as actual:
             assert_identical(expected, actual)
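An editor's note on the explicit unit (motivation inferred, not stated in the diff): pandas >= 2 supports non-nanosecond datetime64 resolutions, so pinning the unit keeps the roundtrip comparing `datetime64[ns]` against `datetime64[ns]`. A small sketch of inspecting resolutions with the pandas 2 API:

import pandas as pd

times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
print(times.dtype)               # datetime64[ns], pandas' default for strings
print(times.as_unit("s").dtype)  # datetime64[s]; other resolutions exist in
                                 # pandas >= 2, hence the explicit unit above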
@@ -4688,7 +4717,7 @@
             with open_dataset(tmp, chunks=chunks) as dask_ds:
                 assert_identical(data, dask_ds)
                 with create_tmp_file() as tmp2:
                     dask_ds.to_netcdf(tmp2)
                     with open_dataset(tmp2) as on_disk:
                         assert_identical(data, on_disk)
 