From b5dad00af8194e1b44df9965b8a6c8f4d45e666c Mon Sep 17 00:00:00 2001
From: Kai Mühlbauer
Date: Fri, 31 Mar 2023 10:34:30 +0200
Subject: [PATCH] determine cf packed data dtype

---
 xarray/coding/variables.py    | 27 +++++++++-----
 xarray/tests/test_backends.py | 67 ++++++++++++++++++++---------------
 2 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 57e850970be..94fe21633d6 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -232,9 +232,21 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
     return data
 
 
-def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
-    """Return a float dtype that can losslessly represent `dtype` values."""
-    # Keep float32 as-is. Upcast half-precision to single-precision,
+def _choose_float_dtype(
+    dtype: np.dtype, encoding: MutableMapping
+) -> type[np.floating[Any]]:
+    # check scale_factor/add_offset first to derive the wanted float dtype
+    if "scale_factor" in encoding or "add_offset" in encoding:
+        scale_factor = encoding.get("scale_factor", False)
+        add_offset = encoding.get("add_offset", False)
+        # minimal floating point size -> 4 bytes (float32)
+        maxsize = 4
+        if scale_factor and np.issubdtype(type(scale_factor), np.floating):
+            maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize)
+        if add_offset and np.issubdtype(type(add_offset), np.floating):
+            maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize)
+        return np.dtype(f"float{maxsize * 8}")
+    # Keep float32 as-is. Upcast half-precision to single-precision,
     # because float16 is "intended for storage but not computation"
     if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
         return np.float32
@@ -242,10 +254,7 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[A
     if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
         # A scale factor is entirely safe (vanishing into the mantissa),
         # but a large integer offset could lead to loss of precision.
-        # Sensitivity analysis can be tricky, so we just use a float64
-        # if there's any offset at all - better unoptimised than wrong!
-        if not has_offset:
-            return np.float32
+        return np.float32
     # For all other types and circumstances, we just use float64.
     # (safe because eg. complex numbers are not supported in NetCDF)
     return np.float64
 
 
@@ -281,7 +290,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         if "scale_factor" in encoding or "add_offset" in encoding:
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
+            dtype = _choose_float_dtype(data.dtype, encoding)
             data = data.astype(dtype=dtype, copy=True)
             if "add_offset" in encoding:
                 data -= pop_to(encoding, attrs, "add_offset", name=name)
@@ -297,7 +306,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
             scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
             add_offset = pop_to(attrs, encoding, "add_offset", name=name)
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
+            dtype = _choose_float_dtype(data.dtype, encoding)
             if np.ndim(scale_factor) > 0:
                 scale_factor = np.asarray(scale_factor).item()
             if np.ndim(add_offset) > 0:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 6f40a44c2e2..51f898f36ba 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -138,96 +138,96 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
     )
 
 
-def create_masked_and_scaled_data() -> Dataset:
-    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
+def create_masked_and_scaled_data(dtype=np.float32) -> Dataset:
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
     encoding = {
         "_FillValue": -1,
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
         "dtype": "i2",
     }
     return Dataset({"x": ("t", x, {}, encoding)})
 
 
-def create_encoded_masked_and_scaled_data() -> Dataset:
-    attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
+def create_encoded_masked_and_scaled_data(dtype=np.float32) -> Dataset:
+    attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)}
     return Dataset(
         {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
     )
 
 
-def create_unsigned_masked_scaled_data() -> Dataset:
+def create_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": "true",
         "dtype": "i1",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})
 
 
-def create_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": "true",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
     # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})
 
 
-def create_bad_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": True,
         "dtype": "i1",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})
 
 
-def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": True,
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})
 
 
-def create_signed_masked_scaled_data() -> Dataset:
+def create_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
     encoding = {
         "_FillValue": -127,
         "_Unsigned": "false",
         "dtype": "i1",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
+    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})
 
 
-def create_encoded_signed_masked_scaled_data() -> Dataset:
+def create_encoded_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -127,
         "_Unsigned": "false",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -857,6 +857,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         with self.roundtrip(original) as actual:
             assert_identical(expected, actual)
 
+    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
     @pytest.mark.parametrize(
         "decoded_fn, encoded_fn",
         [
@@ -876,12 +877,19 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
             (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
         ],
     )
-    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
-        decoded = decoded_fn()
-        encoded = encoded_fn()
+    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
+        if dtype == np.float32 and isinstance(
+            self, (TestZarrDirectoryStore, TestZarrDictStore)
+        ):
+            pytest.skip(
+                "zarr attributes (e.g. `scale_factor`) are unconditionally promoted to `float64`"
+            )
+        decoded = decoded_fn(dtype)
+        encoded = encoded_fn(dtype)
         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
+                print(k, decoded.variables[k].dtype)
                 assert decoded.variables[k].dtype == actual.variables[k].dtype
             assert_allclose(decoded, actual, decode_bytes=False)
 
@@ -899,7 +907,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
 
             # make sure roundtrip encoding didn't change the
            # original dataset.
-            assert_allclose(encoded, encoded_fn(), decode_bytes=False)
+            assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)
 
         with self.roundtrip(encoded) as actual:
             for k in decoded.variables:
@@ -1533,7 +1541,8 @@ def test_encoding_chunksizes_unlimited(self) -> None:
         with self.roundtrip(ds) as actual:
             assert_equal(ds, actual)
 
-    def test_mask_and_scale(self) -> None:
+    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
+    def test_mask_and_scale(self, dtype) -> None:
         with create_tmp_file() as tmp_file:
             with nc4.Dataset(tmp_file, mode="w") as nc:
                 nc.createDimension("t", 5)
@@ -1541,21 +1550,23 @@ def test_mask_and_scale(self) -> None:
                 v = nc.variables["x"]
                 v.set_auto_maskandscale(False)
                 v.add_offset = 10
-                v.scale_factor = 0.1
+                v.scale_factor = dtype(0.1)
                 v[:] = np.array([-1, -1, 0, 1, 2])
 
             # first make sure netCDF4 reads the masked and scaled data
             # correctly
             with nc4.Dataset(tmp_file, mode="r") as nc:
                 expected = np.ma.array(
-                    [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]
+                    [-1, -1, 10, 10.1, 10.2],
+                    mask=[True, True, False, False, False],
+                    dtype=dtype,
                 )
                 actual = nc.variables["x"][:]
                 assert_array_equal(expected, actual)
 
             # now check xarray
             with open_dataset(tmp_file) as ds:
-                expected = create_masked_and_scaled_data()
+                expected = create_masked_and_scaled_data(dtype)
                 assert_identical(expected, ds)
 
     def test_0dimensional_variable(self) -> None:
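
Note on the behaviour this patch introduces: `_choose_float_dtype` now inspects the
`scale_factor`/`add_offset` values in the encoding, so packed integer data with a
`float32` scale factor keeps `float32`, while a `float64` scale factor (or offset)
promotes the unpacked dtype to `float64`. A minimal sketch of that behaviour, calling
the private helper from this patch directly (an implementation detail that may change
between xarray versions):

    import numpy as np
    from xarray.coding.variables import _choose_float_dtype

    # int16 packed data with a float32 scale_factor: float32 is sufficient
    enc32 = {"scale_factor": np.float32(0.1), "add_offset": 10}
    print(_choose_float_dtype(np.dtype("int16"), enc32))
    # expected under this patch: float32

    # a float64 scale_factor promotes the unpacked dtype to float64
    enc64 = {"scale_factor": np.float64(0.1), "add_offset": 10}
    print(_choose_float_dtype(np.dtype("int16"), enc64))
    # expected under this patch: float64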