implement scale_factor/add_offset CF conformance test, add and align tests
kmuehlbauer committed Apr 20, 2023
1 parent 68bae08 commit c30af87
Showing 3 changed files with 248 additions and 52 deletions.
102 changes: 102 additions & 0 deletions xarray/coding/variables.py
@@ -318,6 +318,97 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
    return np.float64


def _ensure_scale_offset_conformance(
    mapping: MutableMapping, strict: bool = False
) -> bool | None:
    """Check conformance of scale_factor and add_offset for CF encoding/decoding.

    scale_factor and/or add_offset as well as the packed dtype are taken from mapping.
    """
    conforms = True
    # https://cfconventions.org/cf-conventions/cf-conventions.html#packed-data
    scale_factor = mapping.get("scale_factor")
    if np.ndim(scale_factor) > 0:
        scale_factor = np.asarray(scale_factor).item()
    add_offset = mapping.get("add_offset")
    if np.ndim(add_offset) > 0:
        add_offset = np.asarray(add_offset).item()
    dtype = mapping.get("dtype")
    ptype = np.dtype(dtype) if dtype is not None else None

    # get the dtype(s) from scale_factor/add_offset
    scale_offset_dtype = list(
        {np.dtype(type(att)) for att in [scale_factor, add_offset] if att is not None}
    )

    # raise early, aligns with netcdf4-python
    if np.float16 in scale_offset_dtype:
        raise ValueError(
            f"scale_factor and/or add_offset dtype {scale_offset_dtype} mismatch: "
            "float16 is not allowed."
        )

    # no packing information available, do nothing
    if not scale_offset_dtype:
        return None

    # mandatory packed dtype information is missing
    if ptype is None:
        raise ValueError("Packed dtype information is missing!")

    if len(scale_offset_dtype) == 1:
        # OK, we have at least one of scale_factor or add_offset,
        # and if both are given, they are of the same dtype
        scale_offset_dtype = scale_offset_dtype[0]
        if scale_offset_dtype != ptype:
            if scale_offset_dtype not in [np.float32, np.float64]:
                msg = (
                    f"scale_factor and/or add_offset dtype {scale_offset_dtype} "
                    "mismatch: must be either float32 or float64 dtype."
                )
                if strict:
                    raise ValueError(msg)
                else:
                    warnings.warn(msg, SerializationWarning, stacklevel=3)
                conforms = False
            if np.issubdtype(ptype, np.integer) and ptype not in [
                np.int8,
                np.int16,
                np.int32,
            ]:
                msg = f"packed dtype {ptype} mismatch: must be of type byte, short or int."
                if strict:
                    raise ValueError(msg)
                else:
                    warnings.warn(msg, SerializationWarning, stacklevel=3)
                conforms = False
            if ptype == np.int32 and scale_offset_dtype == np.float32:
                warnings.warn(
                    "Trying to pack float32 into int32. This is not advised per "
                    "CF conventions because of potential precision loss!",
                    SerializationWarning,
                    stacklevel=3,
                )
    else:
        msg = (
            f"scale_factor dtype {np.dtype(type(scale_factor))} and add_offset dtype "
            f"{np.dtype(type(add_offset))} mismatch! Must be of same dtype."
        )
        if strict:
            raise ValueError(msg)
        else:
            warnings.warn(msg, SerializationWarning, stacklevel=3)
        conforms = False
    return conforms
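
For orientation, here is a minimal sketch of how the new check behaves when called directly; the mappings below are hypothetical, but the return values and errors follow the code above.

import numpy as np
from xarray.coding.variables import _ensure_scale_offset_conformance

# conforming: float32 packing attributes, packed into int16 -> True
ok = {"scale_factor": np.float32(0.1), "add_offset": np.float32(10), "dtype": "i2"}
assert _ensure_scale_offset_conformance(ok) is True

# no packing information at all -> None (nothing to check)
assert _ensure_scale_offset_conformance({}) is None

# a Python float maps to float64; mixing it with a float32 add_offset
# is non-conforming -> ValueError under strict=True
bad = {"scale_factor": 0.1, "add_offset": np.float32(10), "dtype": "i2"}
try:
    _ensure_scale_offset_conformance(bad, strict=True)
except ValueError as err:
    print(err)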


class CFScaleOffsetCoder(VariableCoder):
    """Scale and offset variables according to CF conventions.

@@ -329,6 +420,9 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        dims, data, attrs, encoding = unpack_for_encoding(variable)

        if "scale_factor" in encoding or "add_offset" in encoding:
            # strict checking, raise error on encoding
            # we do not want to write non-conforming data
            _ensure_scale_offset_conformance(encoding, strict=True)
            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
            data = data.astype(dtype=dtype, copy=True)
            if "add_offset" in encoding:
@@ -345,6 +439,14 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

        scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
        add_offset = pop_to(attrs, encoding, "add_offset", name=name)

        # for decoding we need the original dtype
        encoding.setdefault("dtype", data.dtype)

        # only warn on decoding
        # we try to decode non-conforming data
        _ensure_scale_offset_conformance(encoding, strict=False)

        dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
        if np.ndim(scale_factor) > 0:
            scale_factor = np.asarray(scale_factor).item()
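Taken together, the coder raises on encode but only warns on decode. A hedged end-to-end sketch of that asymmetry, assuming CFScaleOffsetCoder and the private check are importable as added above:

import numpy as np
from xarray import Variable
from xarray.coding.variables import CFScaleOffsetCoder

# encoding is strict: mixed float64/float32 packing attributes raise,
# so non-conforming data is never written
bad = Variable(
    ("t",),
    np.arange(3.0),
    encoding={"scale_factor": 0.1, "add_offset": np.float32(10), "dtype": "i2"},
)
try:
    CFScaleOffsetCoder().encode(bad)
except ValueError as err:
    print(err)

# decoding only warns (SerializationWarning), so non-conforming files
# can still be read
packed = Variable(
    ("t",),
    np.array([0, 1, 2], dtype=np.int16),
    attrs={"scale_factor": 0.1, "add_offset": np.float32(10)},
)
decoded = CFScaleOffsetCoder().decode(packed)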
91 changes: 59 additions & 32 deletions xarray/tests/test_backends.py
@@ -17,7 +17,7 @@
 from contextlib import ExitStack
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Final, cast
+from typing import TYPE_CHECKING, Any, Callable, Final, cast

 import numpy as np
 import pandas as pd
@@ -138,96 +138,110 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
     )


-def create_masked_and_scaled_data() -> Dataset:
-    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
+def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Dataset:
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
     encoding = {
         "_FillValue": -1,
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(10),
+        "scale_factor": dtype(0.1),
         "dtype": "i2",
     }
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_masked_and_scaled_data() -> Dataset:
-    attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
+def create_encoded_masked_and_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
+    attributes = {"_FillValue": -1, "add_offset": dtype(10), "scale_factor": dtype(0.1)}
     return Dataset(
         {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
     )
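
The two fixtures above are related by the CF unpacking formula unpacked = packed * scale_factor + add_offset, with _FillValue marking missing values. A quick check with the values used above:

import numpy as np

packed = np.array([-1, -1, 0, 1, 2], dtype=np.int16)
scale_factor, add_offset, fill_value = np.float32(0.1), np.float32(10), -1

# mask the fill value, then unpack the remaining integers
unpacked = np.where(packed == fill_value, np.nan, packed * scale_factor + add_offset)
print(unpacked)  # [ nan  nan 10.  10.1 10.2] (up to float32 rounding)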


-def create_unsigned_masked_scaled_data() -> Dataset:
+def create_unsigned_masked_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": "true",
         "dtype": "i1",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(10),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_encoded_unsigned_masked_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": "true",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(10),
+        "scale_factor": dtype(0.1),
     }
     # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})


-def create_bad_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_unsigned_masked_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": True,
         "dtype": "i1",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(0),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_encoded_unsigned_masked_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": True,
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(10),
+        "scale_factor": dtype(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})
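
The "signed data corresponding to [0, 1, 127, 128, 255] unsigned" comments refer to two's-complement reinterpretation: with _Unsigned set, the signed bytes written to disk are meant to be read back as unsigned. A byte-level view makes this explicit:

import numpy as np

# the signed bytes as written to the file...
sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
# ...reinterpreted as the unsigned values named in the comments above
print(sb.view("u1"))  # [  0   1 127 128 255]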


-def create_signed_masked_scaled_data() -> Dataset:
+def create_signed_masked_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
     encoding = {
         "_FillValue": -127,
         "_Unsigned": "false",
         "dtype": "i1",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(10),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
+    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_signed_masked_scaled_data() -> Dataset:
+def create_encoded_signed_masked_scaled_data(
+    dtype: type[np.number] = np.float32,
+) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -127,
         "_Unsigned": "false",
-        "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "add_offset": dtype(10),
+        "scale_factor": dtype(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -859,6 +873,8 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         with self.roundtrip(original) as actual:
             assert_identical(expected, actual)

+    # TODO: (kmuehlbauer) make this work for np.float64
+    @pytest.mark.parametrize("dtype", [np.float32])
     @pytest.mark.parametrize(
         "decoded_fn, encoded_fn",
         [
@@ -878,9 +894,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
             (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
         ],
     )
-    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
-        decoded = decoded_fn()
-        encoded = encoded_fn()
+    def test_roundtrip_mask_and_scale(
+        self,
+        decoded_fn: Callable[[type[np.number]], Dataset],
+        encoded_fn: Callable[[type[np.number]], Dataset],
+        dtype: type[np.number],
+    ) -> None:
+        if dtype == np.float32 and isinstance(
+            self, (TestZarrDirectoryStore, TestZarrDictStore)
+        ):
+            pytest.skip(
+                "zarr attributes (e.g. `scale_factor`) are unconditionally "
+                "promoted to `float64`"
+            )
+        decoded = decoded_fn(dtype)
+        encoded = encoded_fn(dtype)

         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:

@@ -901,7 +928,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:

         # make sure roundtrip encoding didn't change the
         # original dataset.
-        assert_allclose(encoded, encoded_fn(), decode_bytes=False)
+        assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)

         with self.roundtrip(encoded) as actual:
             for k in decoded.variables: