Commit b5dad00: determine cf packed data dtype
kmuehlbauer committed Mar 31, 2023 · 1 parent 2b63d92
Showing 2 changed files with 57 additions and 37 deletions.
27 changes: 18 additions & 9 deletions xarray/coding/variables.py
@@ -232,20 +232,29 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
     return data


-def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
-    """Return a float dtype that can losslessly represent `dtype` values."""
-    # Keep float32 as-is. Upcast half-precision to single-precision,
+def _choose_float_dtype(
+    dtype: np.dtype, encoding: MutableMapping
+) -> type[np.floating[Any]]:
+    # check scale/offset first to derive the wanted dtype
+    if "scale_factor" in encoding or "add_offset" in encoding:
+        scale_factor = encoding.get("scale_factor", False)
+        add_offset = encoding.get("add_offset", False)
+        # minimal floating point size -> 4 byte
+        maxsize = 4
+        if scale_factor and np.issubdtype(type(scale_factor), np.floating):
+            maxsize = max(maxsize, np.dtype(type(scale_factor)).itemsize)
+        if add_offset and np.issubdtype(type(add_offset), np.floating):
+            maxsize = max(maxsize, np.dtype(type(add_offset)).itemsize)
+        return np.dtype(f"float{maxsize * 8}")
+    # Keep float32 as-is. Upcast half-precision to single-precision,
     # because float16 is "intended for storage but not computation"
     if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
         return np.float32
     # float32 can exactly represent all integers up to 24 bits
     if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
-        # A scale factor is entirely safe (vanishing into the mantissa),
-        # but a large integer offset could lead to loss of precision.
-        # Sensitivity analysis can be tricky, so we just use a float64
-        # if there's any offset at all - better unoptimised than wrong!
-        if not has_offset:
-            return np.float32
+        return np.float32
    # For all other types and circumstances, we just use float64.
    # (safe because eg. complex numbers are not supported in NetCDF)
    return np.float64
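Note (illustration, not part of the diff): the new selection rule is "at least float32, widened to the itemsize of any floating scale_factor/add_offset". A minimal standalone sketch of that rule; choose_float_dtype_sketch is a hypothetical stand-in, not the xarray API:

import numpy as np

def choose_float_dtype_sketch(dtype, encoding):
    # widen to the itemsize of any floating scale_factor/add_offset, min 4 bytes
    if "scale_factor" in encoding or "add_offset" in encoding:
        maxsize = 4
        for key in ("scale_factor", "add_offset"):
            value = encoding.get(key, False)
            if value and np.issubdtype(type(value), np.floating):
                maxsize = max(maxsize, np.dtype(type(value)).itemsize)
        return np.dtype(f"float{maxsize * 8}")
    return np.float64

# a float32 scale_factor keeps the unpacked data at float32 ...
assert choose_float_dtype_sketch(np.dtype("i2"), {"scale_factor": np.float32(0.1)}) == np.float32
# ... while a Python float (8 bytes) promotes it to float64
assert choose_float_dtype_sketch(np.dtype("i2"), {"scale_factor": 0.1}) == np.float64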
@@ -281,7 +290,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
         dims, data, attrs, encoding = unpack_for_encoding(variable)

         if "scale_factor" in encoding or "add_offset" in encoding:
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
+            dtype = _choose_float_dtype(data.dtype, encoding)
             data = data.astype(dtype=dtype, copy=True)
             if "add_offset" in encoding:
                 data -= pop_to(encoding, attrs, "add_offset", name=name)
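A quick sanity check of the encode arithmetic (illustration only, using the scale_factor=np.float32(0.1), add_offset=10 values from the tests below; xarray leaves the final int16 cast to the backend, shown here with np.round):

import numpy as np

data = np.array([10.0, 10.1, 10.2], dtype=np.float32)
packed = data.astype(np.float32, copy=True)  # dtype chosen by _choose_float_dtype
packed -= 10                                 # add_offset
packed /= np.float32(0.1)                    # scale_factor
print(np.round(packed).astype("i2"))         # [0 1 2], written to disk as int16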
@@ -297,7 +306,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

             scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
             add_offset = pop_to(attrs, encoding, "add_offset", name=name)
-            dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
+            dtype = _choose_float_dtype(data.dtype, encoding)
             if np.ndim(scale_factor) > 0:
                 scale_factor = np.asarray(scale_factor).item()
             if np.ndim(add_offset) > 0:
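And the reverse on decode: the dtype is now derived from the stored scale/offset attributes, so a float32 scale_factor yields a float32 result instead of the previous unconditional float64. An illustrative sketch:

import numpy as np

packed = np.array([0, 1, 2], dtype=np.int16)
decoded = packed.astype(np.float32) * np.float32(0.1) + 10
print(decoded.dtype)  # float32 -- before this change, decoding produced float64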
67 changes: 39 additions & 28 deletions xarray/tests/test_backends.py
@@ -138,96 +138,96 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset:
     )


-def create_masked_and_scaled_data() -> Dataset:
-    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
+def create_masked_and_scaled_data(dtype=np.float32) -> Dataset:
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype)
     encoding = {
         "_FillValue": -1,
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
         "dtype": "i2",
     }
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_masked_and_scaled_data() -> Dataset:
-    attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)}
+def create_encoded_masked_and_scaled_data(dtype=np.float32) -> Dataset:
+    attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": dtype(0.1)}
     return Dataset(
         {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)}
     )
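The two fixtures above are mirror images: unpacking the encoded int16 values with the stored scale/offset reproduces the decoded dataset. A quick illustration of the new dtype parameter (assuming the helper above is in scope, e.g. imported from this test module):

import numpy as np

ds = create_masked_and_scaled_data(np.float64)
assert ds["x"].dtype == np.float64
assert type(ds["x"].encoding["scale_factor"]) is np.float64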


-def create_unsigned_masked_scaled_data() -> Dataset:
+def create_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": "true",
         "dtype": "i1",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": "true",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
     # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})
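Why the encoded fixture stores [0, 1, 127, -128, -1]: classic NetCDF has no unsigned byte type, so with _Unsigned="true" the signed bytes are reinterpreted as unsigned on read. A hedged sketch of that reinterpretation and the resulting decoded values:

import numpy as np

sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
ub = sb.view("u1")                         # [0, 1, 127, 128, 255]
valid = ub != 255                          # 255 is the unsigned _FillValue
decoded = np.where(valid, ub * np.float32(0.1) + 10, np.nan)
print(decoded)                             # [10.0, 10.1, 22.7, 22.8, nan]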


-def create_bad_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     encoding = {
         "_FillValue": 255,
         "_Unsigned": True,
         "dtype": "i1",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_encoded_unsigned_masked_scaled_data(dtype=np.float32) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -1,
         "_Unsigned": True,
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
     return Dataset({"x": ("t", sb, attributes)})


-def create_signed_masked_scaled_data() -> Dataset:
+def create_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
     encoding = {
         "_FillValue": -127,
         "_Unsigned": "false",
         "dtype": "i1",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
-    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
+    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
     return Dataset({"x": ("t", x, {}, encoding)})


-def create_encoded_signed_masked_scaled_data() -> Dataset:
+def create_encoded_signed_masked_scaled_data(dtype=np.float32) -> Dataset:
     # These are values as written to the file: the _FillValue will
     # be represented in the signed form.
     attributes = {
         "_FillValue": -127,
         "_Unsigned": "false",
         "add_offset": 10,
-        "scale_factor": np.float32(0.1),
+        "scale_factor": dtype(0.1),
     }
     # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
     sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -857,6 +857,7 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
         with self.roundtrip(original) as actual:
             assert_identical(expected, actual)

+    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
     @pytest.mark.parametrize(
         "decoded_fn, encoded_fn",
         [
@@ -876,12 +877,19 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
             (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
         ],
     )
-    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
-        decoded = decoded_fn()
-        encoded = encoded_fn()
+    def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
+        if dtype == np.float32 and isinstance(
+            self, (TestZarrDirectoryStore, TestZarrDictStore)
+        ):
+            pytest.skip(
+                "zarr attributes (e.g. `scale_factor`) are unconditionally promoted to `float64`"
+            )
+        decoded = decoded_fn(dtype)
+        encoded = encoded_fn(dtype)

         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
+                print(k, decoded.variables[k].dtype)
                 assert decoded.variables[k].dtype == actual.variables[k].dtype
             assert_allclose(decoded, actual, decode_bytes=False)

@@ -899,7 +907,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:

         # make sure roundtrip encoding didn't change the
         # original dataset.
-        assert_allclose(encoded, encoded_fn(), decode_bytes=False)
+        assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False)

         with self.roundtrip(encoded) as actual:
             for k in decoded.variables:
@@ -1533,29 +1541,32 @@ def test_encoding_chunksizes_unlimited(self) -> None:
         with self.roundtrip(ds) as actual:
             assert_equal(ds, actual)

-    def test_mask_and_scale(self) -> None:
+    @pytest.mark.parametrize("dtype", [np.float32, np.float64])
+    def test_mask_and_scale(self, dtype) -> None:
         with create_tmp_file() as tmp_file:
             with nc4.Dataset(tmp_file, mode="w") as nc:
                 nc.createDimension("t", 5)
                 nc.createVariable("x", "int16", ("t",), fill_value=-1)
                 v = nc.variables["x"]
                 v.set_auto_maskandscale(False)
                 v.add_offset = 10
-                v.scale_factor = 0.1
+                v.scale_factor = dtype(0.1)
                 v[:] = np.array([-1, -1, 0, 1, 2])

             # first make sure netCDF4 reads the masked and scaled data
             # correctly
             with nc4.Dataset(tmp_file, mode="r") as nc:
                 expected = np.ma.array(
-                    [-1, -1, 10, 10.1, 10.2], mask=[True, True, False, False, False]
+                    [-1, -1, 10, 10.1, 10.2],
+                    mask=[True, True, False, False, False],
+                    dtype=dtype,
                 )
                 actual = nc.variables["x"][:]
                 assert_array_equal(expected, actual)

             # now check xarray
             with open_dataset(tmp_file) as ds:
-                expected = create_masked_and_scaled_data()
+                expected = create_masked_and_scaled_data(dtype)
                 assert_identical(expected, ds)

     def test_0dimensional_variable(self) -> None:
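The expectation in test_mask_and_scale can be reproduced by hand; an illustrative sketch of the masking and scaling that netCDF4 applies on read (np.ma.masked_equal stands in for its fill-value handling, dtype=np.float32 shown):

import numpy as np

raw = np.array([-1, -1, 0, 1, 2], dtype=np.int16)
dtype = np.float32
masked = np.ma.masked_equal(raw, -1)               # _FillValue == -1
expected = masked.astype(dtype) * dtype(0.1) + 10  # scale, then offset
print(expected)  # [--, --, 10.0, 10.1, 10.2], dtype float32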
