Improved CF decoding #6812

Open · wants to merge 5 commits into main
84 changes: 74 additions & 10 deletions xarray/coding/variables.py
@@ -224,7 +224,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
    return data


-def _choose_float_dtype(dtype, has_offset):
+def _choose_float_dtype_encoding(dtype, has_offset):
Contributor Author: Encoding per the CF spec is fairly specific: the packed variable is supposed to be of type byte, short, or int, not float. Most of the tests encode with a scale_factor or add_offset that requires the packed data to be float. Rather than trying to solve all of this, I have split the encode and decode dtype functions.
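For reference, the packing relationship the CF conventions describe can be sketched as follows (a minimal NumPy sketch with made-up values, not code from this PR):

```python
import numpy as np

# Hypothetical unpacked data and packing attributes.
unpacked = np.array([0.0, 0.5, 1.0], dtype=np.float64)
scale_factor, add_offset = 0.01, 0.0

# Encoding: pack into a small integer type, per the CF conventions.
packed = ((unpacked - add_offset) / scale_factor).astype(np.int16)

# Decoding: unpack back to float.
assert np.allclose(packed * scale_factor + add_offset, unpacked)
```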

"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
@@ -236,13 +236,76 @@ def _choose_float_dtype(dtype, has_offset):
        # but a large integer offset could lead to loss of precision.
        # Sensitivity analysis can be tricky, so we just use a float64
        # if there's any offset at all - better unoptimised than wrong!
-        if not has_offset:
+        if has_offset:
+            return np.float64
+        else:
Contributor Author: Reverted to the original algorithm, as suggested by dcherian.

            return np.float32
    # For all other types and circumstances, we just use float64.
    # (safe because eg. complex numbers are not supported in NetCDF)
    return np.float64


def _choose_float_dtype_decoding(dtype, scale_factor, add_offset):
Contributor Author: I chose to focus on decoding per the CF specification, so I split the function. Decoding also makes heavy use of np.find_common_type to select the correct dtype for the final product.
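For illustration, np.find_common_type (since deprecated in NumPy 1.25 in favour of np.result_type) promotes the packed dtype against the attribute types (a standalone sketch, not code from this PR):

```python
import numpy as np

# int16 data with a float32 scale_factor promotes to float32...
assert np.find_common_type([np.dtype("int16"), np.float32], []) == np.float32
# ...while a Python float attribute (a double) promotes to float64.
assert np.find_common_type([np.dtype("int16"), float], []) == np.float64
```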

"""Return a dtype per CF convention standards."""

# From CF standard:
# If the scale_factor and add_offset attributes are of the same data type as the
# associated variable, the unpacked data is assumed to be of the same data type as
# the packed data.
if (dtype == type(scale_factor) if scale_factor is not None else dtype) and (
dtype == (type(add_offset) if add_offset is not None else dtype)
):
ret_dtype = dtype

    # If here, one or both of scale_factor and add_offset exist and do not match dtype.

    # From CF standard:
    # However, if the scale_factor and add_offset attributes are of a different data
    # type from the variable (containing the packed data) then the unpacked data should
    # match the type of these attributes, which must both be of type float or both be of
    # type double.

    # Check that scale_factor and add_offset are float or double and of the same type.
    if (scale_factor is not None) and (add_offset is not None):
        if not np.issubdtype(type(scale_factor), np.floating):
            print("Warning - non-compliant CF file: scale_factor not float or double")
        if not np.issubdtype(type(add_offset), np.floating):
            print("Warning - non-compliant CF file: add_offset not float or double")
        if type(scale_factor) != type(add_offset):
            print(
                "Warning - non-compliant CF file: scale_factor and add_offset are different types"
            )

    # From CF standard:
    # An additional restriction in this case is that the variable containing the packed
    # data must be of type byte, short or int. It is not advised to unpack an int into a
    # float as there is a potential precision loss.

    # Check that dtype is byte, short, or int.
    if not np.issubdtype(dtype, np.integer):
        print("Warning - non-compliant CF file: dtype is not byte, short, or int")

    if scale_factor is None:
        ret_dtype = np.find_common_type([dtype, type(add_offset)], [])
    elif add_offset is None:
        ret_dtype = np.find_common_type([dtype, type(scale_factor)], [])
    else:
        ret_dtype = np.find_common_type(
            [dtype, type(scale_factor), type(add_offset)], []
        )

    # Upcast half-precision to single-precision, because float16 is "intended for
    # storage but not computation"
    if ret_dtype.itemsize <= 4 and np.issubdtype(ret_dtype, np.floating):
        ret_dtype = np.float32
    return ret_dtype
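Assuming this branch is installed, the helper then selects dtypes along these lines (hypothetical values):

```python
import numpy as np
from xarray.coding.variables import _choose_float_dtype_decoding  # assumes this branch

# int16 packed data with float32 attributes decodes to float32,
assert _choose_float_dtype_decoding(np.dtype("int16"), np.float32(0.01), np.float32(0.0)) == np.float32
# while Python float (double) attributes force float64.
assert _choose_float_dtype_decoding(np.dtype("int16"), 0.01, 0.0) == np.float64
```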


class CFScaleOffsetCoder(VariableCoder):
"""Scale and offset variables according to CF conventions.

@@ -253,13 +316,14 @@ class CFScaleOffsetCoder(VariableCoder):
    def encode(self, variable, name=None):
        dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
data = data.astype(dtype=dtype, copy=True)
if "add_offset" in encoding:
data -= pop_to(encoding, attrs, "add_offset", name=name)
if "scale_factor" in encoding:
data /= pop_to(encoding, attrs, "scale_factor", name=name)
Contributor Author: The data type may change when applying the offset or scale, hence the change from `data /= ...` to `data = data / ...`.
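The NumPy behaviour behind this (a standalone sketch, not code from the PR):

```python
import numpy as np

data = np.arange(5, dtype=np.int16)
data = data / np.float64(10)  # fine: allocates a new float64 array

data = np.arange(5, dtype=np.int16)
try:
    data /= np.float64(10)    # in-place: cannot cast the float64 result back to int16
except TypeError as err:
    print(err)
```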

+        scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
+        add_offset = pop_to(encoding, attrs, "add_offset", name=name)
+        dtype = _choose_float_dtype_encoding(data.dtype, add_offset is not None)
+        data = data.astype(dtype=dtype, copy=True)
+        if add_offset is not None:
+            data = data - add_offset
+        if scale_factor is not None:
+            data = data / scale_factor

        return Variable(dims, data, attrs, encoding)

@@ -269,7 +333,7 @@ def decode(self, variable, name=None):
if "scale_factor" in attrs or "add_offset" in attrs:
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
dtype = _choose_float_dtype(data.dtype, "add_offset" in attrs)
dtype = _choose_float_dtype_decoding(data.dtype, scale_factor, add_offset)
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
15 changes: 8 additions & 7 deletions xarray/tests/test_coding.py
@@ -48,7 +48,6 @@ def test_CFMaskCoder_decode() -> None:
def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None:
    original = xr.Variable(("x",), data, encoding=encoding)
    encoded = encode_cf_variable(original)
-
    assert encoded.dtype == encoded.attrs["missing_value"].dtype
    assert encoded.dtype == encoded.attrs["_FillValue"].dtype

@@ -97,20 +96,22 @@ def test_coder_roundtrip() -> None:


@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split())
def test_scaling_converts_to_float32(dtype) -> None:
def test_scaling_converts_to_float64(dtype) -> None:
original = xr.Variable(
("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10)
)
coder = variables.CFScaleOffsetCoder()
encoded = coder.encode(original)
assert encoded.dtype == np.float32
if dtype in ["f2", "f4"]:
assert encoded.dtype == np.float32
if dtype in ["u1", "u2", "i1", "i2"]:
assert encoded.dtype == np.float64
roundtripped = coder.decode(encoded)
assert_identical(original, roundtripped)
assert roundtripped.dtype == np.float32
assert roundtripped.dtype == np.float64


@pytest.mark.parametrize("scale_factor", (10, [10]))
Contributor Author: I'm not sure that scale_factor or add_offset can be an array type per the CF spec, so I changed this test.

Contributor: These kinds of things tend to happen in the wild, though. Since we have a test for it, we should keep it around.
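For context, the decode path already normalises array-valued attributes to scalars, so keeping the test is cheap (a sketch of that existing logic with made-up values):

```python
import numpy as np

scale_factor = [10]  # some files store attributes as length-1 arrays (see #4631)
if np.ndim(scale_factor) > 0:
    scale_factor = np.asarray(scale_factor).item()
assert scale_factor == 10  # now a plain Python scalar
```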

@pytest.mark.parametrize("add_offset", (0.1, [0.1]))
@pytest.mark.parametrize("scale_factor", (10, 1))
@pytest.mark.parametrize("add_offset", (0.1, 1.0))
def test_scaling_offset_as_list(scale_factor, add_offset) -> None:
# test for #4631
encoding = dict(scale_factor=scale_factor, add_offset=add_offset)