-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improved CF decoding #6812
base: main
Are you sure you want to change the base?
Improved CF decoding #6812
Changes from all commits
2a5686c
108586e
4eedd29
312acda
4615720
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -224,7 +224,7 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype): | |
return data | ||
|
||
|
||
def _choose_float_dtype(dtype, has_offset): | ||
def _choose_float_dtype_encoding(dtype, has_offset): | ||
"""Return a float dtype that can losslessly represent `dtype` values.""" | ||
# Keep float32 as-is. Upcast half-precision to single-precision, | ||
# because float16 is "intended for storage but not computation" | ||
|
@@ -236,13 +236,76 @@ def _choose_float_dtype(dtype, has_offset): | |
# but a large integer offset could lead to loss of precision. | ||
# Sensitivity analysis can be tricky, so we just use a float64 | ||
# if there's any offset at all - better unoptimised than wrong! | ||
if not has_offset: | ||
if has_offset: | ||
return np.float64 | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reverted to original algorithm as per suggestion from dcherian. |
||
return np.float32 | ||
# For all other types and circumstances, we just use float64. | ||
# (safe because eg. complex numbers are not supported in NetCDF) | ||
return np.float64 | ||
|
||
|
||
def _choose_float_dtype_decoding(dtype, scale_factor, add_offset): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I chose to focus on decoding per the CF specification, so I split the function. Furthermore, decoding makes heavy use of |
||
"""Return a dtype per CF convention standards.""" | ||
|
||
# From CF standard: | ||
# If the scale_factor and add_offset attributes are of the same data type as the | ||
# associated variable, the unpacked data is assumed to be of the same data type as | ||
# the packed data. | ||
if (dtype == (type(scale_factor) if scale_factor is not None else dtype)) and ( | ||
dtype == (type(add_offset) if add_offset is not None else dtype) | ||
): | ||
ret_dtype = dtype | ||
|
||
# If here, one or both of scale_factor and add_offset exist, and do not match dtype | ||
|
||
# From CF standard: | ||
# However, if the scale_factor and add_offset attributes are of a different data | ||
# type from the variable (containing the packed data) then the unpacked data should | ||
# match the type of these attributes, which must both be of type float or both be of | ||
# type double. | ||
|
||
# Check that scale_factor and add_offset are float or double and the same. | ||
if (scale_factor is not None) and (add_offset is not None): | ||
if not np.issubdtype(type(scale_factor), np.double): | ||
print("Warning - non-compliant CF file: scale_factor not float or double") | ||
if not np.issubdtype(type(add_offset), np.double): | ||
print("Warning - non-compliant CF file: add_offset not float or double") | ||
if type(scale_factor) != type(add_offset): | ||
print( | ||
"Warning - non-compliant CF file: scale_factor and add_offset are different type" | ||
) | ||
|
||
# From CF standard: | ||
# An additional restriction in this case is that the variable containing the packed | ||
# data must be of type byte, short or int. It is not advised to unpack an int into a | ||
# float as there is a potential precision loss. | ||
|
||
# Check that dtype is byte, short, or int. | ||
if not np.issubdtype(dtype, np.integer): | ||
print("Warning - non-compliant CF file: dtype is not byte, short, or int") | ||
|
||
if scale_factor is None: | ||
ret_dtype = np.find_common_type([dtype, type(add_offset)], []) | ||
elif add_offset is None: | ||
ret_dtype = np.find_common_type([dtype, type(scale_factor)], []) | ||
else: | ||
ret_dtype = np.find_common_type( | ||
[ | ||
dtype, | ||
type(scale_factor) if scale_factor is not None else None, | ||
type(add_offset) if add_offset is not None else None, | ||
], | ||
[], | ||
) | ||
|
||
# Upcast half-precision to single-precision, because float16 is "intended for | ||
# storage but not computation" | ||
if ret_dtype.itemsize <= 4 and np.issubdtype(ret_dtype, np.floating): | ||
ret_dtype = np.float32 | ||
return ret_dtype | ||
|
||
|
||
class CFScaleOffsetCoder(VariableCoder): | ||
"""Scale and offset variables according to CF conventions. | ||
|
||
|
@@ -253,13 +316,14 @@ class CFScaleOffsetCoder(VariableCoder): | |
def encode(self, variable, name=None): | ||
dims, data, attrs, encoding = unpack_for_encoding(variable) | ||
|
||
if "scale_factor" in encoding or "add_offset" in encoding: | ||
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) | ||
data = data.astype(dtype=dtype, copy=True) | ||
if "add_offset" in encoding: | ||
data -= pop_to(encoding, attrs, "add_offset", name=name) | ||
if "scale_factor" in encoding: | ||
data /= pop_to(encoding, attrs, "scale_factor", name=name) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The data type may change when adding or scaling, hence changing from |
||
scale_factor = pop_to(encoding, attrs, "scale_factor", name=name) | ||
add_offset = pop_to(encoding, attrs, "add_offset", name=name) | ||
dtype = _choose_float_dtype_encoding(data.dtype, add_offset in attrs) | ||
data = data.astype(dtype=dtype, copy=True) | ||
if add_offset is not None: | ||
data = data - add_offset | ||
if scale_factor is not None: | ||
data = data / scale_factor | ||
|
||
return Variable(dims, data, attrs, encoding) | ||
|
||
|
@@ -269,7 +333,7 @@ def decode(self, variable, name=None): | |
if "scale_factor" in attrs or "add_offset" in attrs: | ||
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) | ||
add_offset = pop_to(attrs, encoding, "add_offset", name=name) | ||
dtype = _choose_float_dtype(data.dtype, "add_offset" in attrs) | ||
dtype = _choose_float_dtype_decoding(data.dtype, scale_factor, add_offset) | ||
if np.ndim(scale_factor) > 0: | ||
scale_factor = np.asarray(scale_factor).item() | ||
if np.ndim(add_offset) > 0: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,7 +48,6 @@ def test_CFMaskCoder_decode() -> None: | |
def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None: | ||
original = xr.Variable(("x",), data, encoding=encoding) | ||
encoded = encode_cf_variable(original) | ||
|
||
assert encoded.dtype == encoded.attrs["missing_value"].dtype | ||
assert encoded.dtype == encoded.attrs["_FillValue"].dtype | ||
|
||
|
@@ -97,20 +96,22 @@ def test_coder_roundtrip() -> None: | |
|
||
|
||
@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) | ||
def test_scaling_converts_to_float32(dtype) -> None: | ||
def test_scaling_converts_to_float64(dtype) -> None: | ||
original = xr.Variable( | ||
("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) | ||
) | ||
coder = variables.CFScaleOffsetCoder() | ||
encoded = coder.encode(original) | ||
assert encoded.dtype == np.float32 | ||
if dtype in ["f2", "f4"]: | ||
assert encoded.dtype == np.float32 | ||
if dtype in ["u1", "u2", "i1", "i2"]: | ||
assert encoded.dtype == np.float64 | ||
roundtripped = coder.decode(encoded) | ||
assert_identical(original, roundtripped) | ||
assert roundtripped.dtype == np.float32 | ||
assert roundtripped.dtype == np.float64 | ||
|
||
|
||
@pytest.mark.parametrize("scale_factor", (10, [10])) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure that There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These kinds of things tend to happen though. Since we have tested for it, we should just keep it around. |
||
@pytest.mark.parametrize("add_offset", (0.1, [0.1])) | ||
@pytest.mark.parametrize("scale_factor", (10, 1)) | ||
@pytest.mark.parametrize("add_offset", (0.1, 1.0)) | ||
def test_scaling_offset_as_list(scale_factor, add_offset) -> None: | ||
# test for #4631 | ||
encoding = dict(scale_factor=scale_factor, add_offset=add_offset) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Encoding per the CF spec is fairly specific. The packed variable is supposed to be type byte/short/int, not float. Most of the tests encode with a `scale_factor` or `add_offset` that require the packed data to be type float. Rather than trying to solve all this, I have just split the encode and decode dtype function.