From 759a85748d7cc68e9633a506e6ec7637142d1bfc Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 16 Nov 2022 13:01:15 -0500 Subject: [PATCH 1/6] add test for resave breaking hdu array links issue #1232 --- asdf/tests/test_fits_embed.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/asdf/tests/test_fits_embed.py b/asdf/tests/test_fits_embed.py index a3ff0a4f2..669cceece 100644 --- a/asdf/tests/test_fits_embed.py +++ b/asdf/tests/test_fits_embed.py @@ -483,3 +483,32 @@ def test_array_view_different_layout(tmp_path): ValueError, match=r"ASDF has only limited support for serializing views over arrays stored in FITS HDUs" ): af.write_to(file_path) + + +def test_resave_breaks_hdulist_tree_array_link(tmp_path): + """ + Test that writing, reading and rewriting an AsdfInFits file + maintains links between hdus and arrays in the asdf tree + + If the link is broken, data can be duplicated (exist both + as a hdu and as an internal block in the asdf tree). + + See issues: + https://github.com/asdf-format/asdf/issues/1232 + https://github.com/spacetelescope/jwst/issues/7354 + https://github.com/spacetelescope/jwst/issues/7274 + """ + file_path_1 = tmp_path / "test1.fits" + file_path_2 = tmp_path / "test2.fits" + + af = create_asdf_in_fits("f4") + af.write_to(file_path_1) + + with asdf.open(file_path_1) as af1: + af1.write_to(file_path_2) + + # check that af1 (original write) and af2 (rewrite) do not contain internal blocks + with fits.open(file_path_1) as af1, fits.open(file_path_2) as af2: + for f in (af1, af2): + block_bytes = f["ASDF"].data.tobytes().split(b"...")[1].strip() + assert len(block_bytes) == 0 From 4daaa73304c82d03613edaf598315348eea2629c Mon Sep 17 00:00:00 2001 From: Brett Date: Wed, 16 Nov 2022 13:03:53 -0500 Subject: [PATCH 2/6] add NDArrayType support to get_array_base and fits block finding NDArrayType fails checks to isinstance(arr, ndarray) as it is not a subclass of ndarray. This prevented util.get_array_base from finding the correct base array which contributed to failures to link hdu items to arrays within the tree (leading to duplication of data). The EmbeddedBlockManager in fits_embed also skipped checking for links between NDArrayType and hdu items which also broke the link between hdu items and arrays within the tree. This PR removes the skip. fixes issue #1232 --- asdf/fits_embed.py | 13 ++++++------- asdf/util.py | 4 +++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/asdf/fits_embed.py b/asdf/fits_embed.py index b43628133..480b0af5d 100644 --- a/asdf/fits_embed.py +++ b/asdf/fits_embed.py @@ -92,13 +92,12 @@ def get_source(self, block): def find_or_create_block_for_array(self, arr, ctx): from .tags.core import ndarray - if not isinstance(arr, ndarray.NDArrayType): - base = util.get_array_base(arr) - for hdu in self._hdulist: - if hdu.data is None: - continue - if base is util.get_array_base(hdu.data): - return _FitsBlock(hdu) + base = util.get_array_base(arr) + for hdu in self._hdulist: + if hdu.data is None: + continue + if base is util.get_array_base(hdu.data): + return _FitsBlock(hdu) return super().find_or_create_block_for_array(arr, ctx) diff --git a/asdf/util.py b/asdf/util.py index a45c86357..df8924975 100644 --- a/asdf/util.py +++ b/asdf/util.py @@ -85,8 +85,10 @@ def get_array_base(arr): For a given Numpy array, finds the base array that "owns" the actual data. """ + from .tags.core import ndarray + base = arr - while isinstance(base.base, np.ndarray): + while isinstance(base.base, (np.ndarray, ndarray.NDArrayType)): base = base.base return base From 36117eb7c83b69fa726acd348ae63098525b7ab5 Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 18 Nov 2022 11:07:11 -0500 Subject: [PATCH 3/6] add test for hdu array link in fits --- asdf/tests/test_fits_embed.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/asdf/tests/test_fits_embed.py b/asdf/tests/test_fits_embed.py index 669cceece..a484ab156 100644 --- a/asdf/tests/test_fits_embed.py +++ b/asdf/tests/test_fits_embed.py @@ -463,6 +463,29 @@ def test_array_view_compatible_dtype(tmp_path): af.write_to(file_path) +def test_hdu_link_independence(tmp_path): + asdf_in_fits = create_asdf_in_fits("f4") + # set all arrays to same values + asdf_in_fits["model"]["sci"]["data"][:] = 0 + asdf_in_fits["model"]["dq"]["data"][:] = 0 + asdf_in_fits["model"]["err"]["data"][:] = 0 + + fn0 = tmp_path / "test0.fits" + + # write out the asdf in fits file + asdf_in_fits.write_to(str(fn0)) + + with asdf.open(fn0, mode="r") as aif: + # assign new values + aif["model"]["sci"]["data"][:] = 1 + aif["model"]["dq"]["data"][:] = 2 + aif["model"]["err"]["data"][:] = 3 + + assert np.all(aif["model"]["sci"]["data"] == 1) + assert np.all(aif["model"]["dq"]["data"] == 1) + assert np.all(aif["model"]["err"]["data"] == 1) + + def test_array_view_different_layout(tmp_path): """ A view over the FITS array with a different memory layout From 1dae83a9807d7339e7a8981969c77f79e608dc6c Mon Sep 17 00:00:00 2001 From: Brett Date: Fri, 18 Nov 2022 16:39:56 -0500 Subject: [PATCH 4/6] fix test_hdu_link_independence test --- asdf/tests/test_fits_embed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asdf/tests/test_fits_embed.py b/asdf/tests/test_fits_embed.py index a484ab156..1ea062df7 100644 --- a/asdf/tests/test_fits_embed.py +++ b/asdf/tests/test_fits_embed.py @@ -482,8 +482,8 @@ def test_hdu_link_independence(tmp_path): aif["model"]["err"]["data"][:] = 3 assert np.all(aif["model"]["sci"]["data"] == 1) - assert np.all(aif["model"]["dq"]["data"] == 1) - assert np.all(aif["model"]["err"]["data"] == 1) + assert np.all(aif["model"]["dq"]["data"] == 2) + assert np.all(aif["model"]["err"]["data"] == 3) def test_array_view_different_layout(tmp_path): From 0c904cb7b767c0f5f45337856a8b6c8e76a92ad0 Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 21 Nov 2022 10:42:45 -0500 Subject: [PATCH 5/6] update changelog --- CHANGES.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.rst b/CHANGES.rst index ce2f424d8..743b86242 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,7 @@ - Modify generic_file for fsspec compatibility [#1226] - Add fsspec http filesystem support [#1228] - Memmap whole file instead of each array [#1230] +- Fix issue #1232 where array data was duplicated during resaving of a fits file [#1234] 2.13.0 (2022-08-19) ------------------- From b812f426a5f97adb9d07b876adffa03920efe3cb Mon Sep 17 00:00:00 2001 From: Brett Date: Mon, 21 Nov 2022 13:13:08 -0500 Subject: [PATCH 6/6] improve test_hdu_link_independence docstring --- asdf/tests/test_fits_embed.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/asdf/tests/test_fits_embed.py b/asdf/tests/test_fits_embed.py index 1ea062df7..84b7bd631 100644 --- a/asdf/tests/test_fits_embed.py +++ b/asdf/tests/test_fits_embed.py @@ -464,6 +464,16 @@ def test_array_view_compatible_dtype(tmp_path): def test_hdu_link_independence(tmp_path): + """ + As links between arrays and hdu items are made during + saving, it's possible that if this goes wrong links + might be made between multiple arrays and a single hdu. + In this case, modifying one array will change memory + shared with another array. This test creates a file + with multiple arrays, writes it to a fits file, + reads it back in and then modifies the contents of + each array to check for this possible errort. + """ asdf_in_fits = create_asdf_in_fits("f4") # set all arrays to same values asdf_in_fits["model"]["sci"]["data"][:] = 0