Commit

Fix bugs resulting in invalid strides values for FITS arrays and views over base arrays with negative strides.
Ed Slavich committed Feb 22, 2021
1 parent 9e63770 commit c2498f9
Showing 6 changed files with 160 additions and 55 deletions.
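For context, a minimal sketch (not part of the commit; assuming only numpy) of the failure mode this commit addresses. A view with negative strides over a base array flattens in buffer order under `ravel(order='K')`, so offset/strides bookkeeping done against the view's own layout can disagree with the bytes actually written:

```python
import numpy as np

base = np.arange(10, dtype=np.int64)
view = base[::-1]
print(view.strides)                # (-8,) -- a negative stride

# ravel(order='K') reads elements in memory order, reversing axes with
# negative strides, so the raveled bytes match the base buffer rather
# than the view's own element order:
print(view.ravel(order='K'))       # [0 1 2 ... 9]

# The view's offset relative to the base points at the view's first
# element, i.e. the far end of the buffer region:
print(view.ctypes.data - base.ctypes.data)   # 72

# Roughly: recording this offset together with the original negative
# strides, while block bytes are written in buffer order, is how the
# invalid strides values fixed here could arise.
```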
12 changes: 9 additions & 3 deletions CHANGES.rst
@@ -43,16 +43,22 @@
- Add ``edit`` subcommand to asdftool for efficient editing of
the YAML portion of an ASDF file. [#873, #922]

2.7.3 (unreleased)
------------------

- Add pytest plugin options to skip and xfail individual tests
and xfail the unsupported ndarray-1.0.0 example. [#929]

- Fix bugs resulting in invalid strides values for FITS arrays
and views over base arrays with negative strides. [#930]

2.7.2 (2021-01-15)
------------------

- Fix bug causing test collection failures in some environments. [#889]

- Fix bug when decompressing arrays with numpy 1.20. [#901, #909]

- Add pytest plugin options to skip and xfail individual tests
and xfail the unsupported ndarray-1.0.0 example. [#929]

2.7.1 (2020-08-18)
------------------

67 changes: 47 additions & 20 deletions asdf/block.py
@@ -851,13 +851,6 @@ def end_offset(self):
"""
return self.offset + self.header_size + self.allocated

def override_byteorder(self, byteorder):
"""
Hook to permit overriding the byteorder value stored in the
tree. This is used to support blocks stored in FITS files.
"""
return byteorder

@property
def trust_data_dtype(self):
"""
@@ -910,11 +903,11 @@ def _set_checksum(self, checksum):
else:
self._checksum = checksum

def _calculate_checksum(self, data):
def _calculate_checksum(self, array):
# The following line is safe because we're only using
# the MD5 as a checksum.
m = hashlib.new('md5') # nosec
m.update(self.data.ravel('K'))
m.update(array)
return m.digest()

def validate_checksum(self):
@@ -929,7 +922,7 @@ def validate_checksum(self):
`False`.
"""
if self._checksum:
checksum = self._calculate_checksum(self.data)
checksum = self._calculate_checksum(self._flattened_data)
if checksum != self._checksum:
return False
return True
@@ -938,7 +931,7 @@ def update_checksum(self):
"""
Update the checksum based on the current data contents.
"""
self._checksum = self._calculate_checksum(self.data)
self._checksum = self._calculate_checksum(self._flattened_data)

def update_size(self):
"""
@@ -947,13 +940,14 @@ def update_size(self):
updating the file in-place, otherwise the work is redundant.
"""
if self._data is not None:
self._data_size = self._data.data.nbytes
data = self._flattened_data
self._data_size = data.nbytes

if not self.output_compression:
self._size = self._data_size
else:
self._size = mcompression.get_compressed_size(
self._data, self.output_compression)
data, self.output_compression)
else:
self._data_size = self._size = 0

@@ -1098,22 +1092,55 @@ def _memmap_data(self):
self._data = self._fd.memmap_array(self.data_offset, self._size)
self._memmapped = True

@property
def _flattened_data(self):
"""
Retrieve flattened data suitable for writing.

Returns
-------
np.ndarray
1D contiguous array.
"""
data = self.data

# Reverse axes with negative strides so that we write the array
# according to the memory layout of the underlying buffer.
if any(s < 0 for s in data.strides):
slices = []
for stride in data.strides:
if stride < 0:
slices.append(slice(None, None, -1))
else:
slices.append(slice(None))
data = data[tuple(slices)]

# 'K' order flattens the array in the order that elements
# occur in memory (except negative strides are reversed,
# which we handled above).
return data.ravel(order='K')

def write(self, fd):
"""
Write an internal block to the given Python file-like object.
"""
self._header_size = self._header.size

if self._data is not None:
data = self._flattened_data
else:
data = None

flags = 0
data_size = used_size = allocated_size = 0
if self._array_storage == 'streamed':
flags |= constants.BLOCK_FLAG_STREAMED
elif self._data is not None:
self.update_checksum()
data_size = self._data.nbytes
elif data is not None:
self._checksum = self._calculate_checksum(data)
data_size = data.nbytes
if not fd.seekable() and self.output_compression:
buff = io.BytesIO()
mcompression.compress(buff, self._data,
mcompression.compress(buff, data,
self.output_compression)
self.allocated = self._size = buff.tell()
allocated_size = self.allocated
@@ -1138,7 +1165,7 @@ def write(self, fd):
used_size=used_size, data_size=data_size,
checksum=checksum))

if self._data is not None:
if data is not None:
if self.output_compression:
if not fd.seekable():
fd.write(buff.getvalue())
@@ -1149,7 +1176,7 @@ def write(self, fd):
# header.
start = fd.tell()
mcompression.compress(
fd, self._data, self.output_compression)
fd, data, self.output_compression)
end = fd.tell()
self.allocated = self._size = end - start
fd.seek(self.offset + 6)
@@ -1161,7 +1188,7 @@ def write(self, fd):
else:
if used_size != data_size:
raise RuntimeError(f"Block used size {used_size} is not equal to the data size {data_size}")
fd.write_array(self._data)
fd.write_array(data)

@property
def data(self):
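The new `_flattened_data` property is the heart of the block.py changes: checksums and writes now both operate on a 1D flattened form that follows the base buffer's memory layout. A standalone sketch of the same logic (the `flattened_data` function name is hypothetical):

```python
import hashlib

import numpy as np


def flattened_data(data):
    # Standalone mirror (a sketch) of the _flattened_data property above:
    # reverse any axes with negative strides so we flatten in the order
    # of the underlying buffer.
    if any(s < 0 for s in data.strides):
        slices = tuple(slice(None, None, -1) if s < 0 else slice(None)
                       for s in data.strides)
        data = data[slices]
    # 'K' order reads elements in the order they occur in memory.
    return data.ravel(order='K')


base = np.arange(12, dtype=np.int64).reshape(3, 4)
view = base[::-1, ::-1]                    # strides (-32, -8)
flat = flattened_data(view)
assert flat.ndim == 1 and flat.flags.contiguous
assert flat.tobytes() == base.tobytes()    # matches the base buffer order

# The checksum is now computed over this flattened form, as in
# _calculate_checksum above (MD5 used only as a checksum):
m = hashlib.new('md5')  # nosec
m.update(flat)
checksum = m.digest()
```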
7 changes: 0 additions & 7 deletions asdf/fits_embed.py
@@ -46,13 +46,6 @@ def readonly(self):
def array_storage(self):
return 'fits'

def override_byteorder(self, byteorder):
# FITS data is always stored in big-endian byte order.
# The data array may not report big-endian, but we want
# the value written to the tree to match the actual
# byte order on disk.
return 'big'

@property
def trust_data_dtype(self):
# astropy.io.fits returns arrays in native byte order
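The `override_byteorder` hook removed here (and from block.py above) is replaced by the `override` parameter threaded through `numpy_dtype_to_asdf_datatype` in ndarray.py below. A simplified sketch of the updated mapping (error handling omitted):

```python
import sys


def numpy_byteorder_to_asdf_byteorder(byteorder, override=None):
    # Sketch of the updated mapping from asdf/tags/core/ndarray.py (below).
    if override is not None:
        return override              # e.g. 'big' for FITS-backed blocks
    if byteorder == '=':
        return sys.byteorder         # native order
    return {'<': 'little', '>': 'big'}[byteorder]


# FITS stores data big-endian on disk, so the serializer now passes
# override_byteorder="big" regardless of what the in-memory dtype reports:
assert numpy_byteorder_to_asdf_byteorder('=', override='big') == 'big'
assert numpy_byteorder_to_asdf_byteorder('<') == 'little'
```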
27 changes: 21 additions & 6 deletions asdf/generic_io.py
@@ -92,7 +92,7 @@ def _array_fromfile(fd, size):
"""


def _array_tofile_chunked(write, array, chunksize): # pragma: no cover
def _array_tofile_chunked(write, array, chunksize):
array = array.view(np.uint8)
for i in range(0, array.nbytes, chunksize):
write(array[i:i + chunksize].data)
@@ -101,8 +101,7 @@ def _array_tofile_chunked(write, array, chunksize):
def _array_tofile_simple(fd, write, array):
return write(array.data)


if sys.platform == 'darwin': # pragma: no cover
if sys.platform == 'darwin':
def _array_tofile(fd, write, array):
# This value is currently set as a workaround for a known bug in Python
# on OSX. Individual writes must be less than 2GB, which necessitates
@@ -112,7 +111,7 @@ def _array_tofile(fd, write, array):
if fd is None or array.nbytes >= OSX_WRITE_LIMIT and array.nbytes % 4096 == 0:
return _array_tofile_chunked(write, array, OSX_WRITE_LIMIT)
return _array_tofile_simple(fd, write, array)
elif sys.platform.startswith('win'): # pragma: no cover
elif sys.platform.startswith('win'):
def _array_tofile(fd, write, array):
WIN_WRITE_LIMIT = 2 ** 30
return _array_tofile_chunked(write, array, WIN_WRITE_LIMIT)
@@ -370,7 +369,20 @@ def write(self, content):
"""

def write_array(self, array):
_array_tofile(None, self.write, array.ravel(order='K'))
"""
Write array content to the file. Array must be 1D contiguous
so that this method can avoid making assumptions about the
intended memory layout. Endianness is preserved.

Parameters
----------
array : np.ndarray
Must be 1D contiguous.
"""
if len(array.shape) != 1 or not array.flags.contiguous:
raise ValueError("Requires 1D contiguous array.")

_array_tofile(None, self.write, array)

def seek(self, offset, whence=0):
"""
@@ -749,7 +761,10 @@ def write_array(self, arr):
arr.flush()
self.fast_forward(len(arr.data))
else:
_array_tofile(self._fd, self._fd.write, arr.ravel(order='K'))
if len(arr.shape) != 1 or not arr.flags.contiguous:
raise ValueError("Requires 1D contiguous array.")

_array_tofile(self._fd, self._fd.write, arr)

def can_memmap(self):
return True
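`write_array` now refuses anything but 1D contiguous input, shifting the flattening responsibility to the block layer. A standalone sketch of the tightened contract and the chunked write path it feeds (the `CHUNKSIZE` value is illustrative, not one of the platform limits above):

```python
import io

import numpy as np

CHUNKSIZE = 1 << 20  # illustrative; the platform code above picks real limits


def array_tofile_chunked(write, array, chunksize=CHUNKSIZE):
    # Mirrors _array_tofile_chunked: emit raw bytes in bounded chunks.
    array = array.view(np.uint8)
    for i in range(0, array.nbytes, chunksize):
        write(array[i:i + chunksize].data)


def write_array(fd, array):
    # Callers are now expected to pre-flatten (e.g. via _flattened_data);
    # write_array no longer guesses at the intended memory layout.
    if len(array.shape) != 1 or not array.flags.contiguous:
        raise ValueError("Requires 1D contiguous array.")
    array_tofile_chunked(fd.write, array)


buf = io.BytesIO()
write_array(buf, np.arange(5, dtype='<i4'))
assert buf.getvalue() == np.arange(5, dtype='<i4').tobytes()
```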
62 changes: 43 additions & 19 deletions asdf/tags/core/ndarray.py
@@ -81,7 +81,10 @@ def asdf_datatype_to_numpy_dtype(datatype, byteorder=None):
raise ValueError("Unknown datatype {0}".format(datatype))


def numpy_byteorder_to_asdf_byteorder(byteorder):
def numpy_byteorder_to_asdf_byteorder(byteorder, override=None):
if override is not None:
return override

if byteorder == '=':
return sys.byteorder
elif byteorder == '<':
@@ -90,38 +93,38 @@ def numpy_byteorder_to_asdf_byteorder(byteorder):
return 'big'


def numpy_dtype_to_asdf_datatype(dtype, include_byteorder=True):
def numpy_dtype_to_asdf_datatype(dtype, include_byteorder=True, override_byteorder=None):
dtype = np.dtype(dtype)
if dtype.names is not None:
fields = []
for name in dtype.names:
field = dtype.fields[name][0]
d = {}
d['name'] = name
field_dtype, byteorder = numpy_dtype_to_asdf_datatype(field)
field_dtype, byteorder = numpy_dtype_to_asdf_datatype(field, override_byteorder=override_byteorder)
d['datatype'] = field_dtype
if include_byteorder:
d['byteorder'] = byteorder
if field.shape:
d['shape'] = list(field.shape)
fields.append(d)
return fields, numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return fields, numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.subdtype is not None:
return numpy_dtype_to_asdf_datatype(dtype.subdtype[0])
return numpy_dtype_to_asdf_datatype(dtype.subdtype[0], override_byteorder=override_byteorder)

elif dtype.name in _datatype_names:
return dtype.name, numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return dtype.name, numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.name == 'bool':
return 'bool8', numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return 'bool8', numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.name.startswith('string') or dtype.name.startswith('bytes'):
return ['ascii', dtype.itemsize], 'big'

elif dtype.name.startswith('unicode') or dtype.name.startswith('str'):
return (['ucs4', int(dtype.itemsize / 4)],
numpy_byteorder_to_asdf_byteorder(dtype.byteorder))
numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder))

raise ValueError("Unknown dtype {0}".format(dtype))

@@ -408,32 +411,53 @@ def reserve_blocks(cls, data, ctx):

@classmethod
def to_tree(cls, data, ctx):
# The ndarray-1.0.0 schema does not permit zero-valued strides.
# Perhaps we'll want to allow this someday, to efficiently
# represent an array of all the same value.
if any(stride == 0 for stride in data.strides):
data = np.ascontiguousarray(data)

base = util.get_array_base(data)
shape = data.shape
dtype = data.dtype
offset = data.ctypes.data - base.ctypes.data

if data.flags.c_contiguous:
block = ctx.blocks.find_or_create_block_for_array(data, ctx)

if block.array_storage == "fits":
# Views over arrays stored in FITS files have some idiosyncrasies.
# astropy.io.fits always writes arrays C-contiguous with big-endian
# byte order, whereas asdf preserves the contiguity and byte order
# of the base array.
if block.data is not data:
raise ValueError("Cannot serialize view over FITS array")

offset = 0
strides = None
dtype, byteorder = numpy_dtype_to_asdf_datatype(
data.dtype,
include_byteorder=(block.array_storage != "inline"),
override_byteorder="big",
)
else:
strides = data.strides
base = util.get_array_base(data)
# Compute the offset relative to the base array and not the
# block data, in case the block is compressed.
offset = data.ctypes.data - base.ctypes.data

block = ctx.blocks.find_or_create_block_for_array(data, ctx)
if data.flags.c_contiguous:
strides = None
else:
strides = data.strides

dtype, byteorder = numpy_dtype_to_asdf_datatype(
data.dtype,
include_byteorder=(block.array_storage != "inline"),
)

result = {}

result['shape'] = list(shape)
if block.array_storage == 'streamed':
result['shape'][0] = '*'

dtype, byteorder = numpy_dtype_to_asdf_datatype(
dtype, include_byteorder=(block.array_storage != 'inline'))

byteorder = block.override_byteorder(byteorder)

if block.array_storage == 'inline':
listdata = numpy_array_to_list(data)
result['data'] = listdata
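The new guard at the top of `to_tree` exists because broadcast views carry zero-valued strides, which the ndarray-1.0.0 schema forbids. A quick demonstration (assuming only numpy):

```python
import numpy as np

# Broadcast views carry zero-valued strides:
tiled = np.broadcast_to(np.arange(4.0), (3, 4))
print(tiled.strides)           # (0, 8)
assert any(s == 0 for s in tiled.strides)

# to_tree now materializes such arrays before serializing them:
materialized = np.ascontiguousarray(tiled)
print(materialized.strides)    # (32, 8)
```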