Commit

Fix bugs resulting in invalid strides values for FITS arrays and views over base arrays with negative strides.
Ed Slavich committed Feb 22, 2021
1 parent 9e63770 commit c2498f9
Showing 6 changed files with 160 additions and 55 deletions.
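For context, a minimal sketch (not part of the commit; assuming only numpy) of the failure mode this commit addresses. A view with negative strides over a base array flattens in buffer order under `ravel(order='K')`, so offset/strides bookkeeping done against the view's own layout can disagree with the bytes actually written:

```python
import numpy as np

base = np.arange(10, dtype=np.int64)
view = base[::-1]
print(view.strides)                # (-8,) -- a negative stride

# ravel(order='K') reads elements in memory order, reversing axes with
# negative strides, so the raveled bytes match the base buffer rather
# than the view's own element order:
print(view.ravel(order='K'))       # [0 1 2 ... 9]

# The view's offset relative to the base points at the view's first
# element, i.e. the far end of the buffer region:
print(view.ctypes.data - base.ctypes.data)   # 72

# Roughly: recording this offset together with the original negative
# strides, while block bytes are written in buffer order, is how the
# invalid strides values fixed here could arise.
```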
12 changes: 9 additions & 3 deletions CHANGES.rst
@@ -43,16 +43,22 @@
- Add ``edit`` subcommand to asdftool for efficient editing of
the YAML portion of an ASDF file. [#873, #922]

2.7.3 (unreleased)
------------------

- Add pytest plugin options to skip and xfail individual tests
and xfail the unsupported ndarray-1.0.0 example. [#929]

- Fix bugs resulting in invalid strides values for FITS arrays
and views over base arrays with negative strides. [#930]

2.7.2 (2021-01-15)
------------------

- Fix bug causing test collection failures in some environments. [#889]

- Fix bug when decompressing arrays with numpy 1.20. [#901, #909]

- Add pytest plugin options to skip and xfail individual tests
and xfail the unsupported ndarray-1.0.0 example. [#929]

2.7.1 (2020-08-18)
------------------

67 changes: 47 additions & 20 deletions asdf/block.py
@@ -851,13 +851,6 @@ def end_offset(self):
"""
return self.offset + self.header_size + self.allocated

def override_byteorder(self, byteorder):
"""
Hook to permit overriding the byteorder value stored in the
tree. This is used to support blocks stored in FITS files.
"""
return byteorder

@property
def trust_data_dtype(self):
"""
@@ -910,11 +903,11 @@ def _set_checksum(self, checksum):
else:
self._checksum = checksum

def _calculate_checksum(self, data):
def _calculate_checksum(self, array):
# The following line is safe because we're only using
# the MD5 as a checksum.
m = hashlib.new('md5') # nosec
m.update(self.data.ravel('K'))
m.update(array)
return m.digest()

def validate_checksum(self):
@@ -929,7 +922,7 @@ def validate_checksum(self):
`False`.
"""
if self._checksum:
checksum = self._calculate_checksum(self.data)
checksum = self._calculate_checksum(self._flattened_data)
if checksum != self._checksum:
return False
return True
@@ -938,7 +931,7 @@ def update_checksum(self):
"""
Update the checksum based on the current data contents.
"""
self._checksum = self._calculate_checksum(self.data)
self._checksum = self._calculate_checksum(self._flattened_data)

def update_size(self):
"""
@@ -947,13 +940,14 @@ def update_size(self):
updating the file in-place, otherwise the work is redundant.
"""
if self._data is not None:
self._data_size = self._data.data.nbytes
data = self._flattened_data
self._data_size = data.nbytes

if not self.output_compression:
self._size = self._data_size
else:
self._size = mcompression.get_compressed_size(
self._data, self.output_compression)
data, self.output_compression)
else:
self._data_size = self._size = 0

@@ -1098,22 +1092,55 @@ def _memmap_data(self):
self._data = self._fd.memmap_array(self.data_offset, self._size)
self._memmapped = True

@property
def _flattened_data(self):
"""
Retrieve flattened data suitable for writing.

Returns
-------
np.ndarray
1D contiguous array.
"""
data = self.data

# Reverse axes with negative strides so that we write the array
# according to the memory layout of the underlying buffer.
if any(s < 0 for s in data.strides):
slices = []
for stride in data.strides:
if stride < 0:
slices.append(slice(None, None, -1))
else:
slices.append(slice(None))
data = data[tuple(slices)]

# 'K' order flattens the array in the order that elements
# occur in memory (except negative strides are reversed,
# which we handled above).
return data.ravel(order='K')

def write(self, fd):
"""
Write an internal block to the given Python file-like object.
"""
self._header_size = self._header.size

if self._data is not None:
data = self._flattened_data
else:
data = None

flags = 0
data_size = used_size = allocated_size = 0
if self._array_storage == 'streamed':
flags |= constants.BLOCK_FLAG_STREAMED
elif self._data is not None:
self.update_checksum()
data_size = self._data.nbytes
elif data is not None:
self._checksum = self._calculate_checksum(data)
data_size = data.nbytes
if not fd.seekable() and self.output_compression:
buff = io.BytesIO()
mcompression.compress(buff, self._data,
mcompression.compress(buff, data,
self.output_compression)
self.allocated = self._size = buff.tell()
allocated_size = self.allocated
@@ -1138,7 +1165,7 @@ def write(self, fd):
used_size=used_size, data_size=data_size,
checksum=checksum))

if self._data is not None:
if data is not None:
if self.output_compression:
if not fd.seekable():
fd.write(buff.getvalue())
@@ -1149,7 +1176,7 @@ def write(self, fd):
# header.
start = fd.tell()
mcompression.compress(
fd, self._data, self.output_compression)
fd, data, self.output_compression)
end = fd.tell()
self.allocated = self._size = end - start
fd.seek(self.offset + 6)
@@ -1161,7 +1188,7 @@ def write(self, fd):
else:
if used_size != data_size:
raise RuntimeError(f"Block used size {used_size} is not equal to the data size {data_size}")
fd.write_array(self._data)
fd.write_array(data)

@property
def data(self):
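The new `_flattened_data` property is the heart of the block.py changes: checksums and writes now both operate on a 1D flattened form that follows the base buffer's memory layout. A standalone sketch of the same logic (the `flattened_data` function name is hypothetical):

```python
import hashlib

import numpy as np


def flattened_data(data):
    # Standalone mirror (a sketch) of the _flattened_data property above:
    # reverse any axes with negative strides so we flatten in the order
    # of the underlying buffer.
    if any(s < 0 for s in data.strides):
        slices = tuple(slice(None, None, -1) if s < 0 else slice(None)
                       for s in data.strides)
        data = data[slices]
    # 'K' order reads elements in the order they occur in memory.
    return data.ravel(order='K')


base = np.arange(12, dtype=np.int64).reshape(3, 4)
view = base[::-1, ::-1]                    # strides (-32, -8)
flat = flattened_data(view)
assert flat.ndim == 1 and flat.flags.contiguous
assert flat.tobytes() == base.tobytes()    # matches the base buffer order

# The checksum is now computed over this flattened form, as in
# _calculate_checksum above (MD5 used only as a checksum):
m = hashlib.new('md5')  # nosec
m.update(flat)
checksum = m.digest()
```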
7 changes: 0 additions & 7 deletions asdf/fits_embed.py
@@ -46,13 +46,6 @@ def readonly(self):
def array_storage(self):
return 'fits'

def override_byteorder(self, byteorder):
# FITS data is always stored in big-endian byte order.
# The data array may not report big-endian, but we want
# the value written to the tree to match the actual
# byte order on disk.
return 'big'

@property
def trust_data_dtype(self):
# astropy.io.fits returns arrays in native byte order
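The `override_byteorder` hook removed here (and from block.py above) is replaced by the `override` parameter threaded through `numpy_dtype_to_asdf_datatype` in ndarray.py below. A simplified sketch of the updated mapping (error handling omitted):

```python
import sys


def numpy_byteorder_to_asdf_byteorder(byteorder, override=None):
    # Sketch of the updated mapping from asdf/tags/core/ndarray.py (below).
    if override is not None:
        return override              # e.g. 'big' for FITS-backed blocks
    if byteorder == '=':
        return sys.byteorder         # native order
    return {'<': 'little', '>': 'big'}[byteorder]


# FITS stores data big-endian on disk, so the serializer now passes
# override_byteorder="big" regardless of what the in-memory dtype reports:
assert numpy_byteorder_to_asdf_byteorder('=', override='big') == 'big'
assert numpy_byteorder_to_asdf_byteorder('<') == 'little'
```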
27 changes: 21 additions & 6 deletions asdf/generic_io.py
@@ -92,7 +92,7 @@ def _array_fromfile(fd, size):
"""


def _array_tofile_chunked(write, array, chunksize): # pragma: no cover
def _array_tofile_chunked(write, array, chunksize):
array = array.view(np.uint8)
for i in range(0, array.nbytes, chunksize):
write(array[i:i + chunksize].data)
@@ -101,8 +101,7 @@ def _array_tofile_chunked(write, array, chunksize):
def _array_tofile_simple(fd, write, array):
return write(array.data)


if sys.platform == 'darwin': # pragma: no cover
if sys.platform == 'darwin':
def _array_tofile(fd, write, array):
# This value is currently set as a workaround for a known bug in Python
# on OSX. Individual writes must be less than 2GB, which necessitates
@@ -112,7 +111,7 @@ def _array_tofile(fd, write, array):
if fd is None or array.nbytes >= OSX_WRITE_LIMIT and array.nbytes % 4096 == 0:
return _array_tofile_chunked(write, array, OSX_WRITE_LIMIT)
return _array_tofile_simple(fd, write, array)
elif sys.platform.startswith('win'): # pragma: no cover
elif sys.platform.startswith('win'):
def _array_tofile(fd, write, array):
WIN_WRITE_LIMIT = 2 ** 30
return _array_tofile_chunked(write, array, WIN_WRITE_LIMIT)
@@ -370,7 +369,20 @@ def write(self, content):
"""

def write_array(self, array):
_array_tofile(None, self.write, array.ravel(order='K'))
"""
Write array content to the file. Array must be 1D contiguous
so that this method can avoid making assumptions about the
intended memory layout. Endianness is preserved.

Parameters
----------
array : np.ndarray
Must be 1D contiguous.
"""
if len(array.shape) != 1 or not array.flags.contiguous:
raise ValueError("Requires 1D contiguous array.")

_array_tofile(None, self.write, array)

def seek(self, offset, whence=0):
"""
@@ -749,7 +761,10 @@ def write_array(self, arr):
arr.flush()
self.fast_forward(len(arr.data))
else:
_array_tofile(self._fd, self._fd.write, arr.ravel(order='K'))
if len(arr.shape) != 1 or not arr.flags.contiguous:
raise ValueError("Requires 1D contiguous array.")

_array_tofile(self._fd, self._fd.write, arr)

def can_memmap(self):
return True
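`write_array` now refuses anything but 1D contiguous input, shifting the flattening responsibility to the block layer. A standalone sketch of the tightened contract and the chunked write path it feeds (the `CHUNKSIZE` value is illustrative, not one of the platform limits above):

```python
import io

import numpy as np

CHUNKSIZE = 1 << 20  # illustrative; the platform code above picks real limits


def array_tofile_chunked(write, array, chunksize=CHUNKSIZE):
    # Mirrors _array_tofile_chunked: emit raw bytes in bounded chunks.
    array = array.view(np.uint8)
    for i in range(0, array.nbytes, chunksize):
        write(array[i:i + chunksize].data)


def write_array(fd, array):
    # Callers are now expected to pre-flatten (e.g. via _flattened_data);
    # write_array no longer guesses at the intended memory layout.
    if len(array.shape) != 1 or not array.flags.contiguous:
        raise ValueError("Requires 1D contiguous array.")
    array_tofile_chunked(fd.write, array)


buf = io.BytesIO()
write_array(buf, np.arange(5, dtype='<i4'))
assert buf.getvalue() == np.arange(5, dtype='<i4').tobytes()
```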
62 changes: 43 additions & 19 deletions asdf/tags/core/ndarray.py
@@ -81,7 +81,10 @@ def asdf_datatype_to_numpy_dtype(datatype, byteorder=None):
raise ValueError("Unknown datatype {0}".format(datatype))


def numpy_byteorder_to_asdf_byteorder(byteorder):
def numpy_byteorder_to_asdf_byteorder(byteorder, override=None):
if override is not None:
return override

if byteorder == '=':
return sys.byteorder
elif byteorder == '<':
@@ -90,38 +93,38 @@ def numpy_byteorder_to_asdf_byteorder(byteorder):
return 'big'


def numpy_dtype_to_asdf_datatype(dtype, include_byteorder=True):
def numpy_dtype_to_asdf_datatype(dtype, include_byteorder=True, override_byteorder=None):
dtype = np.dtype(dtype)
if dtype.names is not None:
fields = []
for name in dtype.names:
field = dtype.fields[name][0]
d = {}
d['name'] = name
field_dtype, byteorder = numpy_dtype_to_asdf_datatype(field)
field_dtype, byteorder = numpy_dtype_to_asdf_datatype(field, override_byteorder=override_byteorder)
d['datatype'] = field_dtype
if include_byteorder:
d['byteorder'] = byteorder
if field.shape:
d['shape'] = list(field.shape)
fields.append(d)
return fields, numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return fields, numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.subdtype is not None:
return numpy_dtype_to_asdf_datatype(dtype.subdtype[0])
return numpy_dtype_to_asdf_datatype(dtype.subdtype[0], override_byteorder=override_byteorder)

elif dtype.name in _datatype_names:
return dtype.name, numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return dtype.name, numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.name == 'bool':
return 'bool8', numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return 'bool8', numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.name.startswith('string') or dtype.name.startswith('bytes'):
return ['ascii', dtype.itemsize], 'big'

elif dtype.name.startswith('unicode') or dtype.name.startswith('str'):
return (['ucs4', int(dtype.itemsize / 4)],
numpy_byteorder_to_asdf_byteorder(dtype.byteorder))
numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder))

raise ValueError("Unknown dtype {0}".format(dtype))

@@ -408,32 +411,53 @@ def reserve_blocks(cls, data, ctx):

@classmethod
def to_tree(cls, data, ctx):
# The ndarray-1.0.0 schema does not permit zero-valued strides.
# Perhaps we'll want to allow this someday, to efficiently
# represent an array of all the same value.
if any(stride == 0 for stride in data.strides):
data = np.ascontiguousarray(data)

base = util.get_array_base(data)
shape = data.shape
dtype = data.dtype
offset = data.ctypes.data - base.ctypes.data

if data.flags.c_contiguous:
block = ctx.blocks.find_or_create_block_for_array(data, ctx)

if block.array_storage == "fits":
# Views over arrays stored in FITS files have some idiosyncrasies.
# astropy.io.fits always writes arrays C-contiguous with big-endian
# byte order, whereas asdf preserves the contiguity and byte order
# of the base array.
if block.data is not data:
raise ValueError("Cannot serialize view over FITS array")

offset = 0
strides = None
dtype, byteorder = numpy_dtype_to_asdf_datatype(
data.dtype,
include_byteorder=(block.array_storage != "inline"),
override_byteorder="big",
)
else:
strides = data.strides
base = util.get_array_base(data)
# Compute the offset relative to the base array and not the
# block data, in case the block is compressed.
offset = data.ctypes.data - base.ctypes.data

block = ctx.blocks.find_or_create_block_for_array(data, ctx)
if data.flags.c_contiguous:
strides = None
else:
strides = data.strides

dtype, byteorder = numpy_dtype_to_asdf_datatype(
data.dtype,
include_byteorder=(block.array_storage != "inline"),
)

result = {}

result['shape'] = list(shape)
if block.array_storage == 'streamed':
result['shape'][0] = '*'

dtype, byteorder = numpy_dtype_to_asdf_datatype(
dtype, include_byteorder=(block.array_storage != 'inline'))

byteorder = block.override_byteorder(byteorder)

if block.array_storage == 'inline':
listdata = numpy_array_to_list(data)
result['data'] = listdata
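The new guard at the top of `to_tree` exists because broadcast views carry zero-valued strides, which the ndarray-1.0.0 schema forbids. A quick demonstration (assuming only numpy):

```python
import numpy as np

# Broadcast views carry zero-valued strides:
tiled = np.broadcast_to(np.arange(4.0), (3, 4))
print(tiled.strides)           # (0, 8)
assert any(s == 0 for s in tiled.strides)

# to_tree now materializes such arrays before serializing them:
materialized = np.ascontiguousarray(tiled)
print(materialized.strides)    # (32, 8)
```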