Fix strides issues over FITS arrays and base arrays with negative strides #930

Merged 2 commits on Feb 25, 2021
12 changes: 9 additions & 3 deletions CHANGES.rst
@@ -43,16 +43,22 @@
- Add ``edit`` subcommand to asdftool for efficient editing of
the YAML portion of an ASDF file. [#873, #922]

2.7.3 (unreleased)
------------------

- Add pytest plugin options to skip and xfail individual tests
and xfail the unsupported ndarray-1.0.0 example. [#929]

- Fix bugs resulting in invalid strides values for FITS arrays
and views over base arrays with negative strides. [#930]

2.7.2 (2021-01-15)
------------------

- Fix bug causing test collection failures in some environments. [#889]

- Fix bug when decompressing arrays with numpy 1.20. [#901, #909]

- Add pytest plugin options to skip and xfail individual tests
and xfail the unsupported ndarray-1.0.0 example. [#929]

2.7.1 (2020-08-18)
------------------

67 changes: 47 additions & 20 deletions asdf/block.py
@@ -851,13 +851,6 @@ def end_offset(self):
"""
return self.offset + self.header_size + self.allocated

def override_byteorder(self, byteorder):
"""
Hook to permit overriding the byteorder value stored in the
tree. This is used to support blocks stored in FITS files.
"""
return byteorder

@property
def trust_data_dtype(self):
"""
@@ -910,11 +903,11 @@ def _set_checksum(self, checksum):
else:
self._checksum = checksum

def _calculate_checksum(self, data):
def _calculate_checksum(self, array):
# The following line is safe because we're only using
# the MD5 as a checksum.
m = hashlib.new('md5') # nosec
m.update(self.data.ravel('K'))
m.update(array)
return m.digest()

def validate_checksum(self):
@@ -929,7 +922,7 @@ def validate_checksum(self):
`False`.
"""
if self._checksum:
checksum = self._calculate_checksum(self.data)
checksum = self._calculate_checksum(self._flattened_data)
if checksum != self._checksum:
return False
return True
@@ -938,7 +931,7 @@ def update_checksum(self):
"""
Update the checksum based on the current data contents.
"""
self._checksum = self._calculate_checksum(self.data)
self._checksum = self._calculate_checksum(self._flattened_data)
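For illustration, a minimal sketch of the contract the refactored helper now relies on: callers pass an already-flattened buffer (here flat stands in for _flattened_data; plain NumPy/hashlib, not part of the diff):

import hashlib
import numpy as np

view = np.arange(8, dtype=np.uint8)[::-1]  # negative-stride view
flat = np.ascontiguousarray(view)          # stand-in for _flattened_data

m = hashlib.new('md5')  # nosec - checksum only, not a security use
m.update(flat)          # passing the raw view instead could raise BufferError
digest = m.digest()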

def update_size(self):
"""
@@ -947,13 +940,14 @@ def update_size(self):
updating the file in-place, otherwise the work is redundant.
"""
if self._data is not None:
self._data_size = self._data.data.nbytes
data = self._flattened_data
self._data_size = data.nbytes

if not self.output_compression:
self._size = self._data_size
else:
self._size = mcompression.get_compressed_size(
self._data, self.output_compression)
data, self.output_compression)
else:
self._data_size = self._size = 0

@@ -1098,22 +1092,55 @@ def _memmap_data(self):
self._data = self._fd.memmap_array(self.data_offset, self._size)
self._memmapped = True

@property
def _flattened_data(self):
"""
Retrieve flattened data suitable for writing.

Returns
-------
np.ndarray
1D contiguous array.
"""
data = self.data

# Reverse axes with negative strides so that we write the array
# according to the memory layout of the underlying buffer.
if any(s < 0 for s in data.strides):
slices = []
for stride in data.strides:
if stride < 0:
slices.append(slice(None, None, -1))
else:
slices.append(slice(None))
data = data[tuple(slices)]

# 'K' order flattens the array in the order that elements
# occur in memory (except negative strides are reversed,
# which we handled above).
return data.ravel(order='K')
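A quick sketch (plain NumPy, not part of the diff) of why the axes are reversed before the ravel:

import numpy as np

base = np.arange(6)   # buffer memory order: 0 1 2 3 4 5
view = base[::-1]     # negative stride; reads as 5 4 3 2 1 0

# ravel('K') alone reverses negative-stride axes, yielding 5 4 3 2 1 0,
# which does not match the buffer. Undoing the negative stride first
# recovers the true memory layout.
flat = view[::-1].ravel(order='K')
assert (flat == base).all()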

def write(self, fd):
"""
Write an internal block to the given Python file-like object.
"""
self._header_size = self._header.size

if self._data is not None:
data = self._flattened_data
else:
data = None

flags = 0
data_size = used_size = allocated_size = 0
if self._array_storage == 'streamed':
flags |= constants.BLOCK_FLAG_STREAMED
elif self._data is not None:
self.update_checksum()
data_size = self._data.nbytes
elif data is not None:
self._checksum = self._calculate_checksum(data)
data_size = data.nbytes
if not fd.seekable() and self.output_compression:
buff = io.BytesIO()
mcompression.compress(buff, self._data,
mcompression.compress(buff, data,
self.output_compression)
self.allocated = self._size = buff.tell()
allocated_size = self.allocated
@@ -1138,7 +1165,7 @@ def write(self, fd):
used_size=used_size, data_size=data_size,
checksum=checksum))

if self._data is not None:
if data is not None:
if self.output_compression:
if not fd.seekable():
fd.write(buff.getvalue())
@@ -1149,7 +1176,7 @@ def write(self, fd):
# header.
start = fd.tell()
mcompression.compress(
fd, self._data, self.output_compression)
fd, data, self.output_compression)
end = fd.tell()
self.allocated = self._size = end - start
fd.seek(self.offset + 6)
@@ -1161,7 +1188,7 @@ def write(self, fd):
else:
if used_size != data_size:
raise RuntimeError(f"Block used size {used_size} is not equal to the data size {data_size}")
fd.write_array(self._data)
fd.write_array(data)

@property
def data(self):
7 changes: 0 additions & 7 deletions asdf/fits_embed.py
@@ -46,13 +46,6 @@ def readonly(self):
def array_storage(self):
return 'fits'

def override_byteorder(self, byteorder):
# FITS data is always stored in big-endian byte order.
# The data array may not report big-endian, but we want
# the value written to the tree to match the actual
# byte order on disk.
return 'big'
Comment on lines -49 to -54

Contributor:
Why was this removed, and what was it ever supposed to do?

Contributor Author:
FITS always writes arrays in big-endian byte order, so we needed a mechanism to tell NDArrayType to ignore the byte order reported by the base array. This logic has been moved inside NDArrayType itself.

Contributor:
Glad to see the logic moved inside the class itself.

@property
def trust_data_dtype(self):
# astropy.io.fits returns arrays in native byte order
27 changes: 21 additions & 6 deletions asdf/generic_io.py
@@ -92,7 +92,7 @@ def _array_fromfile(fd, size):
"""


def _array_tofile_chunked(write, array, chunksize): # pragma: no cover
def _array_tofile_chunked(write, array, chunksize):
array = array.view(np.uint8)
for i in range(0, array.nbytes, chunksize):
write(array[i:i + chunksize].data)
@@ -101,8 +101,7 @@ def _array_tofile_chunked(write, array, chunksize): # pragma: no cover
def _array_tofile_simple(fd, write, array):
return write(array.data)


if sys.platform == 'darwin': # pragma: no cover
if sys.platform == 'darwin':
Contributor:
Odd to have this logic at the module level. Wouldn't it be clearer to have the logic in the calling function? Or is there more than one calling function?

Contributor Author:
I don't know for sure, but I suspect whoever wrote this was trying to be efficient by doing the check once at import time instead of on every call.
def _array_tofile(fd, write, array):
# This value is currently set as a workaround for a known bug in Python
# on OSX. Individual writes must be less than 2GB, which necessitates
Expand All @@ -112,7 +111,7 @@ def _array_tofile(fd, write, array):
if fd is None or array.nbytes >= OSX_WRITE_LIMIT and array.nbytes % 4096 == 0:
return _array_tofile_chunked(write, array, OSX_WRITE_LIMIT)
return _array_tofile_simple(fd, write, array)
elif sys.platform.startswith('win'): # pragma: no cover
elif sys.platform.startswith('win'):
def _array_tofile(fd, write, array):
WIN_WRITE_LIMIT = 2 ** 30
return _array_tofile_chunked(write, array, WIN_WRITE_LIMIT)
Comment on lines 111 to 117

Contributor:
OSX_WRITE_LIMIT and WIN_WRITE_LIMIT are not really global variables here, so they should probably be lowercase. Or you could make them global variables. They're identical.

Contributor Author:
That is a bit strange. I'll move those outside the functions.

Contributor Author:
Actually, I'm going to tackle this in a separate PR. These workarounds seem to no longer be necessary, but this PR will be included in asdf 2.7.3 and I'd rather drop them in 2.8.

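For reference, a sketch of the chunked fallback in action, driving the private helper from this diff with an in-memory writer:

import io
import numpy as np
from asdf.generic_io import _array_tofile_chunked  # private helper, shown for illustration

buf = io.BytesIO()
arr = np.arange(16, dtype=np.uint8)

# Four separate writes of 5 + 5 + 5 + 1 bytes; the output is identical
# to a single write, just split to stay under the platform limit.
_array_tofile_chunked(buf.write, arr, chunksize=5)
assert buf.getvalue() == arr.tobytes()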
@@ -370,7 +369,20 @@ def write(self, content):
"""

def write_array(self, array):
_array_tofile(None, self.write, array.ravel(order='K'))
"""
Write array content to the file. Array must be 1D contiguous
so that this method can avoid making assumptions about the
intended memory layout. Endianness is preserved.

Parameters
----------
array : np.ndarray
Must be 1D contiguous.
"""
if len(array.shape) != 1 or not array.flags.contiguous:
raise ValueError("Requires 1D contiguous array.")

_array_tofile(None, self.write, array)

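Caller-side, the new contract looks roughly like this (fd is a hypothetical open generic_io file handle):

import numpy as np

arr = np.ones((3, 4), dtype='<f8')

fd.write_array(arr.ravel(order='K'))  # OK: 1D contiguous
# fd.write_array(arr[:, ::2])         # would raise ValueError: Requires 1D contiguous array.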
def seek(self, offset, whence=0):
Contributor:
This is the first time I've seen whence as a kwarg. I like it. Though it's not really clear from the docstring what it does.

Contributor Author:
That's funny, I didn't notice that. Seems to be a bit of a tradition.

Contributor:
😅 Shows you how much I paid attention in my C programming!
"""
@@ -749,7 +761,10 @@ def write_array(self, arr):
arr.flush()
self.fast_forward(len(arr.data))
else:
_array_tofile(self._fd, self._fd.write, arr.ravel(order='K'))
if len(arr.shape) != 1 or not arr.flags.contiguous:
raise ValueError("Requires 1D contiguous array.")

_array_tofile(self._fd, self._fd.write, arr)

def can_memmap(self):
return True
70 changes: 51 additions & 19 deletions asdf/tags/core/ndarray.py
@@ -81,7 +81,10 @@ def asdf_datatype_to_numpy_dtype(datatype, byteorder=None):
raise ValueError("Unknown datatype {0}".format(datatype))


def numpy_byteorder_to_asdf_byteorder(byteorder):
def numpy_byteorder_to_asdf_byteorder(byteorder, override=None):
if override is not None:
return override

if byteorder == '=':
return sys.byteorder
elif byteorder == '<':
@@ -90,38 +93,38 @@ def numpy_byteorder_to_asdf_byteorder(byteorder):
return 'big'


def numpy_dtype_to_asdf_datatype(dtype, include_byteorder=True):
def numpy_dtype_to_asdf_datatype(dtype, include_byteorder=True, override_byteorder=None):
dtype = np.dtype(dtype)
if dtype.names is not None:
fields = []
for name in dtype.names:
field = dtype.fields[name][0]
d = {}
d['name'] = name
field_dtype, byteorder = numpy_dtype_to_asdf_datatype(field)
field_dtype, byteorder = numpy_dtype_to_asdf_datatype(field, override_byteorder=override_byteorder)
d['datatype'] = field_dtype
if include_byteorder:
d['byteorder'] = byteorder
if field.shape:
d['shape'] = list(field.shape)
fields.append(d)
return fields, numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return fields, numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.subdtype is not None:
return numpy_dtype_to_asdf_datatype(dtype.subdtype[0])
return numpy_dtype_to_asdf_datatype(dtype.subdtype[0], override_byteorder=override_byteorder)

elif dtype.name in _datatype_names:
return dtype.name, numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return dtype.name, numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.name == 'bool':
return 'bool8', numpy_byteorder_to_asdf_byteorder(dtype.byteorder)
return 'bool8', numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder)

elif dtype.name.startswith('string') or dtype.name.startswith('bytes'):
return ['ascii', dtype.itemsize], 'big'

elif dtype.name.startswith('unicode') or dtype.name.startswith('str'):
return (['ucs4', int(dtype.itemsize / 4)],
numpy_byteorder_to_asdf_byteorder(dtype.byteorder))
numpy_byteorder_to_asdf_byteorder(dtype.byteorder, override=override_byteorder))

raise ValueError("Unknown dtype {0}".format(dtype))

@@ -408,32 +411,61 @@ def reserve_blocks(cls, data, ctx):

@classmethod
def to_tree(cls, data, ctx):
# The ndarray-1.0.0 schema does not permit 0 valued strides.
# Perhaps we'll want to allow this someday, to efficiently
# represent an array of all the same value.
Comment on lines +414 to +416

Contributor:
Isn't that invalid in ndarray-1.0.0? What about ndarray-1.1.0? I.e. test_example2.

Contributor Author:
We haven't created ndarray-1.1.0 yet, have we? I don't see it in asdf-standard...

Contributor:
Doh! Nevermind.

I guess this would be a candidate feature for 1.1.0. =)
if any(stride == 0 for stride in data.strides):
data = np.ascontiguousarray(data)
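A sketch of the zero-stride case being normalized here, using NumPy broadcasting:

import numpy as np

broadcast = np.broadcast_to(3.14, (4, 4))  # one shared element, strides (0, 0)
assert broadcast.strides == (0, 0)

# ndarray-1.0.0 cannot represent stride 0, so materialize a real buffer.
contiguous = np.ascontiguousarray(broadcast)
assert contiguous.strides == (32, 8)       # ordinary C-contiguous float64 strides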

base = util.get_array_base(data)
shape = data.shape
dtype = data.dtype
offset = data.ctypes.data - base.ctypes.data

if data.flags.c_contiguous:
block = ctx.blocks.find_or_create_block_for_array(data, ctx)

if block.array_storage == "fits":
# Views over arrays stored in FITS files have some idiosyncrasies.
# astropy.io.fits always writes arrays C-contiguous with big-endian
# byte order, whereas asdf preserves the "contiguity" and byte order
# of the base array.
if (block.data.shape != data.shape or
block.data.dtype.itemsize != data.dtype.itemsize or
block.data.ctypes.data != data.ctypes.data or
block.data.strides != data.strides):
raise ValueError(
"ASDF has only limited support for serializing views over arrays stored "
"in FITS HDUs. This error likely means that a slice of such an array "
"was found in the ASDF tree. The slice can be decoupled from the FITS "
"array by calling copy() before assigning it to the tree."
)

offset = 0
strides = None
dtype, byteorder = numpy_dtype_to_asdf_datatype(
data.dtype,
include_byteorder=(block.array_storage != "inline"),
override_byteorder="big",
)
else:
strides = data.strides
base = util.get_array_base(data)
# Compute the offset relative to the base array and not the
# block data, in case the block is compressed.
offset = data.ctypes.data - base.ctypes.data

block = ctx.blocks.find_or_create_block_for_array(data, ctx)
if data.flags.c_contiguous:
strides = None
else:
strides = data.strides

dtype, byteorder = numpy_dtype_to_asdf_datatype(
data.dtype,
include_byteorder=(block.array_storage != "inline"),
)

result = {}

result['shape'] = list(shape)
if block.array_storage == 'streamed':
result['shape'][0] = '*'

dtype, byteorder = numpy_dtype_to_asdf_datatype(
dtype, include_byteorder=(block.array_storage != 'inline'))

byteorder = block.override_byteorder(byteorder)

if block.array_storage == 'inline':
listdata = numpy_array_to_list(data)
result['data'] = listdata
Expand Down