From 06bcf8d495770e0016ca221e1669d40c5ca00556 Mon Sep 17 00:00:00 2001 From: Luca Versari Date: Tue, 9 Jul 2024 00:55:26 +0200 Subject: [PATCH] Unsafe improvements: arrow/{array_reader,buffer}. --- parquet/src/arrow/array_reader/fixed_len_byte_array.rs | 6 ++++++ parquet/src/arrow/array_reader/fixed_size_list_array.rs | 1 + parquet/src/arrow/array_reader/list_array.rs | 1 + parquet/src/arrow/array_reader/map_array.rs | 2 +- parquet/src/arrow/array_reader/primitive_array.rs | 5 +++++ parquet/src/arrow/array_reader/struct_array.rs | 1 + parquet/src/arrow/buffer/dictionary_buffer.rs | 4 ++++ parquet/src/arrow/buffer/offset_buffer.rs | 7 ++++++- 8 files changed, 25 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs index a9159bb47125..42a04f0b372c 100644 --- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs +++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs @@ -160,6 +160,12 @@ impl ArrayReader for FixedLenByteArrayReader { .add_buffer(Buffer::from_vec(record_data.buffer)) .null_bit_buffer(self.record_reader.consume_bitmap_buffer()); + // SAFETY: Assuming that a RecordReader produces a valid FixedLenByteArrayBuffer (as + // otherwise that code itself would be unsound), this call is safe if self.byte_length + // matches the byte length of the FixedLenByteArrayBuffer produced by the RecordReader + // matches the byte length passed as a type parameter to ArrayDataBuilder. + // FIXME: how do we know that the byte lengths match? What if record_data.byte_length + // is None? let binary = FixedSizeBinaryArray::from(unsafe { array_data.build_unchecked() }); // TODO: An improvement might be to do this conversion on read diff --git a/parquet/src/arrow/array_reader/fixed_size_list_array.rs b/parquet/src/arrow/array_reader/fixed_size_list_array.rs index 4cf68a06601c..96ddcc5dc456 100644 --- a/parquet/src/arrow/array_reader/fixed_size_list_array.rs +++ b/parquet/src/arrow/array_reader/fixed_size_list_array.rs @@ -202,6 +202,7 @@ impl ArrayReader for FixedSizeListArrayReader { list_builder = list_builder.null_bit_buffer(Some(builder.into())); } + // SAFETY: FIXME: document why this call is safe. let list_data = unsafe { list_builder.build_unchecked() }; let result_array = FixedSizeListArray::from(list_data); diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 7c66c5c23112..8425fc615a60 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -225,6 +225,7 @@ impl ArrayReader for ListArrayReader { data_builder = data_builder.null_bit_buffer(Some(builder.into())) } + // SAFETY: FIXME: document why this call is safe. let list_data = unsafe { data_builder.build_unchecked() }; let result_array = GenericListArray::::from(list_data); diff --git a/parquet/src/arrow/array_reader/map_array.rs b/parquet/src/arrow/array_reader/map_array.rs index 9bfc047322a7..a98afd2a4b3e 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -99,7 +99,7 @@ impl ArrayReader for MapArrayReader { let data = array.to_data(); let builder = data.into_builder().data_type(self.data_type.clone()); - // SAFETY - we can assume that ListArrayReader produces valid ListArray + // SAFETY: we can assume that ListArrayReader produces valid ListArray // of the expected type, and as such its output can be reinterpreted as // a MapArray without validation Ok(Arc::new(MapArray::from(unsafe { diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 07ecc27d9b0b..4f268eb18f7a 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -179,6 +179,11 @@ where .add_buffer(record_data) .null_bit_buffer(self.record_reader.consume_bitmap_buffer()); + // SAFETY: Assuming that the RecordReader's null buffer is sufficiently sized for the + // corresponding Vec buffer (or None), this is safe, as we have a single buffer + // obtained from a vector of `T`s, which must therefore have a valid memory representation + // for T. The type mapping above ensures that a valid T results in a valid Arrow array of + // type `arrow_data_type`. let array_data = unsafe { array_data.build_unchecked() }; let array: ArrayRef = match T::get_physical_type() { PhysicalType::BOOLEAN => Arc::new(BooleanArray::from(array_data)), diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 4af194774bfb..c8252d137bca 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -173,6 +173,7 @@ impl ArrayReader for StructArrayReader { array_data_builder.null_bit_buffer(Some(bitmap_builder.into())); } + // SAFETY: FIXME: document why this call is safe. let array_data = unsafe { array_data_builder.build_unchecked() }; Ok(Arc::new(StructArray::from(array_data))) } diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs index 59f1cfa056a1..fc4818b20d1b 100644 --- a/parquet/src/arrow/buffer/dictionary_buffer.rs +++ b/parquet/src/arrow/buffer/dictionary_buffer.rs @@ -162,6 +162,10 @@ impl DictionaryBuffer { let data = match cfg!(debug_assertions) { true => builder.build().unwrap(), + // SAFETY: FIXME: this is unsound. data_type is passed by the caller without + // any validation, which might result in creating invalid arrays, and this is a + // safe function which cannot have preconditions. Similar considerations apply + // to null_buffer. false => unsafe { builder.build_unchecked() }, }; diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 806f144d9666..2b4572f3b44a 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -146,6 +146,10 @@ impl OffsetBuffer { let data = match cfg!(debug_assertions) { true => array_data_builder.build().unwrap(), + // SAFETY: FIXME: this is unsound. data_type is passed by the caller without + // any validation, which might result in creating invalid arrays, and this is a + // safe function which cannot have preconditions. Similar considerations apply + // to null_buffer. false => unsafe { array_data_builder.build_unchecked() }, }; @@ -164,7 +168,8 @@ impl OffsetBuffer { let len = (end - start).to_usize().unwrap(); if len != 0 { - // Safety: (1) the buffer is valid (2) the offsets are valid (3) the values in between are of ByteViewType + // SAFETY: (1) the buffer is valid (2) the offsets are valid (3) the values in between are of ByteViewType. + // This follows from `self` being itself valid. unsafe { builder.append_view_unchecked(block, start.as_usize() as u32, len as u32); }