Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify Validation/Alignment APIs of ArrayDataBuilder: validate and align #6966

Merged
merged 5 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 100 additions & 34 deletions arrow-data/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,12 @@ impl ArrayData {
offset,
buffers,
child_data,
align_buffers: false,
// SAFETY: caller responsible for ensuring data is valid
skip_validation: true,
}
.build_unchecked()
.build()
.unwrap()
}

/// Create a new ArrayData, validating that the provided buffers form a valid
Expand Down Expand Up @@ -1775,7 +1779,7 @@ impl PartialEq for ArrayData {
}
}

/// Builder for `ArrayData` type
/// Builder for [`ArrayData`] type
#[derive(Debug)]
pub struct ArrayDataBuilder {
data_type: DataType,
Expand All @@ -1786,6 +1790,20 @@ pub struct ArrayDataBuilder {
offset: usize,
buffers: Vec<Buffer>,
child_data: Vec<ArrayData>,
/// Should buffers be realigned (copying if necessary)?
///
/// Defaults to false.
align_buffers: bool,
/// Should data validation be skipped for this [`ArrayData`]?
///
/// Defaults to false.
///
/// # Safety
///
/// This flag can only be set to true using `unsafe` APIs. However, once true
/// subsequent calls to `build()` may result in undefined behavior if the data
/// is not valid.
skip_validation: bool,
}

impl ArrayDataBuilder {
Expand All @@ -1801,6 +1819,8 @@ impl ArrayDataBuilder {
offset: 0,
buffers: vec![],
child_data: vec![],
align_buffers: false,
skip_validation: false,
}
}

Expand Down Expand Up @@ -1877,51 +1897,79 @@ impl ArrayDataBuilder {

/// Creates an array data, without any validation
///
/// Note: This is shorthand for `self.with_skip_validation(true).build()`
///
/// # Safety
///
/// The same caveats as [`ArrayData::new_unchecked`]
/// apply.
#[allow(clippy::let_and_return)]
pub unsafe fn build_unchecked(self) -> ArrayData {
let data = self.build_impl();
// Provide a force_validate mode
#[cfg(feature = "force_validate")]
data.validate_data().unwrap();
data
self.skip_validation(true).build().unwrap()
}

/// Same as [`Self::build_unchecked`] but ignoring `force_validate` feature flag
unsafe fn build_impl(self) -> ArrayData {
let nulls = self
.nulls
/// Creates an `ArrayData`, consuming `self`
///
/// # Safety
///
/// By default the underlying buffers are checked to ensure they are valid
/// Arrow data. However, if the [`Self::skip_validation`] flag has been set
/// to true (by the `unsafe` API) this validation is skipped. If the data is
/// not valid, undefined behavior will result.
pub fn build(self) -> Result<ArrayData, ArrowError> {
let Self {
data_type,
len,
null_count,
null_bit_buffer,
nulls,
offset,
buffers,
child_data,
align_buffers,
skip_validation,
} = self;

let nulls = nulls
.or_else(|| {
let buffer = self.null_bit_buffer?;
let buffer = BooleanBuffer::new(buffer, self.offset, self.len);
Some(match self.null_count {
Some(n) => NullBuffer::new_unchecked(buffer, n),
let buffer = null_bit_buffer?;
let buffer = BooleanBuffer::new(buffer, offset, len);
Some(match null_count {
Some(n) => {
// SAFETY: call to `data.validate_data()` below validates the null buffer is valid
unsafe { NullBuffer::new_unchecked(buffer, n) }
}
None => NullBuffer::new(buffer),
})
})
.filter(|b| b.null_count() != 0);

ArrayData {
data_type: self.data_type,
len: self.len,
offset: self.offset,
buffers: self.buffers,
child_data: self.child_data,
let mut data = ArrayData {
data_type,
len,
offset,
buffers,
child_data,
nulls,
};

if align_buffers {
data.align_buffers();
}
}

/// Creates an array data, validating all inputs
pub fn build(self) -> Result<ArrayData, ArrowError> {
let data = unsafe { self.build_impl() };
data.validate_data()?;
// SAFETY: `skip_validation` is only set to true using `unsafe` APIs
if !skip_validation || cfg!(feature = "force_validate") {
data.validate_data()?;
}
Ok(data)
}

/// Creates an array data, validating all inputs, and aligning any buffers
#[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
self.align_buffers(true).build()
}

/// Ensure that all buffers are aligned, copying data if necessary
///
/// Rust requires that arrays are aligned to their corresponding primitive,
/// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
Expand All @@ -1930,17 +1978,33 @@ impl ArrayDataBuilder {
/// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
///
/// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
/// this method is provided to automatically copy buffers to a new correctly aligned allocation
/// this flag is provided to automatically copy buffers to a new correctly aligned allocation
/// when necessary, making it useful when interacting with buffers produced by other systems,
/// e.g. IPC or FFI.
///
/// This is unlike `[Self::build`] which will instead return an error on encountering
/// If this flag is not enabled, `[Self::build`] return an error on encountering
/// insufficiently aligned buffers.
pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
let mut data = unsafe { self.build_impl() };
data.align_buffers();
data.validate_data()?;
Ok(data)
pub fn align_buffers(mut self, align_buffers: bool) -> Self {
self.align_buffers = align_buffers;
self
}

/// Skips validation of the data.
///
/// If this flag is enabled, `[Self::build`] will skip validation of the
/// data
///
/// If this flag is not enabled, `[Self::build`] will validate that all
/// buffers are valid and will return an error if any data is invalid.
/// Validation can be expensive.
///
/// # Safety
///
/// If validation is skipped, the buffers must form a valid Arrow array,
/// otherwise undefined behavior will result
pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
self.skip_validation = skip_validation;
self
}
}

Expand All @@ -1955,6 +2019,8 @@ impl From<ArrayData> for ArrayDataBuilder {
nulls: d.nulls,
null_bit_buffer: None,
null_count: None,
align_buffers: false,
skip_validation: false,
}
}
}
Expand Down
48 changes: 14 additions & 34 deletions arrow-ipc/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,17 +177,13 @@ fn create_array(
let values = create_array(reader, values_field, variadic_counts, require_alignment)?;

let run_array_length = run_node.length() as usize;
let builder = ArrayData::builder(data_type.clone())
let array_data = ArrayData::builder(data_type.clone())
.len(run_array_length)
.offset(0)
.add_child_data(run_ends.into_data())
.add_child_data(values.into_data());

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
.add_child_data(values.into_data())
.align_buffers(!require_alignment)
.build()?;

Ok(make_array(array_data))
}
Expand Down Expand Up @@ -257,15 +253,11 @@ fn create_array(
)));
}

let builder = ArrayData::builder(data_type.clone())
let array_data = ArrayData::builder(data_type.clone())
.len(length as usize)
.offset(0);

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
.offset(0)
.align_buffers(!require_alignment)
.build()?;

// no buffer increases
Ok(Arc::new(NullArray::from(array_data)))
Expand Down Expand Up @@ -311,11 +303,7 @@ fn create_primitive_array(
t => unreachable!("Data type {:?} either unsupported or not primitive", t),
};

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
let array_data = builder.align_buffers(!require_alignment).build()?;

Ok(make_array(array_data))
}
Expand Down Expand Up @@ -347,11 +335,7 @@ fn create_list_array(
_ => unreachable!("Cannot create list or map array from {:?}", data_type),
};

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
let array_data = builder.align_buffers(!require_alignment).build()?;

Ok(make_array(array_data))
}
Expand All @@ -367,17 +351,13 @@ fn create_dictionary_array(
) -> Result<ArrayRef, ArrowError> {
if let Dictionary(_, _) = *data_type {
let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone());
let builder = ArrayData::builder(data_type.clone())
let array_data = ArrayData::builder(data_type.clone())
.len(field_node.length() as usize)
.add_buffer(buffers[1].clone())
.add_child_data(value_array.into_data())
.null_bit_buffer(null_buffer);

let array_data = if require_alignment {
builder.build()?
} else {
builder.build_aligned()?
};
.null_bit_buffer(null_buffer)
.align_buffers(!require_alignment)
.build()?;

Ok(make_array(array_data))
} else {
Expand Down
Loading