Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 19 additions & 17 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::types::bytes::ByteArrayNativeType;
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
use arrow_schema::{ArrowError, DataType};
use core::str;
use num::ToPrimitive;
Expand Down Expand Up @@ -77,8 +77,9 @@ use super::ByteArrayType;
/// 0 31 63 95 127
/// ```
///
/// * Strings with length <= 12 are stored directly in the view. See
/// [`Self::inline_value`] to access the inlined prefix from a short view.
/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in
/// the view. See [`Self::inline_value`] to access the inlined prefix from a
/// short view.
///
/// * Strings with length > 12: The first four bytes are stored inline in the
/// view and the entire string is stored in one of the buffers. See [`ByteView`]
Expand Down Expand Up @@ -128,6 +129,7 @@ use super::ByteArrayType;
/// assert_eq!(value, "this string is also longer than 12 bytes");
/// ```
///
/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN
/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
///
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
Expand Down Expand Up @@ -316,7 +318,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
let v = self.views.get_unchecked(idx);
let len = *v as u32;
let b = if len <= 12 {
let b = if len <= MAX_INLINE_VIEW_LEN {
Self::inline_value(v, len as usize)
} else {
let view = ByteView::from(*v);
Expand All @@ -331,10 +333,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
///
/// # Safety
/// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
/// - The `len` must be the length of the inlined value. It should never be larger than 12.
/// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`].
#[inline(always)]
pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
debug_assert!(len <= 12);
debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize);
std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len)
}

Expand All @@ -347,7 +349,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
self.views.iter().map(move |v| {
let len = *v as u32;
if len <= 12 {
if len <= MAX_INLINE_VIEW_LEN {
unsafe { Self::inline_value(v, len as usize) }
} else {
let view = ByteView::from(*v);
Expand All @@ -371,7 +373,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
return &[] as &[u8];
}

if prefix_len <= 4 || len <= 12 {
if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN {
unsafe { StringViewArray::inline_value(v, prefix_len) }
} else {
let view = ByteView::from(*v);
Expand Down Expand Up @@ -401,7 +403,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
return &[] as &[u8];
}

if len <= 12 {
if len as u32 <= MAX_INLINE_VIEW_LEN {
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
} else {
let view = ByteView::from(*v);
Expand Down Expand Up @@ -495,9 +497,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
self.views()
.iter()
.map(|v| {
let len = (*v as u32) as usize;
if len > 12 {
len
let len = *v as u32;
if len > MAX_INLINE_VIEW_LEN {
len as usize
} else {
0
}
Expand All @@ -511,11 +513,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
///
/// ByteView types give us the following two advantages, and we need to be careful not to lose them:
/// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view.
/// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view.
/// Meaning that reading one array element requires only one memory access
/// (two memory access required for StringArray, one for offset buffer, the other for value buffer).
///
/// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
/// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
/// thanks to the inlined 4 bytes.
/// Consider equality check:
/// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
Expand All @@ -525,8 +527,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
///
/// # Order check flow
/// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view.
/// (2) if any of the string is larger than 12 bytes, we need to compare the full string.
/// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view.
/// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string.
/// (2.1) if the inlined 4 bytes are different, we can return the result immediately.
/// (2.2) o.w., we need to compare the full string.
///
Expand All @@ -544,7 +546,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
let r_view = right.views().get_unchecked(right_idx);
let r_len = *r_view as u32;

if l_len <= 12 && r_len <= 12 {
if l_len <= MAX_INLINE_VIEW_LEN && r_len <= MAX_INLINE_VIEW_LEN {
let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
return l_data.cmp(r_data);
Expand Down
14 changes: 7 additions & 7 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use std::marker::PhantomData;
use std::sync::Arc;

use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer};
use arrow_data::ByteView;
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
use arrow_schema::ArrowError;
use hashbrown::hash_table::Entry;
use hashbrown::HashTable;
Expand Down Expand Up @@ -68,8 +68,8 @@ impl BlockSizeGrowthStrategy {
///
/// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable
/// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`]
/// writes values larger than 12 bytes to the current in-progress block, with values smaller
/// than 12 bytes inlined into the views. If a value is appended that will not fit in the
/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller
/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the
/// in-progress block, it will be closed, and a new block of sufficient size allocated
///
/// # Append Views
Expand Down Expand Up @@ -114,7 +114,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
/// Set a fixed buffer size for variable length strings
///
/// The block size is the size of the buffer used to store values greater
/// than 12 bytes. The builder allocates new buffers when the current
/// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current
/// buffer is full.
///
/// By default the builder balances buffer size and buffer count by
Expand Down Expand Up @@ -221,7 +221,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
} else {
self.views_buffer.extend(array.views().iter().map(|v| {
let mut byte_view = ByteView::from(*v);
if byte_view.length > 12 {
if byte_view.length > MAX_INLINE_VIEW_LEN {
// Small views (<=12 bytes) are inlined, so only need to update large views
byte_view.buffer_index += starting_buffer;
};
Expand Down Expand Up @@ -289,7 +289,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
pub fn get_value(&self, index: usize) -> &[u8] {
let view = self.views_buffer.as_slice().get(index).unwrap();
let len = *view as u32;
if len <= 12 {
if len <= MAX_INLINE_VIEW_LEN {
// # Safety
// The view is valid from the builder
unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) }
Expand All @@ -315,7 +315,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
let v: &[u8] = value.as_ref().as_ref();
let length: u32 = v.len().try_into().unwrap();
if length <= 12 {
if length <= MAX_INLINE_VIEW_LEN {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
view_buffer[4..4 + v.len()].copy_from_slice(v);
Expand Down
18 changes: 13 additions & 5 deletions arrow-data/src/byte_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@
use arrow_buffer::Buffer;
use arrow_schema::ArrowError;

/// The maximum number of bytes that can be stored inline in a byte view.
///
/// See [`ByteView`] and [`GenericByteViewArray`] for more information on the
/// layout of the views.
///
/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
pub const MAX_INLINE_VIEW_LEN: u32 = 12;

/// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and
/// `BinaryViewArray`) where the length is greater than 12 bytes.
///
Expand Down Expand Up @@ -76,15 +84,15 @@ impl ByteView {
/// See example on [`ByteView`] docs
///
/// Notes:
/// * the length should always be greater than 12 (Data less than 12
/// bytes is stored as an inline view)
/// * the length should always be greater than [`MAX_INLINE_VIEW_LEN`]
/// (Data less than 12 bytes is stored as an inline view)
/// * buffer and offset are set to `0`
///
/// # Panics
/// If the prefix is not exactly 4 bytes
#[inline]
pub fn new(length: u32, prefix: &[u8]) -> Self {
debug_assert!(length > 12);
debug_assert!(length > MAX_INLINE_VIEW_LEN);
Self {
length,
prefix: u32::from_le_bytes(prefix.try_into().unwrap()),
Expand Down Expand Up @@ -159,8 +167,8 @@ where
{
for (idx, v) in views.iter().enumerate() {
let len = *v as u32;
if len <= 12 {
if len < 12 && (v >> (32 + len * 8)) != 0 {
if len <= MAX_INLINE_VIEW_LEN {
if len < MAX_INLINE_VIEW_LEN && (v >> (32 + len * 8)) != 0 {
return Err(ArrowError::InvalidArgumentError(format!(
"View at index {idx} contained non-zero padding for string of length {len}",
)));
Expand Down
10 changes: 5 additions & 5 deletions arrow-select/src/coalesce/byte_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use arrow_array::cast::AsArray;
use arrow_array::types::ByteViewType;
use arrow_array::{Array, ArrayRef, GenericByteViewArray};
use arrow_buffer::{Buffer, NullBufferBuilder};
use arrow_data::ByteView;
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
use arrow_schema::ArrowError;
use std::marker::PhantomData;
use std::sync::Arc;
Expand Down Expand Up @@ -125,7 +125,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
// If there are buffers, we need to update the buffer index
let updated_views = views.iter().map(|v| {
let mut byte_view = ByteView::from(*v);
if byte_view.length > 12 {
if byte_view.length > MAX_INLINE_VIEW_LEN {
// Small views (<=12 bytes) are inlined, so only need to update large views
byte_view.buffer_index += starting_buffer;
};
Expand Down Expand Up @@ -182,7 +182,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
if remaining_capacity < str_len as usize {
break;
}
if str_len > 12 {
if str_len > MAX_INLINE_VIEW_LEN {
remaining_capacity -= str_len as usize;
}
num_view_to_current += 1;
Expand Down Expand Up @@ -233,7 +233,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
.iter()
.filter_map(|v| {
let b = ByteView::from(*v);
if b.length > 12 {
if b.length > MAX_INLINE_VIEW_LEN {
Some(b.length as usize)
} else {
None
Expand All @@ -251,7 +251,7 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
// Copy the views, updating the buffer index and copying the data as needed
let new_views = views.iter().map(|v| {
let mut b: ByteView = ByteView::from(*v);
if b.length > 12 {
if b.length > MAX_INLINE_VIEW_LEN {
let buffer_index = b.buffer_index as usize;
let buffer_offset = b.offset as usize;
let str_len = b.length as usize;
Expand Down
Loading