1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

22 changes: 2 additions & 20 deletions vortex-array/src/arrays/primitive/compute/take/portable.rs
@@ -33,10 +33,6 @@ use crate::validity::Validity;
 
 pub(super) struct TakeKernelPortableSimd;
 
-// SIMD types larger than the SIMD register size are beneficial for
-// performance as this leads to better instruction level parallelism.
-const SIMD_WIDTH: usize = 64;
-
 impl TakeImpl for TakeKernelPortableSimd {
     fn take(
         &self,
@@ -47,7 +43,7 @@ impl TakeImpl for TakeKernelPortableSimd {
         if array.ptype() == PType::F16 {
             // Special handling for f16 to treat as opaque u16
             let decoded = match_each_unsigned_integer_ptype!(unsigned_indices.ptype(), |C| {
-                portable::take_portable_simd::<u16, C, SIMD_WIDTH>(
+                portable::take_portable_simd::<u16, C, { portable::SIMD_WIDTH }>(
                     array.reinterpret_cast(PType::U16).as_slice(),
                     unsigned_indices.as_slice(),
                 )
@@ -58,7 +54,7 @@ impl TakeImpl for TakeKernelPortableSimd {
         } else {
             match_each_unsigned_integer_ptype!(unsigned_indices.ptype(), |C| {
                 match_each_native_simd_ptype!(array.ptype(), |V| {
-                    let decoded = portable::take_portable_simd::<V, C, SIMD_WIDTH>(
+                    let decoded = portable::take_portable_simd::<V, C, { portable::SIMD_WIDTH }>(
                         array.as_slice(),
                         unsigned_indices.as_slice(),
                     );
@@ -68,17 +64,3 @@ impl TakeImpl for TakeKernelPortableSimd {
         }
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::take_portable_simd;
-
-    #[test]
-    fn test_take_out_of_bounds() {
-        let indices = vec![2_000_000u32; 64];
-        let values = vec![1i32];
-
-        let result = take_portable_simd::<u32, i32, 64>(&indices, &values);
-        assert_eq!(result.as_slice(), [0i32; 64]);
-    }
-}
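
Worth noting: the braces in `{ portable::SIMD_WIDTH }` are required because a const-generic argument that is a multi-segment path must be written as a const block; only literals and single identifiers may appear bare. An illustrative sketch, not part of this diff (hypothetical names):

mod portable {
    pub const SIMD_WIDTH: usize = 64;
}

fn first_n<const N: usize>(xs: &[i32]) -> [i32; N] {
    let mut out = [0i32; N];
    out.copy_from_slice(&xs[..N]);
    out
}

fn demo(xs: &[i32]) -> [i32; 64] {
    // `first_n::<portable::SIMD_WIDTH>(xs)` is rejected by the compiler; wrapping
    // the path in braces turns it into a const expression it accepts.
    first_n::<{ portable::SIMD_WIDTH }>(xs)
}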
1 change: 1 addition & 0 deletions vortex-compute/Cargo.toml
@@ -29,6 +29,7 @@ vortex-vector = { workspace = true }
 arrow-array = { workspace = true, optional = true }
 arrow-buffer = { workspace = true, optional = true }
 arrow-schema = { workspace = true, optional = true }
+half = { workspace = true }
 log = { workspace = true }
 multiversion = { workspace = true }
 num-traits = { workspace = true }
1 change: 1 addition & 0 deletions vortex-compute/src/lib.rs
@@ -3,6 +3,7 @@
 
 //! A collection of compute functions primarily for operating over Vortex vectors.
 
+#![cfg_attr(vortex_nightly, feature(portable_simd))]
 #![deny(missing_docs)]
 #![deny(clippy::missing_panics_doc)]
 #![deny(clippy::missing_safety_doc)]
1 change: 1 addition & 0 deletions vortex-compute/src/take/slice/mod.rs
@@ -30,6 +30,7 @@ impl<T: NativePType, I: UnsignedPType> Take<[I]> for &[T] {
             }
         }
 
+        #[allow(unreachable_code, reason = "`vortex_nightly` path returns early")]
         take_scalar(self, indices)
     }
 }
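
For context, the attribute is needed because the nightly-only SIMD path above returns before the scalar fallback is reached. An illustrative sketch of that pattern, not part of this diff (hypothetical names):

// Hypothetical stand-in for the nightly portable-SIMD implementation.
#[cfg(vortex_nightly)]
fn take_simd<T: Copy>(values: &[T], indices: &[usize]) -> Vec<T> {
    indices.iter().map(|&i| values[i]).collect()
}

fn take_example<T: Copy>(values: &[T], indices: &[usize]) -> Vec<T> {
    #[cfg(vortex_nightly)]
    {
        // On nightly builds this early return makes everything below unreachable.
        return take_simd(values, indices);
    }

    #[allow(unreachable_code, reason = "`vortex_nightly` path returns early")]
    indices.iter().map(|&i| values[i]).collect()
}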
173 changes: 162 additions & 11 deletions vortex-compute/src/take/slice/portable.rs
@@ -6,6 +6,7 @@
 #![cfg(vortex_nightly)]
 
 use std::mem::MaybeUninit;
+use std::mem::size_of;
 use std::mem::transmute;
 use std::simd;
 use std::simd::num::SimdUint;
@@ -17,27 +17,55 @@ use vortex_buffer::BufferMut;
 use vortex_dtype::NativePType;
 use vortex_dtype::PType;
 use vortex_dtype::UnsignedPType;
+use vortex_dtype::match_each_native_simd_ptype;
+use vortex_dtype::match_each_unsigned_integer_ptype;
 
 /// SIMD types larger than the SIMD register size are beneficial for
 /// performance as this leads to better instruction level parallelism.
 pub const SIMD_WIDTH: usize = 64;
 
 /// Takes the specified indices into a new [`Buffer`] using portable SIMD.
 ///
+/// This function handles the type matching required to satisfy `SimdElement` bounds.
+/// For `f16` values, it reinterprets them as `u16` since `f16` doesn't implement `SimdElement`.
 #[inline]
-pub fn take_portable<T, I>(buffer: &[T], indices: &[I]) -> Buffer<T>
-where
-    T: NativePType + simd::SimdElement,
-    I: UnsignedPType + simd::SimdElement,
-{
+pub fn take_portable<T: NativePType, I: UnsignedPType>(buffer: &[T], indices: &[I]) -> Buffer<T> {
     if T::PTYPE == PType::F16 {
+        assert_eq!(size_of::<half::f16>(), size_of::<T>());

         // Since Rust does not actually support 16-bit floats, we first reinterpret the data as
         // `u16` integers.
         // SAFETY: We know that f16 has the same bit pattern as u16, so this transmute is fine to
         // make.
         let u16_slice: &[u16] =
             unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const u16, buffer.len()) };
+        return take_with_indices(u16_slice, indices).cast_into::<T>();
+    }

-        let taken_u16 = take_portable_simd::<u16, I, SIMD_WIDTH>(u16_slice, indices);
-        let taken_f16 = taken_u16.cast_into::<T>();
+    match_each_native_simd_ptype!(T::PTYPE, |TC| {
+        assert_eq!(size_of::<TC>(), size_of::<T>());

-        taken_f16
-    } else {
-        take_portable_simd::<T, I, SIMD_WIDTH>(buffer, indices)
-    }
+        // SAFETY: This is essentially a no-op that tricks the compiler into adding the
+        // `simd::SimdElement` bound we need to call `take_with_indices`.
+        let buffer: &[TC] =
+            unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const TC, buffer.len()) };
+        take_with_indices(buffer, indices).cast_into::<T>()
+    })
 }
 
+/// Helper that matches on index type and calls `take_portable_simd`.
+///
+/// We separate this code out from above to add the [`simd::SimdElement`] constraint.
+#[inline]
+fn take_with_indices<T: NativePType + simd::SimdElement, I: UnsignedPType>(
+    buffer: &[T],
+    indices: &[I],
+) -> Buffer<T> {
+    match_each_unsigned_integer_ptype!(I::PTYPE, |IC| {
+        let indices: &[IC] =
+            unsafe { std::slice::from_raw_parts(indices.as_ptr() as *const IC, indices.len()) };
+        take_portable_simd::<T, IC, SIMD_WIDTH>(buffer, indices)
+    })
+}
 
 /// Takes elements from an array using SIMD indexing.
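
To illustrate the trick used above (illustrative sketch, not part of this diff): `f16` does not implement `SimdElement`, so the buffer is viewed as opaque `u16` bit patterns, gathered as integers, and only reinterpreted back to `f16` at the end. The sketch assumes the `half` crate, and `bits_take` is a hypothetical stand-in for the real SIMD gather:

use half::f16;

// Stand-in for the SIMD gather; a scalar gather keeps the sketch runnable on stable.
fn bits_take(values: &[u16], indices: &[usize]) -> Vec<u16> {
    indices.iter().map(|&i| values[i]).collect()
}

// Gather `f16` values by treating them as opaque `u16` bit patterns.
fn take_f16_via_u16(values: &[f16], indices: &[usize]) -> Vec<f16> {
    // SAFETY: f16 and u16 have identical size and alignment, and the bits are
    // only moved around, never interpreted as numbers.
    let bits: &[u16] =
        unsafe { std::slice::from_raw_parts(values.as_ptr() as *const u16, values.len()) };

    bits_take(bits, indices)
        .into_iter()
        .map(f16::from_bits) // reinterpret the gathered bits back into f16
        .collect()
}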
@@ -110,4 +139,126 @@ mod tests {
let result = take_portable_simd::<i32, u32, 64>(&values, &indices);
assert_eq!(result.as_slice(), [0i32; 64]);
}

/// Tests SIMD gather with a mix of sequential, strided, and repeated indices. This exercises
/// irregular access patterns that stress the gather operation.
#[test]
fn test_take_mixed_access_patterns() {
// Create a values array with distinct elements.
let values: Vec<i64> = (0..256).map(|i| i * 100).collect();

// Build indices with mixed patterns:
// - Sequential access (0, 1, 2, ...)
// - Strided access (0, 4, 8, ...)
// - Repeated indices (same index multiple times)
// - Reverse order
let mut indices: Vec<u32> = Vec::with_capacity(200);

// Sequential: indices 0..64.
indices.extend(0u32..64);
// Strided by 4: 0, 4, 8, ..., 252.
indices.extend((0u32..64).map(|i| i * 4));
// Repeated: index 42 repeated 32 times.
indices.extend(std::iter::repeat(42u32).take(32));
// Reverse: 255, 254, ..., 216.
indices.extend((216u32..256).rev());

let result = take_portable_simd::<i64, u32, 64>(&values, &indices);
let result_slice = result.as_slice();

// Verify sequential portion.
for i in 0..64 {
assert_eq!(result_slice[i], (i as i64) * 100, "sequential at index {i}");
}

// Verify strided portion.
for i in 0..64 {
assert_eq!(
result_slice[64 + i],
(i as i64) * 4 * 100,
"strided at index {i}"
);
}

// Verify repeated portion.
for i in 0..32 {
assert_eq!(result_slice[128 + i], 42 * 100, "repeated at index {i}");
}

// Verify reverse portion.
for i in 0..40 {
assert_eq!(
result_slice[160 + i],
(255 - i as i64) * 100,
"reverse at index {i}"
);
}
}

/// Tests that the scalar remainder path works correctly when the number of indices is not
/// evenly divisible by the SIMD lane count.
#[test]
fn test_take_with_remainder() {
let values: Vec<u16> = (0..1000).collect();

// Use 64 + 37 = 101 indices to test both the SIMD loop (64 elements) and the scalar
// remainder (37 elements).
let indices: Vec<u8> = (0u8..101).collect();

let result = take_portable_simd::<u16, u8, 64>(&values, &indices);
let result_slice = result.as_slice();

assert_eq!(result_slice.len(), 101);

// Verify all elements.
for i in 0..101 {
assert_eq!(result_slice[i], i as u16, "mismatch at index {i}");
}

// Also test with exactly 1 remainder element.
let indices_one_remainder: Vec<u8> = (0u8..65).collect();
let result_one = take_portable_simd::<u16, u8, 64>(&values, &indices_one_remainder);
assert_eq!(result_one.as_slice().len(), 65);
assert_eq!(result_one.as_slice()[64], 64);
}

/// Tests gather with large 64-bit values and various index types to ensure no truncation
/// occurs during the operation.
#[test]
fn test_take_large_values_no_truncation() {
// Create values near the edges of i64 range.
let values: Vec<i64> = vec![
i64::MIN,
i64::MIN + 1,
-1_000_000_000_000i64,
-1,
0,
1,
1_000_000_000_000i64,
i64::MAX - 1,
i64::MAX,
];

// Indices that access each value multiple times in different orders.
let indices: Vec<u16> = vec![
0, 8, 1, 7, 2, 6, 3, 5, 4, // Forward-backward interleaved.
8, 8, 8, 0, 0, 0, // Repeated extremes.
4, 4, 4, 4, 4, 4, 4, 4, // Repeated zero.
0, 1, 2, 3, 4, 5, 6, 7, 8, // Sequential.
8, 7, 6, 5, 4, 3, 2, 1, 0, // Reverse.
// Pad to 64 to ensure we hit the SIMD path.
0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3,
];

let result = take_portable_simd::<i64, u16, 64>(&values, &indices);
let result_slice = result.as_slice();

// Verify each result matches the expected value.
for (i, &idx) in indices.iter().enumerate() {
assert_eq!(
result_slice[i], values[idx as usize],
"mismatch at position {i} for index {idx}"
);
}
}
}
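
`take_portable_simd` itself is unchanged and not shown in this diff. For readers unfamiliar with the API, a rough sketch of the kind of gather these tests exercise, on a nightly toolchain with `portable_simd` (an assumed shape, not the crate's actual implementation; names and signatures follow recent nightlies and may drift): `gather_or_default` zero-fills out-of-bounds lanes, which matches the behavior `test_take_out_of_bounds` expects.

#![feature(portable_simd)] // nightly-only

use std::simd::num::SimdUint;
use std::simd::{LaneCount, Simd, SupportedLaneCount};

// Gather one chunk of `N` lanes; out-of-bounds indices produce `0`.
fn gather_chunk<const N: usize>(values: &[i64], indices: &[u32; N]) -> [i64; N]
where
    LaneCount<N>: SupportedLaneCount,
{
    // Widen the indices to usize lanes, then gather; lanes whose index is out
    // of bounds fall back to the default value (zero).
    let idx = Simd::<u32, N>::from_array(*indices).cast::<usize>();
    Simd::<i64, N>::gather_or_default(values, idx).to_array()
}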