vortex-data · 0ax1 · Feb 3, 2026 · Feb 2, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs
@@ -409,8 +409,8 @@ impl RunEndArray {
     #[inline]
     pub fn into_parts(self) -> RunEndArrayParts {
         RunEndArrayParts {
-            values: self.values,
             ends: self.ends,
+            values: self.values,
         }
     }
 }

diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
@@ -41,6 +41,8 @@ vortex-fastlanes = { workspace = true }
 vortex-io = { workspace = true }
 vortex-mask = { workspace = true }
 vortex-nvcomp = { path = "nvcomp" }
+vortex-runend = { workspace = true }
+vortex-scalar = { workspace = true }
 vortex-sequence = { workspace = true }
 vortex-session = { workspace = true }
 vortex-utils = { workspace = true }
@@ -75,3 +77,7 @@ harness = false
 [[bench]]
 name = "filter_cuda"
 harness = false
+
+[[bench]]
+name = "runend_cuda"
+harness = false
diff --git a/vortex-cuda/benches/runend_cuda.rs b/vortex-cuda/benches/runend_cuda.rs
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! CUDA benchmarks for run-end decoding.
+
+#![allow(clippy::unwrap_used)]
+#![allow(clippy::cast_possible_truncation)]
+
+use std::mem::size_of;
+use std::time::Duration;
+
+use criterion::BenchmarkId;
+use criterion::Criterion;
+use criterion::Throughput;
+use cudarc::driver::DeviceRepr;
+use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
+use futures::executor::block_on;
+use vortex_array::IntoArray;
+use vortex_array::ToCanonical;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_cuda::CudaBufferExt;
+use vortex_cuda::CudaExecutionCtx;
+use vortex_cuda::CudaSession;
+use vortex_cuda_macros::cuda_available;
+use vortex_cuda_macros::cuda_not_available;
+use vortex_dtype::NativePType;
+use vortex_dtype::PType;
+use vortex_error::VortexExpect;
+use vortex_runend::RunEndArray;
+use vortex_session::VortexSession;
+
+/// Builds a synthetic run-end encoded array for benchmarking.
+///
+/// The decoded array has `output_len` elements split into `output_len.div_ceil(avg_run_len)`
+/// runs. Every run is `avg_run_len` long except possibly the last one, which is cut off at
+/// `output_len`. Run `i` holds the value `i % 256` converted to `T`; run ends are stored as
+/// `u64`. Both child arrays are non-nullable.
+///
+/// # Panics
+///
+/// Panics if `avg_run_len` is zero (division by zero inside `div_ceil`).
+fn make_runend_array_typed<T>(output_len: usize, avg_run_len: usize) -> RunEndArray
+where
+    T: NativePType + From<u8>,
+{
+    let run_count = output_len.div_ceil(avg_run_len);
+
+    // The n-th run (1-based) ends at `n * avg_run_len`, clamped so the final run stops exactly
+    // at `output_len`.
+    let run_ends: Vec<u64> = (1..=run_count)
+        .map(|nth| (nth * avg_run_len).min(output_len) as u64)
+        .collect();
+
+    // Cycle through the 256 possible `u8` values so neighbouring runs differ.
+    let run_values: Vec<T> = (0..run_count)
+        .map(|run| <T as From<u8>>::from((run % 256) as u8))
+        .collect();
+
+    let ends = PrimitiveArray::new(Buffer::from(run_ends), Validity::NonNullable);
+    let values = PrimitiveArray::new(Buffer::from(run_values), Validity::NonNullable);
+    RunEndArray::new(ends.into_array(), values.into_array())
+}
+
+/// Runs the `runend` CUDA decode kernel once over `runend_array` and reports its GPU time.
+///
+/// The ends and values children are converted to primitive arrays and copied to the device,
+/// together with a default-initialized output buffer of `runend_array.len()` elements. Ends are
+/// read as `u64` and values as `T`, matching the `ptypes` handed to the launch macro.
+///
+/// Returns the duration reported by the events that the launch macro produces.
+///
+/// # Panics
+///
+/// Panics if a host-to-device copy or the creation of a device view fails.
+fn launch_runend_kernel_timed_typed<T>(
+    runend_array: &RunEndArray,
+    cuda_ctx: &mut CudaExecutionCtx,
+) -> vortex_error::VortexResult<Duration>
+where
+    T: NativePType + DeviceRepr,
+{
+    let host_ends = runend_array.ends().to_primitive();
+    let host_values = runend_array.values().to_primitive();
+
+    // Scalar kernel arguments.
+    let num_runs = host_ends.len();
+    let offset = runend_array.offset();
+    let output_len = runend_array.len();
+
+    // Upload the inputs, then a default-filled buffer the kernel writes its result into.
+    let pending_ends = cuda_ctx
+        .copy_to_device(host_ends.as_slice::<u64>().to_vec())
+        .unwrap();
+    let ends_device = block_on(pending_ends).vortex_expect("failed to copy ends to device");
+
+    let pending_values = cuda_ctx
+        .copy_to_device(host_values.as_slice::<T>().to_vec())
+        .unwrap();
+    let values_device = block_on(pending_values).vortex_expect("failed to copy values to device");
+
+    let pending_output = cuda_ctx
+        .copy_to_device(vec![T::default(); output_len])
+        .unwrap();
+    let output_device = block_on(pending_output).vortex_expect("failed to allocate output buffer");
+
+    let d_ends = ends_device
+        .cuda_view::<u64>()
+        .vortex_expect("failed to get ends view");
+    let d_values = values_device
+        .cuda_view::<T>()
+        .vortex_expect("failed to get values view");
+    let d_output = output_device
+        .cuda_view::<T>()
+        .vortex_expect("failed to get output view");
+
+    // The argument order must match the kernel signature:
+    // (ends, num_runs, values, offset, output_len, output).
+    let timing = vortex_cuda::launch_cuda_kernel!(
+        execution_ctx: cuda_ctx,
+        module: "runend",
+        ptypes: &[T::PTYPE, PType::U64],
+        launch_args: [d_ends, num_runs, d_values, offset, output_len, d_output],
+        event_recording: CU_EVENT_BLOCKING_SYNC,
+        array_len: output_len
+    );
+
+    timing.duration()
+}
+
+/// Registers the run-end decode benchmarks for values of type `T` in the `runend_cuda` group.
+///
+/// Every combination of decoded length (1M, 10M, 100M elements) and run length
+/// (10, 100, 1000, 10000, 100000) is benchmarked. `type_name` only appears in the benchmark id.
+/// The time reported to criterion is the sum of the per-launch durations returned by
+/// `launch_runend_kernel_timed_typed`, not the wall-clock time of the closure.
+fn benchmark_runend_typed<T>(c: &mut Criterion, type_name: &str)
+where
+    T: NativePType + DeviceRepr + From<u8>,
+{
+    const OUTPUT_LENS: [(usize, &str); 3] = [
+        (1_000_000, "1M"),
+        (10_000_000, "10M"),
+        (100_000_000, "100M"),
+    ];
+    const RUN_LENS: [usize; 5] = [10, 100, 1000, 10000, 100000];
+
+    let mut group = c.benchmark_group("runend_cuda");
+    group.sample_size(10);
+
+    for (len, len_str) in OUTPUT_LENS {
+        // Throughput is expressed in decoded output bytes.
+        let decoded_bytes = len * size_of::<T>();
+        group.throughput(Throughput::Bytes(decoded_bytes as u64));
+
+        for run_len in RUN_LENS {
+            let encoded = make_runend_array_typed::<T>(len, run_len);
+            let id = BenchmarkId::new(
+                "runend",
+                format!("{len_str}_{type_name}_runlen_{run_len}"),
+            );
+
+            group.bench_with_input(id, &encoded, |bencher, input| {
+                bencher.iter_custom(|iters| {
+                    // One execution context per measurement batch, shared by all launches.
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+                        .vortex_expect("failed to create execution context");
+
+                    (0..iters)
+                        .map(|_| {
+                            launch_runend_kernel_timed_typed::<T>(input, &mut cuda_ctx)
+                                .vortex_expect("kernel launch failed")
+                        })
+                        .sum::<Duration>()
+                });
+            });
+        }
+    }
+
+    group.finish();
+}
+
+/// Criterion entry point: benchmarks run-end decoding with varying run lengths (currently only for `i32` values).
+fn benchmark_runend(c: &mut Criterion) {
+    benchmark_runend_typed::<i32>(c, "i32");
+}
+
+criterion::criterion_group!(benches, benchmark_runend);
+
+#[cuda_available]
+criterion::criterion_main!(benches);
+
+#[cuda_not_available]
+fn main() {}
diff --git a/vortex-cuda/kernels/src/runend.cu b/vortex-cuda/kernels/src/runend.cu
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <stdint.h>
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+
+#include "config.cuh"
+#include "types.cuh"
+
+constexpr uint32_t MAX_CACHED_RUNS = 512;
+
+/// Returns the index of the first element in `data[0, len)` that is strictly
+/// greater than `value`, or `len` if no element is.
+///
+/// `data` has to be sorted in ascending order for the binary search to be
+/// meaningful.
+///
+/// The search delegates to `thrust::upper_bound` with the `thrust::seq`
+/// execution policy, so it runs sequentially on the calling GPU thread.
+/// `thrust::device` is avoided because it would spawn an additional kernel
+/// launch.
+/// See: https://nvidia.github.io/cccl/thrust/api/group__binary__search_1gac85cc9ea00f4bdd8f80ad25fff16741d.html#thrust-upper-bound
+template<typename T>
+__device__ __forceinline__ uint64_t upper_bound(const T *data, uint64_t len, uint64_t value) {
+    const T *const data_end = data + len;
+    const T *const first_greater = thrust::upper_bound(thrust::seq, data, data_end, value);
+    return first_greater - data;
+}
+
+
+// Decodes run-end encoded data on the GPU.
+//
+// Run-end encoding stores (value, end_position) pairs: run `i` covers the positions from
+// the end of the previous run (0 for the first run) up to, but excluding, `ends[i]`.
+//
+// Parameters:
+// - ends / num_runs: end position of every run (sorted ascending) and the number of runs.
+// - values: one value per run.
+// - offset: added to every output index before it is looked up in `ends`.
+// - output_len / output: number of elements to decode and the buffer they are written to.
+//
+// Steps:
+// 1. Each CUDA block processes a contiguous chunk of output elements (elements_per_block).
+//
+// 2. Block Initialization (Thread 0 only):
+//    - Compute the global position range [block_start + offset, block_end + offset) for this block
+//    - Use binary search (upper_bound) to find the first and last runs that overlap this range
+//    - Clamp both run indices to the last valid run, so that positions at or past the final
+//      end map to the last run instead of indexing one past the end of `ends` / `values`
+//    - Store the run range in shared memory (block_first_run, block_num_runs)
+//
+// 3. Shared Memory Caching:
+//    - If the number of runs for this block fits in shared memory (< MAX_CACHED_RUNS),
+//      all threads cooperatively load the relevant ends[] and values[] into shared memory
+//    - This is to reduce global memory access during decoding
+//
+// 4. Decoding:
+//    a) Cached path: Each thread decodes multiple elements using a forward scan.
+//       Since thread positions are strided (idx += blockDim.x), and positions are monotonically
+//       increasing across iterations, we maintain a current_run index that only moves forward.
+//
+//    b) Fallback path: If too many runs span this block (reaches MAX_CACHED_RUNS),
+//       fall back to binary search in global memory for each element.
+//
+// TODO(0ax1): Investigate whether there are faster solutions.
+template<typename ValueT, typename EndsT>
+__device__ void runend_decode_kernel(
+    const EndsT *const __restrict ends,
+    uint64_t num_runs,
+    const ValueT *const __restrict values,
+    uint64_t offset,
+    uint64_t output_len,
+    ValueT *const __restrict output
+) {
+    __shared__ EndsT shared_ends[MAX_CACHED_RUNS];
+    __shared__ ValueT shared_values[MAX_CACHED_RUNS];
+    __shared__ uint64_t block_first_run;
+    __shared__ uint32_t block_num_runs;
+
+    const uint32_t elements_per_block = blockDim.x * ELEMENTS_PER_THREAD;
+    const uint64_t block_start = static_cast<uint64_t>(blockIdx.x) * elements_per_block;
+    const uint64_t block_end = min(block_start + elements_per_block, output_len);
+
+    // Both conditions are uniform across the block, so all of its threads return together
+    // and none is left waiting at a `__syncthreads()` below. Without any runs there is
+    // nothing to decode from, and `num_runs - 1` further down would wrap around.
+    if (block_start >= output_len || num_runs == 0) return;
+
+    // Thread 0 finds the run range for this block.
+    if (threadIdx.x == 0) {
+        const uint64_t first_pos = block_start + offset;
+        const uint64_t last_pos = (block_end - 1) + offset;
+        const uint64_t last_valid_run = num_runs - 1;
+
+        // `upper_bound` yields `num_runs` for a position at or past the final end. Clamp to
+        // the last run so the cooperative load below never reads `ends[num_runs]` or
+        // `values[num_runs]`; this mirrors the clamping done by both decode paths.
+        const uint64_t first_run = min(upper_bound(ends, num_runs, first_pos), last_valid_run);
+        const uint64_t last_run = min(upper_bound(ends, num_runs, last_pos), last_valid_run);
+
+        block_first_run = first_run;
+        // Clamped to MAX_CACHED_RUNS: reaching that value selects the fallback path.
+        block_num_runs = static_cast<uint32_t>(min(last_run - first_run + 1, static_cast<uint64_t>(MAX_CACHED_RUNS)));
+    }
+    __syncthreads();
+
+    // Cooperatively load ends and values into shared memory.
+    if (block_num_runs < MAX_CACHED_RUNS) {
+        for (uint32_t i = threadIdx.x; i < block_num_runs; i += blockDim.x) {
+            shared_ends[i] = ends[block_first_run + i];
+            shared_values[i] = values[block_first_run + i];
+        }
+    }
+    __syncthreads();
+
+    if (block_num_runs < MAX_CACHED_RUNS) {
+        uint32_t current_run = 0;
+        for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) {
+            uint64_t pos = idx + offset;
+
+            // Scan forward to find the run containing this position
+            while (current_run < block_num_runs && static_cast<uint64_t>(shared_ends[current_run]) <= pos) {
+                current_run++;
+            }
+
+            // Positions past the last cached end take the value of the last cached run.
+            output[idx] = shared_values[current_run < block_num_runs ? current_run : block_num_runs - 1];
+        }
+    } else {
+        // Fallback for blocks with very short runs. Search the full `num_runs`
+        // array. `block_num_runs` is clamped to `MAX_CACHED_RUNS`.
+        for (uint64_t idx = block_start + threadIdx.x; idx < block_end; idx += blockDim.x) {
+            uint64_t pos = idx + offset;
+            uint64_t run_idx = upper_bound(ends, num_runs, pos);
+            if (run_idx >= num_runs) run_idx = num_runs - 1;
+            output[idx] = values[run_idx];
+        }
+    }
+}
+
+// Emits one `extern "C"` kernel entry point named `runend_<value_tag>_<ends_tag>` that
+// forwards its arguments to `runend_decode_kernel<ValueT, EndsT>`.
+#define GENERATE_RUNEND_KERNEL(value_tag, ValueT, ends_tag, EndsT) \
+extern "C" __global__ void runend_##value_tag##_##ends_tag( \
+    const EndsT *const __restrict ends, \
+    uint64_t num_runs, \
+    const ValueT *const __restrict values, \
+    uint64_t offset, \
+    uint64_t output_len, \
+    ValueT *const __restrict output \
+) { \
+    runend_decode_kernel<ValueT, EndsT>(ends, num_runs, values, offset, output_len, output); \
+}
+
+// Emits the entry points for one value type combined with every unsigned ends type.
+#define GENERATE_RUNEND_KERNELS_FOR_VALUE(value_tag, ValueT) \
+    GENERATE_RUNEND_KERNEL(value_tag, ValueT, u8, uint8_t) \
+    GENERATE_RUNEND_KERNEL(value_tag, ValueT, u16, uint16_t) \
+    GENERATE_RUNEND_KERNEL(value_tag, ValueT, u32, uint32_t) \
+    GENERATE_RUNEND_KERNEL(value_tag, ValueT, u64, uint64_t)
+
+// Integer value types.
+GENERATE_RUNEND_KERNELS_FOR_VALUE(u8, uint8_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(i8, int8_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(u16, uint16_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(i16, int16_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(u32, uint32_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(i32, int32_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(u64, uint64_t)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(i64, int64_t)
+// Floating-point value types.
+GENERATE_RUNEND_KERNELS_FOR_VALUE(f16, __half)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(f32, float)
+GENERATE_RUNEND_KERNELS_FOR_VALUE(f64, double)
diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs
@@ -5,6 +5,7 @@ mod alp;
 mod bitpacked;
 mod decimal_byte_parts;
 mod for_;
+mod runend;
 mod sequence;
 mod zigzag;
 mod zstd;
@@ -13,6 +14,7 @@ pub use alp::ALPExecutor;
 pub use bitpacked::BitPackedExecutor;
 pub use decimal_byte_parts::DecimalBytePartsExecutor;
 pub use for_::FoRExecutor;
+pub use runend::RunEndExecutor;
 pub use sequence::SequenceExecutor;
 pub use zigzag::ZigZagExecutor;
 pub use zstd::ZstdExecutor;