Skip to content

Commit 680e107

Browse files
perf[gpu]: fused AoT FoR and bitpacking kernel (#4872)
I have only implemented a PoC fused FoR-BP kernel; I don't want to implement them all since there will be a lot of duplication. I think we likely need to compile these at runtime. I have also fixed up the kernels build system. Fused is fast ``` gpu_for_bp_fused_decompress_kernel_only/u32/1GB time: [5.5376 ms 5.5410 ms 5.5443 ms] thrpt: [180.37 GiB/s 180.47 GiB/s 180.58 GiB/s] ``` Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk> --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent ad2cfb6 commit 680e107

File tree

12 files changed

+458
-30
lines changed

12 files changed

+458
-30
lines changed

fls-gpu-kernel-gen/src/bit_unpack.rs

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -101,17 +101,7 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
101101
writeln!(output, "#include <cuda.h>")?;
102102
writeln!(output, "#include <cuda_runtime.h>")?;
103103
writeln!(output, "#include <stdint.h>")?;
104-
writeln!(output)?;
105-
106-
writeln!(
107-
output,
108-
"__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};"
109-
)?;
110-
writeln!(
111-
output,
112-
"#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
113-
)?;
114-
writeln!(output, "#define MASK(T, width) (((T)1 << width) - 1)")?;
104+
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
115105
writeln!(output)?;
116106

117107
for bit_width in 0..=<T>::T {
@@ -123,10 +113,10 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
123113
}
124114

125115
pub fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> anyhow::Result<()> {
126-
let filename = format!("fls_{}_bit_unpack.cu", T::T);
127-
let path = output_dir.join(&filename);
128-
let mut file = File::create(&path)?;
129-
let mut writer = IndentedWriter::new(&mut file);
130-
generate_unpack_for_width::<T, _>(&mut writer, thread_count)?;
116+
let cu_filename = format!("gen/fls_{}_bit_unpack.cu", T::T);
117+
let cu_path = output_dir.join(&cu_filename);
118+
let mut cu_file = File::create(&cu_path)?;
119+
let mut cu_writer = IndentedWriter::new(&mut cu_file);
120+
generate_unpack_for_width::<T, _>(&mut cu_writer, thread_count)?;
131121
Ok(())
132122
}

vortex-gpu/.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
kernels/fls*
1+
kernels/gen/*
22
*.ptx

vortex-gpu/benches/gpu_bitunpack.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use vortex_buffer::BufferMut;
1515
use vortex_dtype::NativePType;
1616
use vortex_error::VortexUnwrap;
1717
use vortex_fastlanes::{BitPackedArray, FoRArray};
18-
use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_unpack_timed};
18+
use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed};
1919

2020
// Data sizes: 1GB, 2.5GB, 5GB, 10GB
2121
// These are approximate sizes in bytes, accounting for bit-packing compression
@@ -124,6 +124,37 @@ fn benchmark_gpu_for_decompress_kernel_only(c: &mut Criterion) {
124124
group.finish();
125125
}
126126

127+
fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
128+
let mut group = c.benchmark_group("gpu_for_bp_fused_decompress_kernel_only");
129+
130+
group.sample_size(10);
131+
132+
for (len, label) in DATA_SIZES {
133+
let len = len.next_multiple_of(1024);
134+
let array = make_for_bitpackable_array(len);
135+
136+
let ctx = CudaContext::new(0).unwrap();
137+
ctx.set_blocking_synchronize().unwrap();
138+
let ctx = Arc::new(ctx);
139+
140+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
141+
group.bench_with_input(BenchmarkId::new("u32", label), &array, |b, array| {
142+
b.iter_custom(|iters| {
143+
let mut total_time = Duration::ZERO;
144+
for _ in 0..iters {
145+
// This only measures kernel execution time, not memory transfers
146+
let (_result, kernel_time) =
147+
cuda_for_bp_unpack_timed(array, Arc::clone(&ctx)).unwrap();
148+
total_time += kernel_time;
149+
}
150+
total_time
151+
});
152+
});
153+
}
154+
155+
group.finish();
156+
}
157+
127158
#[allow(dead_code)]
128159
fn benchmark_cpu_canonicalize(c: &mut Criterion) {
129160
let mut group = c.benchmark_group("cpu_canonicalize");
@@ -145,5 +176,6 @@ criterion_group!(
145176
benches,
146177
benchmark_gpu_decompress_kernel_only,
147178
benchmark_gpu_for_decompress_kernel_only,
179+
benchmark_gpu_for_bp_fused_decompress_kernel_only
148180
);
149181
criterion_main!(benches);

vortex-gpu/build.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,21 +37,23 @@ fn main() -> anyhow::Result<()> {
3737

3838
println!("cargo:rerun-if-changed={}", generator_dir.to_str().unwrap());
3939

40-
for entry in WalkDir::new(kernels_dir).into_iter().flatten() {
40+
for entry in WalkDir::new(&kernels_dir).into_iter().flatten() {
4141
if entry.path().extension().is_some_and(|ext| ext == "cu") {
4242
println!("cargo:rerun-if-changed={}", entry.path().display());
43-
nvcc_compile_ptx(entry.path())?;
43+
nvcc_compile_ptx(kernels_dir.as_path(), entry.path())?;
4444
}
4545
}
4646

4747
Ok(())
4848
}
4949

50-
fn nvcc_compile_ptx(cu_path: &Path) -> anyhow::Result<()> {
50+
fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> anyhow::Result<()> {
5151
let res = Command::new("nvcc")
5252
.arg("-arch=sm_80")
5353
.arg("--restrict")
5454
.arg("--ptx")
55+
.arg("--include-path")
56+
.arg(kernel_dir)
5557
.arg("-c")
5658
.arg(cu_path)
5759
.arg("-o")
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
// Auto-generated by fls-gpu-kernel-gen. Do not edit by hand!
5+
// Common FastLanes definitions shared across multiple kernels
6+
7+
#ifndef GEN_FASTLANES_COMMON_CUH
8+
#define GEN_FASTLANES_COMMON_CUH
9+
10+
#include <stdint.h>
11+
12+
// FastLanes ordering array
13+
__device__ int FL_ORDER[] = {0, 4, 2, 6, 1, 5, 3, 7};
14+
15+
// Compute the index in the FastLanes layout
16+
#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)
17+
18+
// Create a mask with 'width' bits set
19+
#define MASK(T, width) (((T)1 << width) - 1)
20+
21+
#endif // GEN_FASTLANES_COMMON_CUH

vortex-gpu/kernels/for.cu

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,36 @@
55
#include <cuda_runtime.h>
66
#include <stdint.h>
77

8+
// Device function template (callable from device code)
9+
template<typename ValueT>
10+
__device__ void for_device(
11+
ValueT *__restrict values_in_out,
12+
ValueT reference,
13+
int thread_idx
14+
) {
15+
auto i = thread_idx;
16+
const uint32_t thread_ops = blockDim.x;
17+
18+
for (auto j = 0; j < thread_ops; j++) {
19+
auto idx = i * thread_ops + j;
20+
values_in_out[idx] = values_in_out[idx] + reference;
21+
}
22+
}
23+
24+
// Kernel wrapper template (callable from host)
825
template<typename ValueT>
926
__device__ void for_(
1027
ValueT *__restrict values_in_out_array,
1128
ValueT reference
1229
) {
1330
auto i = threadIdx.x;
14-
auto block_offset = (blockIdx.x * 1024);
31+
const uint32_t fl_lane_count = 32;
32+
auto blockSize = blockDim.x * fl_lane_count;
33+
auto block_size = 1024;
34+
auto block_offset = (blockIdx.x * block_size);
1535

1636
auto values_in_out = values_in_out_array + block_offset;
17-
18-
const int thread_ops = 32;
19-
20-
for (auto j = 0; j < thread_ops; j++) {
21-
auto idx = i * thread_ops + j;
22-
values_in_out[idx] = values_in_out[idx] + reference;
23-
}
37+
for_device(values_in_out, reference, i);
2438
}
2539

2640
// Macro to generate the extern "C" wrapper for each type combination

vortex-gpu/kernels/for.cuh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
// Frame-of-Reference kernel declarations
5+
6+
#ifndef FOR_CUH
7+
#define FOR_CUH
8+
9+
#include <stdint.h>
10+
11+
// Device function template (callable from other kernels)
12+
template<typename ValueT>
13+
__device__ __forceinline__ void for_device(
14+
ValueT *__restrict values_in_out,
15+
ValueT reference,
16+
int thread_idx
17+
);
18+
19+
// Kernel functions (callable from host)
20+
extern "C" __global__ void for_vu8(uint8_t *__restrict values, uint8_t reference);
21+
extern "C" __global__ void for_vu16(uint16_t *__restrict values, uint16_t reference);
22+
extern "C" __global__ void for_vu32(uint32_t *__restrict values, uint32_t reference);
23+
extern "C" __global__ void for_vu64(uint64_t *__restrict values, uint64_t reference);
24+
25+
extern "C" __global__ void for_vi8(int8_t *__restrict values, int8_t reference);
26+
extern "C" __global__ void for_vi16(int16_t *__restrict values, int16_t reference);
27+
extern "C" __global__ void for_vi32(int32_t *__restrict values, int32_t reference);
28+
extern "C" __global__ void for_vi64(int64_t *__restrict values, int64_t reference);
29+
30+
#endif // FOR_CUH
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
// Fused kernel combining FastLanes bitpacking unpack with Frame-of-Reference addition
5+
// This avoids an intermediate memory write/read by fusing the operations
6+
7+
#include <cuda.h>
8+
#include <cuda_runtime.h>
9+
#include <stdint.h>
10+
#include "fastlanes_common.cuh"
11+
12+
13+
__device__ void fls_unpack_6bw_32ow_device(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) {
14+
int i = thread_idx;
15+
uint32_t src;
16+
uint32_t tmp;
17+
18+
src = in[i * 1 + 0];
19+
tmp = (src >> 0) & MASK(uint32_t, 6);
20+
out[INDEX(0, (i * 1 + 0))] = tmp;
21+
tmp = (src >> 6) & MASK(uint32_t, 6);
22+
out[INDEX(1, (i * 1 + 0))] = tmp;
23+
tmp = (src >> 12) & MASK(uint32_t, 6);
24+
out[INDEX(2, (i * 1 + 0))] = tmp;
25+
tmp = (src >> 18) & MASK(uint32_t, 6);
26+
out[INDEX(3, (i * 1 + 0))] = tmp;
27+
tmp = (src >> 24) & MASK(uint32_t, 6);
28+
out[INDEX(4, (i * 1 + 0))] = tmp;
29+
tmp = (src >> 30) & MASK(uint32_t, 2);
30+
src = in[i * 1 + 0 + 32 * 1];
31+
tmp |= (src & MASK(uint32_t, 4)) << 2;
32+
out[INDEX(5, (i * 1 + 0))] = tmp;
33+
tmp = (src >> 4) & MASK(uint32_t, 6);
34+
out[INDEX(6, (i * 1 + 0))] = tmp;
35+
tmp = (src >> 10) & MASK(uint32_t, 6);
36+
out[INDEX(7, (i * 1 + 0))] = tmp;
37+
tmp = (src >> 16) & MASK(uint32_t, 6);
38+
out[INDEX(8, (i * 1 + 0))] = tmp;
39+
tmp = (src >> 22) & MASK(uint32_t, 6);
40+
out[INDEX(9, (i * 1 + 0))] = tmp;
41+
tmp = (src >> 28) & MASK(uint32_t, 4);
42+
src = in[i * 1 + 0 + 32 * 2];
43+
tmp |= (src & MASK(uint32_t, 2)) << 4;
44+
out[INDEX(10, (i * 1 + 0))] = tmp;
45+
tmp = (src >> 2) & MASK(uint32_t, 6);
46+
out[INDEX(11, (i * 1 + 0))] = tmp;
47+
tmp = (src >> 8) & MASK(uint32_t, 6);
48+
out[INDEX(12, (i * 1 + 0))] = tmp;
49+
tmp = (src >> 14) & MASK(uint32_t, 6);
50+
out[INDEX(13, (i * 1 + 0))] = tmp;
51+
tmp = (src >> 20) & MASK(uint32_t, 6);
52+
out[INDEX(14, (i * 1 + 0))] = tmp;
53+
tmp = (src >> 26) & MASK(uint32_t, 6);
54+
src = in[i * 1 + 0 + 32 * 3];
55+
tmp |= (src & MASK(uint32_t, 0)) << 6;
56+
out[INDEX(15, (i * 1 + 0))] = tmp;
57+
tmp = (src >> 0) & MASK(uint32_t, 6);
58+
out[INDEX(16, (i * 1 + 0))] = tmp;
59+
tmp = (src >> 6) & MASK(uint32_t, 6);
60+
out[INDEX(17, (i * 1 + 0))] = tmp;
61+
tmp = (src >> 12) & MASK(uint32_t, 6);
62+
out[INDEX(18, (i * 1 + 0))] = tmp;
63+
tmp = (src >> 18) & MASK(uint32_t, 6);
64+
out[INDEX(19, (i * 1 + 0))] = tmp;
65+
tmp = (src >> 24) & MASK(uint32_t, 6);
66+
out[INDEX(20, (i * 1 + 0))] = tmp;
67+
tmp = (src >> 30) & MASK(uint32_t, 2);
68+
src = in[i * 1 + 0 + 32 * 4];
69+
tmp |= (src & MASK(uint32_t, 4)) << 2;
70+
out[INDEX(21, (i * 1 + 0))] = tmp;
71+
tmp = (src >> 4) & MASK(uint32_t, 6);
72+
out[INDEX(22, (i * 1 + 0))] = tmp;
73+
tmp = (src >> 10) & MASK(uint32_t, 6);
74+
out[INDEX(23, (i * 1 + 0))] = tmp;
75+
tmp = (src >> 16) & MASK(uint32_t, 6);
76+
out[INDEX(24, (i * 1 + 0))] = tmp;
77+
tmp = (src >> 22) & MASK(uint32_t, 6);
78+
out[INDEX(25, (i * 1 + 0))] = tmp;
79+
tmp = (src >> 28) & MASK(uint32_t, 4);
80+
src = in[i * 1 + 0 + 32 * 5];
81+
tmp |= (src & MASK(uint32_t, 2)) << 4;
82+
out[INDEX(26, (i * 1 + 0))] = tmp;
83+
tmp = (src >> 2) & MASK(uint32_t, 6);
84+
out[INDEX(27, (i * 1 + 0))] = tmp;
85+
tmp = (src >> 8) & MASK(uint32_t, 6);
86+
out[INDEX(28, (i * 1 + 0))] = tmp;
87+
tmp = (src >> 14) & MASK(uint32_t, 6);
88+
out[INDEX(29, (i * 1 + 0))] = tmp;
89+
tmp = (src >> 20) & MASK(uint32_t, 6);
90+
out[INDEX(30, (i * 1 + 0))] = tmp;
91+
tmp = (src >> 26) & MASK(uint32_t, 6);
92+
out[INDEX(31, (i * 1 + 0))] = tmp;
93+
}
94+
95+
// Device function template (callable from device code)
96+
template<typename ValueT>
97+
__device__ __forceinline__ void for_device(
98+
ValueT *__restrict values_in_out,
99+
ValueT reference,
100+
int thread_idx
101+
) {
102+
auto i = thread_idx;
103+
const int thread_ops = blockDim.x;
104+
105+
for (auto j = 0; j < thread_ops; j++) {
106+
auto idx = INDEX(j, i);
107+
values_in_out[idx] = values_in_out[idx] + reference;
108+
}
109+
}
110+
111+
112+
// Fused kernel: bitpack unpack (6bw) + FoR addition in one pass
113+
// This eliminates the intermediate write-to-memory and read-from-memory
114+
// by keeping unpacked values in registers/L1 cache and immediately adding the reference
115+
extern "C" __global__ void fused_bitpack6_for_u32(
116+
const uint32_t *__restrict packed_in,
117+
uint32_t *__restrict unpacked_out,
118+
uint32_t reference
119+
) {
120+
int i = threadIdx.x;
121+
auto in = packed_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t)));
122+
const uint32_t fl_lane_count = 32;
123+
auto blockSize = blockDim.x * fl_lane_count;
124+
auto out = unpacked_out + (blockIdx.x * 1024);
125+
126+
__shared__ uint32_t shared_data[1024];
127+
128+
fls_unpack_6bw_32ow_device(in, shared_data, i);
129+
130+
for_device(shared_data, reference, i);
131+
132+
for (int i = 0; i < 32; i++) {
133+
auto idx = i * 32 + threadIdx.x;
134+
out[idx] = shared_data[idx];
135+
}
136+
}

vortex-gpu/kernels/gen/.gitkeep

Whitespace-only changes.

vortex-gpu/src/bit_unpack.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ fn cuda_bit_unpack_kernel(
5252
}
5353
let module = ctx
5454
.load_module(Ptx::from_file(format!(
55-
"kernels/fls_{}_bit_unpack.ptx",
55+
"kernels/gen/fls_{}_bit_unpack.ptx",
5656
kernel_id.output_bit_width
5757
)))
5858
.map_err(|e| vortex_err!("Failed to load kernel module: {e}"))?;

0 commit comments

Comments
 (0)