vortex-data · robert3005 · Jan 28, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/REUSE.toml b/REUSE.toml
@@ -35,3 +35,9 @@ path = ["**/.gitignore", ".gitmodules", ".python-version", "**/*.lock", "**/*.lo
 precedence = "override"
 SPDX-FileCopyrightText = "Copyright the Vortex contributors"
 SPDX-License-Identifier = "Apache-2.0"
+
+[[annotations]]
+path = ["vortex-cuda/kernels/bit_unpack_*"]
+precedence = "override"
+SPDX-FileCopyrightText = "Copyright the Vortex contributors"
+SPDX-License-Identifier = "Apache-2.0"
diff --git a/encodings/fastlanes/src/bitpacking/array/mod.rs b/encodings/fastlanes/src/bitpacking/array/mod.rs
@@ -23,6 +23,15 @@ use crate::bitpack_compress::bitpack_encode;
 use crate::unpack_iter::BitPacked;
 use crate::unpack_iter::BitUnpackedChunks;
 
+pub struct BitPackedArrayParts {
+    pub offset: u16,
+    pub bit_width: u8,
+    pub len: usize,
+    pub packed: BufferHandle,
+    pub patches: Option<Patches>,
+    pub validity: Validity,
+}
+
 #[derive(Clone, Debug)]
 pub struct BitPackedArray {
     /// The offset within the first block (created with a slice).
@@ -275,6 +284,17 @@ impl BitPackedArray {
     pub fn max_packed_value(&self) -> usize {
         (1 << self.bit_width()) - 1
     }
+
+    pub fn into_parts(self) -> BitPackedArrayParts {
+        BitPackedArrayParts {
+            offset: self.offset,
+            bit_width: self.bit_width,
+            len: self.len,
+            packed: self.packed,
+            patches: self.patches,
+            validity: self.validity,
+        }
+    }
 }
 
 #[cfg(test)]

diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs
@@ -3,6 +3,7 @@
 
 mod array;
 pub use array::BitPackedArray;
+pub use array::BitPackedArrayParts;
 pub use array::bitpack_compress;
 pub use array::bitpack_decompress;
 pub use array::unpack_iter;

diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
@@ -23,6 +23,7 @@ _test-harness = []
 [dependencies]
 async-trait = { workspace = true }
 cudarc = { workspace = true }
+fastlanes = { workspace = true }
 futures = { workspace = true, features = ["executor"] }
 kanal = { workspace = true }
 tracing = { workspace = true }
@@ -52,6 +53,7 @@ vortex-dtype = { workspace = true, features = ["cudarc"] }
 vortex-scalar = { workspace = true }
 
 [build-dependencies]
+fastlanes = { workspace = true }
 
 [[bench]]
 name = "for_cuda"

diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
@@ -6,9 +6,18 @@
 #![allow(clippy::use_debug)]
 
 use std::env;
+use std::fs::File;
+use std::io;
 use std::path::Path;
 use std::process::Command;
 
+use fastlanes::FastLanes;
+
+use crate::cuda_kernel_generator::IndentedWriter;
+use crate::cuda_kernel_generator::generate_cuda_unpack_for_width;
+
+pub mod cuda_kernel_generator;
+
 fn main() {
     let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
     let kernels_dir = Path::new(&manifest_dir).join("kernels");
@@ -22,12 +31,17 @@ fn main() {
         kernels_dir.display()
     );
 
+    println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());
+
+    generate_unpack::<u8>(&kernels_dir, 32).expect("Failed to generate unpack for u8");
+    generate_unpack::<u16>(&kernels_dir, 32).expect("Failed to generate unpack for u16");
+    generate_unpack::<u32>(&kernels_dir, 32).expect("Failed to generate unpack for u32");
+    generate_unpack::<u64>(&kernels_dir, 16).expect("Failed to generate unpack for u64");
+
     if !is_cuda_available() {
         return;
     }
 
-    println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());
-
     if let Ok(entries) = std::fs::read_dir(&kernels_dir) {
         for path in entries.flatten().map(|entry| entry.path()) {
             match path.extension().and_then(|e| e.to_str()) {
@@ -47,7 +61,13 @@ fn main() {
     }
 }
 
-fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> std::io::Result<()> {
+fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<()> {
+    let mut cu_file = File::create(output_dir.join(format!("bit_unpack_{}.cu", T::T)))?;
+    let mut cu_writer = IndentedWriter::new(&mut cu_file);
+    generate_cuda_unpack_for_width::<T, _>(&mut cu_writer, thread_count)
+}
+
+fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> io::Result<()> {
     // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
     let profile = env::var("PROFILE").unwrap();
 
@@ -114,7 +134,7 @@ fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> std::io::Result<()> {
             }
         }
 
-        return Err(std::io::Error::other(format!(
+        return Err(io::Error::other(format!(
             "nvcc compilation failed for {}",
             cu_path.display()
         )));

diff --git a/vortex-cuda/cuda_kernel_generator/indent.rs b/vortex-cuda/cuda_kernel_generator/indent.rs
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::fmt;
+use std::io;
+use std::io::Write;
+
+pub struct IndentedWriter<W: Write> {
+    write: W,
+    indent: String,
+}
+
+impl<W: Write> IndentedWriter<W> {
+    pub fn new(write: W) -> Self {
+        Self {
+            write,
+            indent: String::new(),
+        }
+    }
+
+    /// # Errors
+    ///
+    /// Will return Err if writing to the underlying writer fails.
+    pub fn indent<F>(&mut self, indented: F) -> io::Result<()>
+    where
+        F: FnOnce(&mut IndentedWriter<W>) -> io::Result<()>,
+    {
+        let original_ident = self.indent.clone();
+        self.indent += "    ";
+        let res = indented(self);
+        self.indent = original_ident;
+        res
+    }
+
+    /// # Errors
+    ///
+    /// Will return Err if writing to the underlying writer fails.
+    pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> io::Result<()> {
+        write!(self.write, "{}{}", self.indent, fmt)
+    }
+}
diff --git a/vortex-cuda/cuda_kernel_generator/mod.rs b/vortex-cuda/cuda_kernel_generator/mod.rs
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+mod indent;
+
+use std::io;
+use std::io::Write;
+
+use fastlanes::FastLanes;
+pub use indent::IndentedWriter;
+
+fn generate_lane_decoder<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    bit_width: usize,
+) -> io::Result<()> {
+    let bits = <T>::T;
+    let lanes = T::LANES;
+
+    let func_name = format!("bit_unpack_{bits}_{bit_width}bw_lane");
+
+    writeln!(
+        output,
+        "__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
+    )?;
+
+    output.indent(|output| {
+        writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
+        if bit_width == 0 {
+            writeln!(output, "uint{bits}_t zero = 0ULL;")?;
+            writeln!(output)?;
+            for row in 0..bits {
+                writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
+            }
+        } else if bit_width == bits {
+            writeln!(output)?;
+            for row in 0..bits {
+                writeln!(
+                    output,
+                    "out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
+                )?;
+            }
+        } else {
+            writeln!(output, "uint{bits}_t src;")?;
+            writeln!(output, "uint{bits}_t tmp;")?;
+
+            writeln!(output)?;
+            writeln!(output, "src = in[lane];")?;
+            for row in 0..bits {
+                let curr_word = (row * bit_width) / bits;
+                let next_word = ((row + 1) * bit_width) / bits;
+                let shift = (row * bit_width) % bits;
+
+                if next_word > curr_word {
+                    let remaining_bits = ((row + 1) * bit_width) % bits;
+                    let current_bits = bit_width - remaining_bits;
+                    writeln!(
+                        output,
+                        "tmp = (src >> {shift}) & MASK(uint{bits}_t, {current_bits});"
+                    )?;
+
+                    if next_word < bit_width {
+                        writeln!(output, "src = in[lane + LANE_COUNT * {next_word}];")?;
+                        writeln!(
+                            output,
+                            "tmp |= (src & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
+                        )?;
+                    }
+                } else {
+                    writeln!(
+                        output,
+                        "tmp = (src >> {shift}) & MASK(uint{bits}_t, {bit_width});"
+                    )?;
+                }
+
+                writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
+            }
+        }
+        Ok(())
+    })?;
+
+    writeln!(output, "}}")
+}
+
+fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    bit_width: usize,
+    thread_count: usize,
+) -> io::Result<()> {
+    let bits = <T>::T;
+    let lanes = T::LANES;
+    let per_thread_loop_count = lanes / thread_count;
+
+    let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
+
+    let local_func_params = format!(
+        "(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
+    );
+
+    writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
+
+    output.indent(|output| {
+        for thread_lane in 0..per_thread_loop_count {
+            writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
+        }
+        Ok(())
+    })?;
+
+    writeln!(output, "}}")
+}
+
+fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    bit_width: usize,
+    thread_count: usize,
+) -> io::Result<()> {
+    let bits = <T>::T;
+
+    let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
+    let func_params =
+        format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)");
+
+    writeln!(
+        output,
+        "extern \"C\" __global__ void {func_name}{func_params} {{"
+    )?;
+
+    output.indent(|output| {
+        writeln!(output, "int thread_idx = threadIdx.x;")?;
+        writeln!(
+            output,
+            "auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"
+        )?;
+        writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;
+
+        writeln!(output, "_{func_name}(in, out, thread_idx);")
+    })?;
+
+    writeln!(output, "}}")
+}
+
+/// # Errors
+///
+/// Will return Err if writing to the underlying writer fails.
+pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
+    output: &mut IndentedWriter<W>,
+    thread_count: usize,
+) -> io::Result<()> {
+    writeln!(output, "// AUTO-GENERATED. Do not edit by hand!")?;
+    writeln!(output, "#include <cuda.h>")?;
+    writeln!(output, "#include <cuda_runtime.h>")?;
+    writeln!(output, "#include <stdint.h>")?;
+    writeln!(output, "#include \"fastlanes_common.cuh\"")?;
+    writeln!(output)?;
+
+    for bit_width in 0..=<T>::T {
+        generate_lane_decoder::<T, _>(output, bit_width)?;
+        writeln!(output)?;
+        generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
+        writeln!(output)?;
+
+        generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
+        writeln!(output)?;
+    }
+
+    Ok(())
+}