Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions REUSE.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,9 @@ path = ["**/.gitignore", ".gitmodules", ".python-version", "**/*.lock", "**/*.lo
precedence = "override"
SPDX-FileCopyrightText = "Copyright the Vortex contributors"
SPDX-License-Identifier = "Apache-2.0"

[[annotations]]
path = ["vortex-cuda/kernels/bit_unpack_*"]
precedence = "override"
SPDX-FileCopyrightText = "Copyright the Vortex contributors"
SPDX-License-Identifier = "Apache-2.0"
20 changes: 20 additions & 0 deletions encodings/fastlanes/src/bitpacking/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ use crate::bitpack_compress::bitpack_encode;
use crate::unpack_iter::BitPacked;
use crate::unpack_iter::BitUnpackedChunks;

pub struct BitPackedArrayParts {
pub offset: u16,
pub bit_width: u8,
pub len: usize,
pub packed: BufferHandle,
pub patches: Option<Patches>,
pub validity: Validity,
}

#[derive(Clone, Debug)]
pub struct BitPackedArray {
/// The offset within the first block (created with a slice).
Expand Down Expand Up @@ -275,6 +284,17 @@ impl BitPackedArray {
pub fn max_packed_value(&self) -> usize {
(1 << self.bit_width()) - 1
}

pub fn into_parts(self) -> BitPackedArrayParts {
BitPackedArrayParts {
offset: self.offset,
bit_width: self.bit_width,
len: self.len,
packed: self.packed,
patches: self.patches,
validity: self.validity,
}
}
}

#[cfg(test)]
Expand Down
1 change: 1 addition & 0 deletions encodings/fastlanes/src/bitpacking/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

mod array;
pub use array::BitPackedArray;
pub use array::BitPackedArrayParts;
pub use array::bitpack_compress;
pub use array::bitpack_decompress;
pub use array::unpack_iter;
Expand Down
2 changes: 2 additions & 0 deletions vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ _test-harness = []
[dependencies]
async-trait = { workspace = true }
cudarc = { workspace = true }
fastlanes = { workspace = true }
futures = { workspace = true, features = ["executor"] }
kanal = { workspace = true }
tracing = { workspace = true }
Expand Down Expand Up @@ -52,6 +53,7 @@ vortex-dtype = { workspace = true, features = ["cudarc"] }
vortex-scalar = { workspace = true }

[build-dependencies]
fastlanes = { workspace = true }

[[bench]]
name = "for_cuda"
Expand Down
28 changes: 24 additions & 4 deletions vortex-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
#![allow(clippy::use_debug)]

use std::env;
use std::fs::File;
use std::io;
use std::path::Path;
use std::process::Command;

use fastlanes::FastLanes;

use crate::cuda_kernel_generator::IndentedWriter;
use crate::cuda_kernel_generator::generate_cuda_unpack_for_width;

pub mod cuda_kernel_generator;

fn main() {
let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("Failed to get manifest dir");
let kernels_dir = Path::new(&manifest_dir).join("kernels");
Expand All @@ -22,12 +31,17 @@ fn main() {
kernels_dir.display()
);

println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());

generate_unpack::<u8>(&kernels_dir, 32).expect("Failed to generate unpack for u8");
generate_unpack::<u16>(&kernels_dir, 32).expect("Failed to generate unpack for u16");
generate_unpack::<u32>(&kernels_dir, 32).expect("Failed to generate unpack for u32");
generate_unpack::<u64>(&kernels_dir, 16).expect("Failed to generate unpack for u64");

if !is_cuda_available() {
return;
}

println!("cargo:rerun-if-changed={}", kernels_dir.to_str().unwrap());

if let Ok(entries) = std::fs::read_dir(&kernels_dir) {
for path in entries.flatten().map(|entry| entry.path()) {
match path.extension().and_then(|e| e.to_str()) {
Expand All @@ -47,7 +61,13 @@ fn main() {
}
}

fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> std::io::Result<()> {
fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> io::Result<()> {
let mut cu_file = File::create(output_dir.join(format!("bit_unpack_{}.cu", T::T)))?;
let mut cu_writer = IndentedWriter::new(&mut cu_file);
generate_cuda_unpack_for_width::<T, _>(&mut cu_writer, thread_count)
}

fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> io::Result<()> {
// https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts
let profile = env::var("PROFILE").unwrap();

Expand Down Expand Up @@ -114,7 +134,7 @@ fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> std::io::Result<()> {
}
}

return Err(std::io::Error::other(format!(
return Err(io::Error::other(format!(
"nvcc compilation failed for {}",
cu_path.display()
)));
Expand Down
41 changes: 41 additions & 0 deletions vortex-cuda/cuda_kernel_generator/indent.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::fmt;
use std::io;
use std::io::Write;

pub struct IndentedWriter<W: Write> {
write: W,
indent: String,
}

impl<W: Write> IndentedWriter<W> {
pub fn new(write: W) -> Self {
Self {
write,
indent: String::new(),
}
}

/// # Errors
///
/// Will return Err if writing to the underlying writer fails.
pub fn indent<F>(&mut self, indented: F) -> io::Result<()>
where
F: FnOnce(&mut IndentedWriter<W>) -> io::Result<()>,
{
let original_ident = self.indent.clone();
self.indent += " ";
let res = indented(self);
self.indent = original_ident;
res
}

/// # Errors
///
/// Will return Err if writing to the underlying writer fails.
pub fn write_fmt(&mut self, fmt: fmt::Arguments<'_>) -> io::Result<()> {
write!(self.write, "{}{}", self.indent, fmt)
}
}
166 changes: 166 additions & 0 deletions vortex-cuda/cuda_kernel_generator/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

mod indent;

use std::io;
use std::io::Write;

use fastlanes::FastLanes;
pub use indent::IndentedWriter;

fn generate_lane_decoder<T: FastLanes, W: Write>(
output: &mut IndentedWriter<W>,
bit_width: usize,
) -> io::Result<()> {
let bits = <T>::T;
let lanes = T::LANES;

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_lane");

writeln!(
output,
"__device__ void _{func_name}(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, unsigned int lane) {{"
)?;

output.indent(|output| {
writeln!(output, "unsigned int LANE_COUNT = {lanes};")?;
if bit_width == 0 {
writeln!(output, "uint{bits}_t zero = 0ULL;")?;
writeln!(output)?;
for row in 0..bits {
writeln!(output, "out[INDEX({row}, lane)] = zero;")?;
}
} else if bit_width == bits {
writeln!(output)?;
for row in 0..bits {
writeln!(
output,
"out[INDEX({row}, lane)] = in[LANE_COUNT * {row} + lane];",
)?;
}
} else {
writeln!(output, "uint{bits}_t src;")?;
writeln!(output, "uint{bits}_t tmp;")?;

writeln!(output)?;
writeln!(output, "src = in[lane];")?;
for row in 0..bits {
let curr_word = (row * bit_width) / bits;
let next_word = ((row + 1) * bit_width) / bits;
let shift = (row * bit_width) % bits;

if next_word > curr_word {
let remaining_bits = ((row + 1) * bit_width) % bits;
let current_bits = bit_width - remaining_bits;
writeln!(
output,
"tmp = (src >> {shift}) & MASK(uint{bits}_t, {current_bits});"
)?;

if next_word < bit_width {
writeln!(output, "src = in[lane + LANE_COUNT * {next_word}];")?;
writeln!(
output,
"tmp |= (src & MASK(uint{bits}_t, {remaining_bits})) << {current_bits};"
)?;
}
} else {
writeln!(
output,
"tmp = (src >> {shift}) & MASK(uint{bits}_t, {bit_width});"
)?;
}

writeln!(output, "out[INDEX({row}, lane)] = tmp;")?;
}
}
Ok(())
})?;

writeln!(output, "}}")
}

fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
output: &mut IndentedWriter<W>,
bit_width: usize,
thread_count: usize,
) -> io::Result<()> {
let bits = <T>::T;
let lanes = T::LANES;
let per_thread_loop_count = lanes / thread_count;

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");

let local_func_params = format!(
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, int thread_idx)"
);

writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;

output.indent(|output| {
for thread_lane in 0..per_thread_loop_count {
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, out, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
}
Ok(())
})?;

writeln!(output, "}}")
}

fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
output: &mut IndentedWriter<W>,
bit_width: usize,
thread_count: usize,
) -> io::Result<()> {
let bits = <T>::T;

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
let func_params =
format!("(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out)");

writeln!(
output,
"extern \"C\" __global__ void {func_name}{func_params} {{"
)?;

output.indent(|output| {
writeln!(output, "int thread_idx = threadIdx.x;")?;
writeln!(
output,
"auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"
)?;
writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;

writeln!(output, "_{func_name}(in, out, thread_idx);")
})?;

writeln!(output, "}}")
}

/// # Errors
///
/// Will return Err if writing to the underlying writer fails.
pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
output: &mut IndentedWriter<W>,
thread_count: usize,
) -> io::Result<()> {
writeln!(output, "// AUTO-GENERATED. Do not edit by hand!")?;
writeln!(output, "#include <cuda.h>")?;
writeln!(output, "#include <cuda_runtime.h>")?;
writeln!(output, "#include <stdint.h>")?;
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
writeln!(output)?;

for bit_width in 0..=<T>::T {
generate_lane_decoder::<T, _>(output, bit_width)?;
writeln!(output)?;
generate_device_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
writeln!(output)?;

generate_global_kernel_for_width::<T, _>(output, bit_width, thread_count)?;
writeln!(output)?;
}

Ok(())
}
Loading
Loading