Skip to content

Commit 680e107

Browse files
perf[gpu]: fused AoT FoR and bitpacking kernel (#4872)
I have only implemented a PoC fused FoR-BP kernel; I don't want to implement them all since there will be a lot of duplication. I think we likely need to compile these at runtime. I have also fixed up the kernels build system. Fused is fast ``` gpu_for_bp_fused_decompress_kernel_only/u32/1GB time: [5.5376 ms 5.5410 ms 5.5443 ms] thrpt: [180.37 GiB/s 180.47 GiB/s 180.58 GiB/s] ``` Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk> --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent ad2cfb6 commit 680e107

File tree

12 files changed

+458
-30
lines changed

12 files changed

+458
-30
lines changed

fls-gpu-kernel-gen/src/bit_unpack.rs

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -101,17 +101,7 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
101101
writeln!(output, "#include <cuda.h>")?;
102102
writeln!(output, "#include <cuda_runtime.h>")?;
103103
writeln!(output, "#include <stdint.h>")?;
104-
writeln!(output)?;
105-
106-
writeln!(
107-
output,
108-
"__device__ int FL_ORDER[] = {{0, 4, 2, 6, 1, 5, 3, 7}};"
109-
)?;
110-
writeln!(
111-
output,
112-
"#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)"
113-
)?;
114-
writeln!(output, "#define MASK(T, width) (((T)1 << width) - 1)")?;
104+
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
115105
writeln!(output)?;
116106

117107
for bit_width in 0..=<T>::T {
@@ -123,10 +113,10 @@ fn generate_unpack_for_width<T: FastLanes, W: Write>(
123113
}
124114

125115
pub fn generate_unpack<T: FastLanes>(output_dir: &Path, thread_count: usize) -> anyhow::Result<()> {
126-
let filename = format!("fls_{}_bit_unpack.cu", T::T);
127-
let path = output_dir.join(&filename);
128-
let mut file = File::create(&path)?;
129-
let mut writer = IndentedWriter::new(&mut file);
130-
generate_unpack_for_width::<T, _>(&mut writer, thread_count)?;
116+
let cu_filename = format!("gen/fls_{}_bit_unpack.cu", T::T);
117+
let cu_path = output_dir.join(&cu_filename);
118+
let mut cu_file = File::create(&cu_path)?;
119+
let mut cu_writer = IndentedWriter::new(&mut cu_file);
120+
generate_unpack_for_width::<T, _>(&mut cu_writer, thread_count)?;
131121
Ok(())
132122
}

vortex-gpu/.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
kernels/fls*
1+
kernels/gen/*
22
*.ptx

vortex-gpu/benches/gpu_bitunpack.rs

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use vortex_buffer::BufferMut;
1515
use vortex_dtype::NativePType;
1616
use vortex_error::VortexUnwrap;
1717
use vortex_fastlanes::{BitPackedArray, FoRArray};
18-
use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_unpack_timed};
18+
use vortex_gpu::{cuda_bit_unpack_timed, cuda_for_bp_unpack_timed, cuda_for_unpack_timed};
1919

2020
// Data sizes: 1GB, 2.5GB, 5GB, 10GB
2121
// These are approximate sizes in bytes, accounting for bit-packing compression
@@ -124,6 +124,37 @@ fn benchmark_gpu_for_decompress_kernel_only(c: &mut Criterion) {
124124
group.finish();
125125
}
126126

127+
fn benchmark_gpu_for_bp_fused_decompress_kernel_only(c: &mut Criterion) {
128+
let mut group = c.benchmark_group("gpu_for_bp_fused_decompress_kernel_only");
129+
130+
group.sample_size(10);
131+
132+
for (len, label) in DATA_SIZES {
133+
let len = len.next_multiple_of(1024);
134+
let array = make_for_bitpackable_array(len);
135+
136+
let ctx = CudaContext::new(0).unwrap();
137+
ctx.set_blocking_synchronize().unwrap();
138+
let ctx = Arc::new(ctx);
139+
140+
group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
141+
group.bench_with_input(BenchmarkId::new("u32", label), &array, |b, array| {
142+
b.iter_custom(|iters| {
143+
let mut total_time = Duration::ZERO;
144+
for _ in 0..iters {
145+
// This only measures kernel execution time, not memory transfers
146+
let (_result, kernel_time) =
147+
cuda_for_bp_unpack_timed(array, Arc::clone(&ctx)).unwrap();
148+
total_time += kernel_time;
149+
}
150+
total_time
151+
});
152+
});
153+
}
154+
155+
group.finish();
156+
}
157+
127158
#[allow(dead_code)]
128159
fn benchmark_cpu_canonicalize(c: &mut Criterion) {
129160
let mut group = c.benchmark_group("cpu_canonicalize");
@@ -145,5 +176,6 @@ criterion_group!(
145176
benches,
146177
benchmark_gpu_decompress_kernel_only,
147178
benchmark_gpu_for_decompress_kernel_only,
179+
benchmark_gpu_for_bp_fused_decompress_kernel_only
148180
);
149181
criterion_main!(benches);

vortex-gpu/build.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,21 +37,23 @@ fn main() -> anyhow::Result<()> {
3737

3838
println!("cargo:rerun-if-changed={}", generator_dir.to_str().unwrap());
3939

40-
for entry in WalkDir::new(kernels_dir).into_iter().flatten() {
40+
for entry in WalkDir::new(&kernels_dir).into_iter().flatten() {
4141
if entry.path().extension().is_some_and(|ext| ext == "cu") {
4242
println!("cargo:rerun-if-changed={}", entry.path().display());
43-
nvcc_compile_ptx(entry.path())?;
43+
nvcc_compile_ptx(kernels_dir.as_path(), entry.path())?;
4444
}
4545
}
4646

4747
Ok(())
4848
}
4949

50-
fn nvcc_compile_ptx(cu_path: &Path) -> anyhow::Result<()> {
50+
fn nvcc_compile_ptx(kernel_dir: &Path, cu_path: &Path) -> anyhow::Result<()> {
5151
let res = Command::new("nvcc")
5252
.arg("-arch=sm_80")
5353
.arg("--restrict")
5454
.arg("--ptx")
55+
.arg("--include-path")
56+
.arg(kernel_dir)
5557
.arg("-c")
5658
.arg(cu_path)
5759
.arg("-o")
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
// Auto-generated by fls-gpu-kernel-gen. Do not edit by hand!
5+
// Common FastLanes definitions shared across multiple kernels
6+
7+
#ifndef GEN_FASTLANES_COMMON_CUH
8+
#define GEN_FASTLANES_COMMON_CUH
9+
10+
#include <stdint.h>
11+
12+
// FastLanes ordering array
13+
__device__ int FL_ORDER[] = {0, 4, 2, 6, 1, 5, 3, 7};
14+
15+
// Compute the index in the FastLanes layout
16+
#define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane)
17+
18+
// Create a mask with 'width' bits set
19+
#define MASK(T, width) (((T)1 << width) - 1)
20+
21+
#endif // GEN_FASTLANES_COMMON_CUH

vortex-gpu/kernels/for.cu

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,36 @@
55
#include <cuda_runtime.h>
66
#include <stdint.h>
77

8+
// Device function template (callable from device code)
9+
template<typename ValueT>
10+
__device__ void for_device(
11+
ValueT *__restrict values_in_out,
12+
ValueT reference,
13+
int thread_idx
14+
) {
15+
auto i = thread_idx;
16+
const uint32_t thread_ops = blockDim.x;
17+
18+
for (auto j = 0; j < thread_ops; j++) {
19+
auto idx = i * thread_ops + j;
20+
values_in_out[idx] = values_in_out[idx] + reference;
21+
}
22+
}
23+
24+
// Kernel wrapper template (callable from host)
825
template<typename ValueT>
926
__device__ void for_(
1027
ValueT *__restrict values_in_out_array,
1128
ValueT reference
1229
) {
1330
auto i = threadIdx.x;
14-
auto block_offset = (blockIdx.x * 1024);
31+
const uint32_t fl_lane_count = 32;
32+
auto blockSize = blockDim.x * fl_lane_count;
33+
auto block_size = 1024;
34+
auto block_offset = (blockIdx.x * block_size);
1535

1636
auto values_in_out = values_in_out_array + block_offset;
17-
18-
const int thread_ops = 32;
19-
20-
for (auto j = 0; j < thread_ops; j++) {
21-
auto idx = i * thread_ops + j;
22-
values_in_out[idx] = values_in_out[idx] + reference;
23-
}
37+
for_device(values_in_out, reference, i);
2438
}
2539

2640
// Macro to generate the extern "C" wrapper for each type combination

vortex-gpu/kernels/for.cuh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
// Frame-of-Reference kernel declarations
5+
6+
#ifndef FOR_CUH
7+
#define FOR_CUH
8+
9+
#include <stdint.h>
10+
11+
// Device function template (callable from other kernels)
12+
template<typename ValueT>
13+
__device__ __forceinline__ void for_device(
14+
ValueT *__restrict values_in_out,
15+
ValueT reference,
16+
int thread_idx
17+
);
18+
19+
// Kernel functions (callable from host)
20+
extern "C" __global__ void for_vu8(uint8_t *__restrict values, uint8_t reference);
21+
extern "C" __global__ void for_vu16(uint16_t *__restrict values, uint16_t reference);
22+
extern "C" __global__ void for_vu32(uint32_t *__restrict values, uint32_t reference);
23+
extern "C" __global__ void for_vu64(uint64_t *__restrict values, uint64_t reference);
24+
25+
extern "C" __global__ void for_vi8(int8_t *__restrict values, int8_t reference);
26+
extern "C" __global__ void for_vi16(int16_t *__restrict values, int16_t reference);
27+
extern "C" __global__ void for_vi32(int32_t *__restrict values, int32_t reference);
28+
extern "C" __global__ void for_vi64(int64_t *__restrict values, int64_t reference);
29+
30+
#endif // FOR_CUH
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
// Fused kernel combining FastLanes bitpacking unpack with Frame-of-Reference addition
5+
// This avoids an intermediate memory write/read by fusing the operations
6+
7+
#include <cuda.h>
8+
#include <cuda_runtime.h>
9+
#include <stdint.h>
10+
#include "fastlanes_common.cuh"
11+
12+
13+
__device__ void fls_unpack_6bw_32ow_device(const uint32_t *__restrict in, uint32_t *__restrict out, int thread_idx) {
14+
int i = thread_idx;
15+
uint32_t src;
16+
uint32_t tmp;
17+
18+
src = in[i * 1 + 0];
19+
tmp = (src >> 0) & MASK(uint32_t, 6);
20+
out[INDEX(0, (i * 1 + 0))] = tmp;
21+
tmp = (src >> 6) & MASK(uint32_t, 6);
22+
out[INDEX(1, (i * 1 + 0))] = tmp;
23+
tmp = (src >> 12) & MASK(uint32_t, 6);
24+
out[INDEX(2, (i * 1 + 0))] = tmp;
25+
tmp = (src >> 18) & MASK(uint32_t, 6);
26+
out[INDEX(3, (i * 1 + 0))] = tmp;
27+
tmp = (src >> 24) & MASK(uint32_t, 6);
28+
out[INDEX(4, (i * 1 + 0))] = tmp;
29+
tmp = (src >> 30) & MASK(uint32_t, 2);
30+
src = in[i * 1 + 0 + 32 * 1];
31+
tmp |= (src & MASK(uint32_t, 4)) << 2;
32+
out[INDEX(5, (i * 1 + 0))] = tmp;
33+
tmp = (src >> 4) & MASK(uint32_t, 6);
34+
out[INDEX(6, (i * 1 + 0))] = tmp;
35+
tmp = (src >> 10) & MASK(uint32_t, 6);
36+
out[INDEX(7, (i * 1 + 0))] = tmp;
37+
tmp = (src >> 16) & MASK(uint32_t, 6);
38+
out[INDEX(8, (i * 1 + 0))] = tmp;
39+
tmp = (src >> 22) & MASK(uint32_t, 6);
40+
out[INDEX(9, (i * 1 + 0))] = tmp;
41+
tmp = (src >> 28) & MASK(uint32_t, 4);
42+
src = in[i * 1 + 0 + 32 * 2];
43+
tmp |= (src & MASK(uint32_t, 2)) << 4;
44+
out[INDEX(10, (i * 1 + 0))] = tmp;
45+
tmp = (src >> 2) & MASK(uint32_t, 6);
46+
out[INDEX(11, (i * 1 + 0))] = tmp;
47+
tmp = (src >> 8) & MASK(uint32_t, 6);
48+
out[INDEX(12, (i * 1 + 0))] = tmp;
49+
tmp = (src >> 14) & MASK(uint32_t, 6);
50+
out[INDEX(13, (i * 1 + 0))] = tmp;
51+
tmp = (src >> 20) & MASK(uint32_t, 6);
52+
out[INDEX(14, (i * 1 + 0))] = tmp;
53+
tmp = (src >> 26) & MASK(uint32_t, 6);
54+
src = in[i * 1 + 0 + 32 * 3];
55+
tmp |= (src & MASK(uint32_t, 0)) << 6;
56+
out[INDEX(15, (i * 1 + 0))] = tmp;
57+
tmp = (src >> 0) & MASK(uint32_t, 6);
58+
out[INDEX(16, (i * 1 + 0))] = tmp;
59+
tmp = (src >> 6) & MASK(uint32_t, 6);
60+
out[INDEX(17, (i * 1 + 0))] = tmp;
61+
tmp = (src >> 12) & MASK(uint32_t, 6);
62+
out[INDEX(18, (i * 1 + 0))] = tmp;
63+
tmp = (src >> 18) & MASK(uint32_t, 6);
64+
out[INDEX(19, (i * 1 + 0))] = tmp;
65+
tmp = (src >> 24) & MASK(uint32_t, 6);
66+
out[INDEX(20, (i * 1 + 0))] = tmp;
67+
tmp = (src >> 30) & MASK(uint32_t, 2);
68+
src = in[i * 1 + 0 + 32 * 4];
69+
tmp |= (src & MASK(uint32_t, 4)) << 2;
70+
out[INDEX(21, (i * 1 + 0))] = tmp;
71+
tmp = (src >> 4) & MASK(uint32_t, 6);
72+
out[INDEX(22, (i * 1 + 0))] = tmp;
73+
tmp = (src >> 10) & MASK(uint32_t, 6);
74+
out[INDEX(23, (i * 1 + 0))] = tmp;
75+
tmp = (src >> 16) & MASK(uint32_t, 6);
76+
out[INDEX(24, (i * 1 + 0))] = tmp;
77+
tmp = (src >> 22) & MASK(uint32_t, 6);
78+
out[INDEX(25, (i * 1 + 0))] = tmp;
79+
tmp = (src >> 28) & MASK(uint32_t, 4);
80+
src = in[i * 1 + 0 + 32 * 5];
81+
tmp |= (src & MASK(uint32_t, 2)) << 4;
82+
out[INDEX(26, (i * 1 + 0))] = tmp;
83+
tmp = (src >> 2) & MASK(uint32_t, 6);
84+
out[INDEX(27, (i * 1 + 0))] = tmp;
85+
tmp = (src >> 8) & MASK(uint32_t, 6);
86+
out[INDEX(28, (i * 1 + 0))] = tmp;
87+
tmp = (src >> 14) & MASK(uint32_t, 6);
88+
out[INDEX(29, (i * 1 + 0))] = tmp;
89+
tmp = (src >> 20) & MASK(uint32_t, 6);
90+
out[INDEX(30, (i * 1 + 0))] = tmp;
91+
tmp = (src >> 26) & MASK(uint32_t, 6);
92+
out[INDEX(31, (i * 1 + 0))] = tmp;
93+
}
94+
95+
// Device function template (callable from device code)
96+
template<typename ValueT>
97+
__device__ __forceinline__ void for_device(
98+
ValueT *__restrict values_in_out,
99+
ValueT reference,
100+
int thread_idx
101+
) {
102+
auto i = thread_idx;
103+
const int thread_ops = blockDim.x;
104+
105+
for (auto j = 0; j < thread_ops; j++) {
106+
auto idx = INDEX(j, i);
107+
values_in_out[idx] = values_in_out[idx] + reference;
108+
}
109+
}
110+
111+
112+
// Fused kernel: bitpack unpack (6bw) + FoR addition in one pass
113+
// This eliminates the intermediate write-to-memory and read-from-memory
114+
// by keeping unpacked values in registers/L1 cache and immediately adding the reference
115+
extern "C" __global__ void fused_bitpack6_for_u32(
116+
const uint32_t *__restrict packed_in,
117+
uint32_t *__restrict unpacked_out,
118+
uint32_t reference
119+
) {
120+
int i = threadIdx.x;
121+
auto in = packed_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t)));
122+
const uint32_t fl_lane_count = 32;
123+
auto blockSize = blockDim.x * fl_lane_count;
124+
auto out = unpacked_out + (blockIdx.x * 1024);
125+
126+
__shared__ uint32_t shared_data[1024];
127+
128+
fls_unpack_6bw_32ow_device(in, shared_data, i);
129+
130+
for_device(shared_data, reference, i);
131+
132+
for (int i = 0; i < 32; i++) {
133+
auto idx = i * 32 + threadIdx.x;
134+
out[idx] = shared_data[idx];
135+
}
136+
}

vortex-gpu/kernels/gen/.gitkeep

Whitespace-only changes.

vortex-gpu/src/bit_unpack.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ fn cuda_bit_unpack_kernel(
5252
}
5353
let module = ctx
5454
.load_module(Ptx::from_file(format!(
55-
"kernels/fls_{}_bit_unpack.ptx",
55+
"kernels/gen/fls_{}_bit_unpack.ptx",
5656
kernel_id.output_bit_width
5757
)))
5858
.map_err(|e| vortex_err!("Failed to load kernel module: {e}"))?;

0 commit comments

Comments
 (0)