Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
299 changes: 282 additions & 17 deletions vortex-cuda/benches/for_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,37 +31,121 @@ const BENCH_ARGS: &[(usize, &str)] = &[
(100_000, "100K"),
(1_000_000, "1M"),
(10_000_000, "10M"),
(100_000_000, "100M"),
];

/// Creates a FoR array for the given size.
fn make_for_array(len: usize) -> FoRArray {
/// Creates a FoR array of u8 holding exactly `len` values, with a reference of 10.
///
/// Each index is narrowed to `u8` (so the values cycle through the `u8` range) and then
/// offset by 10 with wrapping arithmetic. The resulting array always has `len` elements,
/// including when `len` is larger than `u8::MAX`.
///
/// A failure to construct the FoR array is surfaced through `vortex_expect` with the
/// message "failed to create FoR array".
fn make_for_array_u8(len: usize) -> FoRArray {
    // Narrow each index instead of the range bound: `0..len as u8` truncates the bound
    // itself (e.g. `100_000 as u8 == 160`) and silently yields a far shorter array than
    // the benchmark's size label and throughput accounting assume.
    let data: Vec<u8> = (0..len).map(|i| (i as u8).wrapping_add(10)).collect();
    let primitive_array = PrimitiveArray::new(
        Buffer::from(data),
        vortex_array::validity::Validity::NonNullable,
    )
    .into_array();

    FoRArray::try_new(primitive_array, 10u8.into()).vortex_expect("failed to create FoR array")
}

/// Creates a FoR array of u16 holding exactly `len` values, with a reference of 10.
///
/// Each index is narrowed to `u16` (so the values cycle through the `u16` range) and then
/// offset by 10 with wrapping arithmetic. The resulting array always has `len` elements,
/// including when `len` is larger than `u16::MAX`.
///
/// A failure to construct the FoR array is surfaced through `vortex_expect` with the
/// message "failed to create FoR array".
fn make_for_array_u16(len: usize) -> FoRArray {
    // Narrow each index instead of the range bound: `0..len as u16` truncates the bound
    // itself (e.g. `100_000 as u16 == 34_464`) and silently yields a shorter array than
    // the benchmark's size label and throughput accounting assume.
    let data: Vec<u16> = (0..len).map(|i| (i as u16).wrapping_add(10)).collect();
    let primitive_array = PrimitiveArray::new(
        Buffer::from(data),
        vortex_array::validity::Validity::NonNullable,
    )
    .into_array();

    FoRArray::try_new(primitive_array, 10u16.into()).vortex_expect("failed to create FoR array")
}

/// Creates a FoR array of u32 holding exactly `len` values, with a reference of 10.
///
/// The underlying values are the indices `0, 1, 2, ...` narrowed to `u32`.
///
/// A failure to construct the FoR array is surfaced through `vortex_expect` with the
/// message "failed to create FoR array".
fn make_for_array_u32(len: usize) -> FoRArray {
    // Narrow each index rather than the range bound so the element count always equals
    // `len`, matching how the other width-specific constructors size their data.
    let data: Vec<u32> = (0..len).map(|i| i as u32).collect();
    let primitive_array = PrimitiveArray::new(
        Buffer::from(data),
        vortex_array::validity::Validity::NonNullable,
    )
    .into_array();

    FoRArray::try_new(primitive_array, 10u32.into()).vortex_expect("failed to create FoR array")
}

FoRArray::try_new(primitive_array, for_offset.into())
.vortex_expect("failed to create FoR array")
/// Builds a non-nullable u64 FoR array with `len` elements and a reference of 10.
///
/// Element `i` holds `i + 10` (computed with wrapping arithmetic).
fn make_for_array_u64(len: usize) -> FoRArray {
    let upper_bound = len as u64;

    let mut values: Vec<u64> = Vec::with_capacity(len);
    values.extend((0..upper_bound).map(|index| index.wrapping_add(10)));

    let validity = vortex_array::validity::Validity::NonNullable;
    let primitive_array = PrimitiveArray::new(Buffer::from(values), validity).into_array();

    let reference = 10u64;
    FoRArray::try_new(primitive_array, reference.into())
        .vortex_expect("failed to create FoR array")
}

/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
fn launch_for_kernel_timed(
fn launch_for_kernel_timed_u8(
for_array: &FoRArray,
device_data: cudarc::driver::CudaSlice<u8>,
reference: u8,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
let array_len_u64 = for_array.len() as u64;

let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
);

let elapsed_ms = events
.before_launch
.elapsed_ms(&events.after_launch) // synchronizes
.map_err(|e| vortex_error::vortex_err!("failed to get elapsed time: {}", e))?;

Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
}

/// Runs the `for` CUDA kernel over u16 device data and reports how long the launch took.
///
/// The duration comes from the events recorded around the launch.
///
/// # Errors
///
/// Returns an error if the elapsed time between the two recorded events cannot be read.
fn launch_for_kernel_timed_u16(
    for_array: &FoRArray,
    device_data: cudarc::driver::CudaSlice<u16>,
    reference: u16,
    cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
    // The element count is handed to the kernel as a 64-bit launch argument.
    let element_count = for_array.len() as u64;

    let events = vortex_cuda::launch_cuda_kernel!(
        execution_ctx: cuda_ctx,
        module: "for",
        ptypes: &[for_array.ptype()],
        launch_args: [device_data, reference, element_count],
        event_recording: CU_EVENT_BLOCKING_SYNC,
        array_len: for_array.len()
    );

    // Reading the elapsed time synchronizes on the recorded events.
    let start_event = &events.before_launch;
    let millis = start_event
        .elapsed_ms(&events.after_launch)
        .map_err(|err| vortex_error::vortex_err!("failed to get elapsed time: {}", err))?;

    let seconds = millis / 1000.0;
    Ok(Duration::from_secs_f32(seconds))
}

/// Launches FoR decompression kernel and returns elapsed GPU time in seconds.
fn launch_for_kernel_timed_u32(
for_array: &FoRArray,
reference: u32,
device_data: cudarc::driver::CudaSlice<u32>,
reference: u32,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
let array_len = for_array.len() as u64;
let array_len_u64 = for_array.len() as u64;

let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "for",
ptypes: &[for_array.ptype()],
launch_args: [device_data, reference, array_len],
launch_args: [device_data, reference, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: for_array.len()
);
Expand All @@ -74,17 +158,137 @@ fn launch_for_kernel_timed(
Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
}

fn benchmark_for_cuda(c: &mut Criterion) {
if !has_nvcc() {
eprintln!("nvcc not found, skipping CUDA benchmarks");
return;
/// Runs the `for` CUDA kernel over u64 device data and reports how long the launch took.
///
/// The duration comes from the events recorded around the launch.
///
/// # Errors
///
/// Returns an error if the elapsed time between the two recorded events cannot be read.
fn launch_for_kernel_timed_u64(
    for_array: &FoRArray,
    device_data: cudarc::driver::CudaSlice<u64>,
    reference: u64,
    cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
    // The element count is handed to the kernel as a 64-bit launch argument.
    let element_count = for_array.len() as u64;

    let events = vortex_cuda::launch_cuda_kernel!(
        execution_ctx: cuda_ctx,
        module: "for",
        ptypes: &[for_array.ptype()],
        launch_args: [device_data, reference, element_count],
        event_recording: CU_EVENT_BLOCKING_SYNC,
        array_len: for_array.len()
    );

    // Reading the elapsed time synchronizes on the recorded events.
    let start_event = &events.before_launch;
    let millis = start_event
        .elapsed_ms(&events.after_launch)
        .map_err(|err| vortex_error::vortex_err!("failed to get elapsed time: {}", err))?;

    let seconds = millis / 1000.0;
    Ok(Duration::from_secs_f32(seconds))
}

/// Benchmarks u8 FoR decompression on the GPU for every size listed in `BENCH_ARGS`.
///
/// Only the kernel time returned by `launch_for_kernel_timed_u8` is accumulated; the
/// host-to-device copy that precedes each launch is not part of the reported time.
fn benchmark_for_u8(c: &mut Criterion) {
    let mut group = c.benchmark_group("FoR_cuda_u8");
    group.sample_size(10);

    for (len, label) in BENCH_ARGS {
        let for_array = make_for_array_u8(*len);

        // Derive throughput from the array that is actually benchmarked so the reported
        // bytes/second always matches the data handed to the kernel.
        group.throughput(Throughput::Bytes(
            (for_array.len() * size_of::<u8>()) as u64,
        ));
        group.bench_with_input(
            BenchmarkId::new("u8_FoR", label),
            &for_array,
            |b, for_array| {
                b.iter_custom(|iters| {
                    let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
                        .vortex_expect("failed to create execution context");

                    let encoded = for_array.encoded();
                    let unpacked_array = encoded.to_primitive();
                    let unpacked_slice = unpacked_array.as_slice::<u8>();

                    let reference = 10u8;
                    let mut total_time = Duration::ZERO;

                    for _ in 0..iters {
                        // A fresh device buffer is uploaded for every iteration because
                        // the launch helper takes the device slice by value.
                        let device_data = cuda_ctx
                            .to_device(unpacked_slice)
                            .vortex_expect("failed to copy to device");

                        let kernel_time = launch_for_kernel_timed_u8(
                            for_array,
                            device_data,
                            reference,
                            &mut cuda_ctx,
                        )
                        .vortex_expect("kernel launch failed");

                        total_time += kernel_time;
                    }

                    total_time
                });
            },
        );
    }

    // Finish the group that was actually measured; no second group is opened here.
    group.finish();
}

/// Benchmarks u16 FoR decompression on the GPU for every size listed in `BENCH_ARGS`.
///
/// Only the kernel time returned by `launch_for_kernel_timed_u16` is accumulated; the
/// host-to-device copy that precedes each launch is not part of the reported time.
fn benchmark_for_u16(c: &mut Criterion) {
    let mut group = c.benchmark_group("FoR_cuda_u16");
    group.sample_size(10);

    for (len, label) in BENCH_ARGS {
        // Build only the u16 array; nothing else is constructed per size.
        let for_array = make_for_array_u16(*len);

        // Derive throughput from the array that is actually benchmarked so the reported
        // bytes/second always matches the data handed to the kernel.
        group.throughput(Throughput::Bytes(
            (for_array.len() * size_of::<u16>()) as u64,
        ));
        group.bench_with_input(
            BenchmarkId::new("u16_FoR", label),
            &for_array,
            |b, for_array| {
                b.iter_custom(|iters| {
                    let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
                        .vortex_expect("failed to create execution context");

                    let encoded = for_array.encoded();
                    let unpacked_array = encoded.to_primitive();
                    let unpacked_slice = unpacked_array.as_slice::<u16>();

                    let reference = 10u16;
                    let mut total_time = Duration::ZERO;

                    for _ in 0..iters {
                        // A fresh device buffer is uploaded for every iteration because
                        // the launch helper takes the device slice by value.
                        let device_data = cuda_ctx
                            .to_device(unpacked_slice)
                            .vortex_expect("failed to copy to device");

                        let kernel_time = launch_for_kernel_timed_u16(
                            for_array,
                            device_data,
                            reference,
                            &mut cuda_ctx,
                        )
                        .vortex_expect("kernel launch failed");

                        total_time += kernel_time;
                    }

                    total_time
                });
            },
        );
    }

    group.finish();
}

/// Benchmark u32 FoR decompression
fn benchmark_for_u32(c: &mut Criterion) {
let mut group = c.benchmark_group("FoR_cuda_u32");
group.sample_size(10);

for (len, label) in BENCH_ARGS {
let for_array = make_for_array_u32(*len);

group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
group.bench_with_input(
Expand All @@ -107,10 +311,59 @@ fn benchmark_for_cuda(c: &mut Criterion) {
.to_device(unpacked_slice)
.vortex_expect("failed to copy to device");

let kernel_time = launch_for_kernel_timed(
let kernel_time = launch_for_kernel_timed_u32(
for_array,
device_data,
reference,
&mut cuda_ctx,
)
.vortex_expect("kernel launch failed");

total_time += kernel_time;
}

total_time
});
},
);
}

group.finish();
}

/// Benchmark u64 FoR decompression
fn benchmark_for_u64(c: &mut Criterion) {
let mut group = c.benchmark_group("FoR_cuda_u64");
group.sample_size(10);

for (len, label) in BENCH_ARGS {
let for_array = make_for_array_u64(*len);

group.throughput(Throughput::Bytes((len * size_of::<u64>()) as u64));
group.bench_with_input(
BenchmarkId::new("u64_FoR", label),
&for_array,
|b, for_array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
.vortex_expect("failed to create execution context");

let encoded = for_array.encoded();
let unpacked_array = encoded.to_primitive();
let unpacked_slice = unpacked_array.as_slice::<u64>();

let reference = 10u64;
let mut total_time = Duration::ZERO;

for _ in 0..iters {
let device_data = cuda_ctx
.to_device(unpacked_slice)
.vortex_expect("failed to copy to device");

let kernel_time = launch_for_kernel_timed_u64(
for_array,
device_data,
reference,
&mut cuda_ctx,
)
.vortex_expect("kernel launch failed");
Expand All @@ -127,5 +380,17 @@ fn benchmark_for_cuda(c: &mut Criterion) {
group.finish();
}

fn benchmark_for_cuda(c: &mut Criterion) {
if !has_nvcc() {
eprintln!("nvcc not found, skipping CUDA benchmarks");
return;
}

benchmark_for_u8(c);
benchmark_for_u16(c);
benchmark_for_u32(c);
benchmark_for_u64(c);
}

// Register `benchmark_for_cuda` as the only target of the `benches` group and let
// Criterion generate the benchmark binary's entry point from that group.
criterion_group!(benches, benchmark_for_cuda);
criterion_main!(benches);
Loading
Loading