
Commit 9468c05

[examples] updated DGEMM example with more optimized kernel and perf comparison
1 parent 03fd274 commit 9468c05

File tree

10 files changed: +361 −155 lines
Lines changed: 4 additions & 2 deletions
@@ -1,11 +1,13 @@
 [package]
-name = "gemm"
-version = "0.1.0"
+name = "bench_dgemm"
+version = "0.2.0"
 edition = "2021"
 
 [dependencies]
+clap = { version = "3.2.23", features = ["derive"] }
 cust = { version = "0.3", path = "../../../../crates/cust" }
 nanorand = "0.6.1"
+vek = "0.15"
 
 [build-dependencies]
 cuda_builder = { version = "0.3", path = "../../../../crates/cuda_builder" }
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
# DGEMM benchmark example

Simple performance comparison of two implementations of DGEMM in Rust-CUDA:
- a naive one with no particular optimization;
- a tiled one which leverages shared memory and assigns more work to each device thread to force vectorized loads and stores.
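
The device kernels themselves (`dgemm_naive`, `dgemm_optim`) live in the `bench_dgemm_gpu` crate and are not shown in this diff. Purely as a hypothetical sketch (the signature is inferred from the `launch!` call sites in `main.rs` below; the commit's actual device code may differ), a naive column-major DGEMM kernel written against `cuda_std` could look like:

use cuda_std::*;

// Sketch of a naive kernel: one thread computes one element of C,
// where C = alpha * A * B + beta * C and all matrices are column-major.
// GPU-crate boilerplate (no_std attributes, etc.) is elided.
#[kernel]
pub unsafe fn dgemm_naive(
    m: usize,
    n: usize,
    k: usize,
    alpha: f64,
    beta: f64,
    a: &[f64],   // m x k, passed from the host as (pointer, length)
    b: &[f64],   // k x n
    c: *mut f64, // m x n, updated in place
) {
    // Map the x dimension to the row index and y to the column index,
    // matching the grid/block dimensions chosen on the host side.
    let i = (thread::block_idx_x() * thread::block_dim_x() + thread::thread_idx_x()) as usize;
    let j = (thread::block_idx_y() * thread::block_dim_y() + thread::thread_idx_y()) as usize;

    if i < m && j < n {
        let mut acc = 0.0;
        for l in 0..k {
            acc += a[i + m * l] * b[l + k * j];
        }
        let elem = &mut *c.add(i + m * j);
        *elem = alpha * acc + beta * *elem;
    }
}

The tiled `dgemm_optim` variant additionally stages blocks of A and B in shared memory and has each thread accumulate WPT elements of C, which is why `main.rs` launches it with blocks of BS x (BS / WPT) threads instead of BS x BS.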
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
use cuda_builder::CudaBuilder;

fn main() {
    println!("cargo:rerun-if-changed=../../resources/dgemm.ptx");
    CudaBuilder::new("../../gpu/bench_dgemm_gpu")
        .arch(cuda_builder::NvvmArch::Compute75)
        .copy_to("../../resources/dgemm.ptx")
        .build()
        .unwrap();
}
Lines changed: 241 additions & 0 deletions
@@ -0,0 +1,241 @@
use clap::Parser;
use cust::{
    function::{BlockSize, GridSize},
    prelude::*,
};
use nanorand::{Rng, WyRand};

use std::{
    error::Error,
    time::{Duration, Instant},
};

// Default number of timed kernel launches per implementation.
const REPETITIONS: usize = 31;

// Default matrix dimensions.
const M: usize = 4096;
const N: usize = M;
const K: usize = M;

// Tile size (BS) and work per thread (WPT) used by the optimized kernel.
const BS: usize = 32;
const WPT: usize = 8;

static DGEMM_PTX: &str = include_str!("../../../resources/dgemm.ptx");

#[derive(Clone, Copy, Debug, Parser, PartialEq)]
struct Args {
    #[clap(short, long, default_value_t = M)]
    m: usize,
    #[clap(short, long, default_value_t = N)]
    n: usize,
    #[clap(short, long, default_value_t = K)]
    k: usize,
    #[clap(short, long, default_value_t = REPETITIONS)]
    repetitions: usize,
    #[clap(short, long, default_value_t = false)]
    debug: bool,
}

// Host reference implementation: C = alpha * A * B + beta * C, with all
// matrices stored column-major (element (i, j) at index i + rows * j).
#[allow(non_snake_case)]
fn dgemm_host(
    m: usize,
    n: usize,
    k: usize,
    alpha: f64,
    beta: f64,
    A: &[f64],
    B: &[f64],
    C: &mut [f64],
) {
    for i in 0..m {
        for j in 0..n {
            let mut acc = 0.0;
            for l in 0..k {
                acc += A[i + m * l] * B[l + k * j];
            }
            C[i + m * j] = alpha * acc + beta * C[i + m * j];
        }
    }
}

#[allow(non_snake_case)]
fn main() -> Result<(), Box<dyn Error>> {
    let args = Args::parse();

    // Initialize CUDA context, module and stream
    let _ctx = cust::quick_init()?;
    let module = Module::from_ptx(DGEMM_PTX, &[])?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;

    // Get kernels from the PTX module
    let dgemm_naive = module.get_function("dgemm_naive")?;
    let dgemm_optim = module.get_function("dgemm_optim")?;

    // Initialize RNG from a fixed seed so runs are reproducible
    let mut rng = WyRand::new_seed(42);

    // Use small coefficients to keep the matrices' contents from growing too much
    let alpha = 0.002;
    let beta = 0.001;

    // Initialize matrices randomly
    let mut A = vec![0.0; args.m * args.k];
    rng.fill(&mut A);
    let mut B = vec![0.0; args.k * args.n];
    rng.fill(&mut B);
    let mut C = vec![0.0; args.m * args.n];
    rng.fill(&mut C);

    // Create host result matrix
    let mut h_C = C.clone();

    // Create device matrices for the naive DGEMM
    let d_A_naive = DeviceBuffer::from_slice(&A)?;
    let d_B_naive = DeviceBuffer::from_slice(&B)?;
    let d_C_naive = DeviceBuffer::from_slice(&C)?;

    // Create device matrices for the optimized DGEMM
    let d_A_optim = DeviceBuffer::from_slice(&A)?;
    let d_B_optim = DeviceBuffer::from_slice(&B)?;
    let d_C_optim = DeviceBuffer::from_slice(&C)?;

    // Define the thread grid dimensions (same for both kernels)
    let grid = GridSize::xy((args.m / BS) as u32, (args.n / BS) as u32);

    // Define the thread block dimensions (specific to each GPU implementation):
    // the optimized kernel computes WPT elements of C per thread, so it needs
    // WPT times fewer threads along the y dimension.
    let blocks_naive = BlockSize::xy(BS as u32, BS as u32);
    let blocks_optim = BlockSize::xy(BS as u32, (BS / WPT) as u32);

    let mut res_C_naive: Vec<f64> = Vec::new();
    let mut durations_naive = Vec::with_capacity(args.repetitions);

    // Benchmark the naive DGEMM GPU implementation
    for i in 0..args.repetitions {
        let t = Instant::now();
        unsafe {
            launch!(
                dgemm_naive<<<grid, blocks_naive, 0, stream>>>(
                    args.m,
                    args.n,
                    args.k,
                    alpha,
                    beta,
                    d_A_naive.as_device_ptr(),
                    d_A_naive.len(),
                    d_B_naive.as_device_ptr(),
                    d_B_naive.len(),
                    d_C_naive.as_device_ptr(),
                )
            )?;
        }
        stream.synchronize()?;

        // Register duration
        durations_naive.push(t.elapsed().as_secs_f64());

        // Store the result after the first iteration
        if i == 0 {
            res_C_naive = d_C_naive.as_host_vec()?;
        }
    }

    // Mean and sample standard deviation of the measured runtimes
    let mean_naive = durations_naive.iter().sum::<f64>() / args.repetitions as f64;
    let sdev_naive = (durations_naive
        .iter()
        .fold(0.0, |acc, d| acc + (d - mean_naive) * (d - mean_naive))
        / (args.repetitions as f64 - 1.0))
        .sqrt();

    let mut res_C_optim: Vec<f64> = Vec::new();
    let mut durations_optim = Vec::with_capacity(args.repetitions);

    // Benchmark the optimized DGEMM GPU implementation
    for i in 0..args.repetitions {
        let t = Instant::now();
        unsafe {
            launch!(
                dgemm_optim<<<grid, blocks_optim, 0, stream>>>(
                    args.m,
                    args.n,
                    args.k,
                    alpha,
                    beta,
                    d_A_optim.as_device_ptr(),
                    d_A_optim.len(),
                    d_B_optim.as_device_ptr(),
                    d_B_optim.len(),
                    d_C_optim.as_device_ptr(),
                )
            )?;
        }
        stream.synchronize()?;

        // Register duration
        durations_optim.push(t.elapsed().as_secs_f64());

        // Store the result after the first iteration
        if i == 0 {
            res_C_optim = d_C_optim.as_host_vec()?;
        }
    }

    let mean_optim = durations_optim.iter().sum::<f64>() / args.repetitions as f64;
    let sdev_optim = (durations_optim
        .iter()
        .fold(0.0, |acc, d| acc + (d - mean_optim) * (d - mean_optim))
        / (args.repetitions as f64 - 1.0))
        .sqrt();

    // Verify that the results after one iteration match the host reference.
    // assert! (rather than debug_assert!) so the check also runs in release
    // builds, where the benchmark is normally compiled.
    if args.debug {
        dgemm_host(args.m, args.n, args.k, alpha, beta, &A, &B, &mut h_C);

        for (idx, (h, (n, o))) in h_C
            .iter()
            .zip(res_C_naive.iter().zip(res_C_optim.iter()))
            .enumerate()
        {
            assert!(
                (h - n).abs() < f64::EPSILON * (args.m * args.n * args.k) as f64,
                "Naive differs from host at index {idx}"
            );
            assert!(
                (h - o).abs() < f64::EPSILON * (args.m * args.n * args.k) as f64,
                "Optimized differs from host at index {idx}"
            );
        }
    }

    println!(
        "\x1b[1m{:20}{:20}{:20}{:20}{}\x1b[0m",
        "Implementation",
        "Matrix dimensions",
        "Grid dimensions",
        "Block dimensions",
        "Average runtime"
    );
    println!(
        "{:20}{:20}{:20}{:20}{}",
        "naive",
        format!("{}x{}", args.m, args.n),
        format!("{}x{}", grid.x, grid.y),
        format!("{}x{}", blocks_naive.x, blocks_naive.y),
        format!(
            "{:?} ± {:?}",
            Duration::from_secs_f64(mean_naive),
            Duration::from_secs_f64(sdev_naive)
        )
    );
    println!(
        "{:20}{:20}{:20}{:20}{}",
        "optimized",
        format!("{}x{}", args.m, args.n),
        format!("{}x{}", grid.x, grid.y),
        format!("{}x{}", blocks_optim.x, blocks_optim.y),
        format!(
            "{:?} ± {:?}",
            Duration::from_secs_f64(mean_optim),
            Duration::from_secs_f64(sdev_optim)
        )
    );

    Ok(())
}
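
Usage note (hypothetical invocation, derived from the clap attributes above): the matrix dimensions, repetition count and correctness check can be overridden on the command line, e.g.

cargo run --release -- -m 1024 -n 1024 -k 1024 -r 11 --debug

Since the grid is computed as (m / BS) x (n / BS) with no remainder handling, the dimensions should remain multiples of BS (32).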

examples/cuda/cpu/gemm/build.rs

Lines changed: 0 additions & 8 deletions
This file was deleted.

examples/cuda/cpu/gemm/src/main.rs

Lines changed: 0 additions & 106 deletions
This file was deleted.

examples/cuda/gpu/gemm_gpu/Cargo.toml renamed to examples/cuda/gpu/bench_dgemm_gpu/Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 [package]
-name = "gemm_gpu"
-version = "0.1.0"
+name = "bench_dgemm_gpu"
+version = "0.2.0"
 edition = "2021"
 
 [dependencies]
