Skip to content

Commit 4fb2813

Browse files
authored
Add Coralgemm benchmark (#262)
1 parent d5a9194 commit 4fb2813

File tree

1 file changed

+164
-0
lines changed

1 file changed

+164
-0
lines changed
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# Copyright 2024 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
2+
# ReFrame Project Developers. See the top-level LICENSE file for details.
3+
#
4+
# SPDX-License-Identifier: BSD-3-Clause
5+
6+
import reframe as rfm
7+
import reframe.utility.sanity as sn
8+
9+
10+
@rfm.simple_test
class CoralGemm(rfm.RunOnlyRegressionTest):
    """Run the AMD CoralGemm GEMM benchmark on all GPUs of a node.

    The benchmark binary launches the GEMM workload on every visible
    GPU simultaneously and periodically prints one line of performance
    figures containing one column per GPU plus one for the timestamp.
    """

    valid_systems = ['+amdgpu']
    valid_prog_environs = ['+rocm']
    build_system = 'CMake'

    # Data precision for matrix A, B, C and computation
    # (e.g. R_64F for real double precision)
    precision_A = variable(str, value='R_64F')
    precision_B = variable(str, value='R_64F')
    precision_C = variable(str, value='R_64F')
    compute_precision = variable(str, value='R_64F')

    # Operation applied to matrix A and B, e.g. OP_N, OP_T, OP_C
    op_A = variable(str, value='OP_N')
    op_B = variable(str, value='OP_T')

    # Matrix dimensions
    M = variable(int, value=9728)
    N = variable(int, value=6144)
    K = variable(int, value=8192)

    # Leading dimensions of matrix A, B, C
    lda = variable(int, value=9728)
    ldb = variable(int, value=6144)
    ldc = variable(int, value=9728)

    # Number of batched matrices
    batch_count = variable(int, value=10)

    # Duration to run the GEMM operation in seconds
    duration = variable(int, value=45)

    # Optional arguments to run the extended versions of the benchmark
    batched = variable(bool, value=False)
    strided = variable(bool, value=False)
    ex_api = variable(bool, value=False)
    hipBLASLt_api = variable(bool, value=False)

    # A, B, C matrices are stored in host memory
    host_A = variable(bool, value=False)
    host_B = variable(bool, value=False)
    host_C = variable(bool, value=False)

    # If in host memory, A/B/C is coherent (not cached)
    coherent_A = variable(bool, value=False)
    coherent_B = variable(bool, value=False)
    coherent_C = variable(bool, value=False)

    shared_A = variable(bool, value=False)
    shared_B = variable(bool, value=False)

    # Set beta to zero
    zero_beta = variable(bool, value=False)

    sourcesdir = 'https://github.com/AMD-HPC/CoralGemm.git'
    num_tasks_per_node = 1
    tags = {'benchmark'}

    @run_after('setup')
    def set_num_gpus(self):
        """Record the GPU count of the current partition."""
        curr_part = self.current_partition
        self.num_gpus = curr_part.select_devices('gpu')[0].num_devices

    @run_before('run')
    def set_executable(self):
        """Assemble the benchmark command line from the test variables."""
        # Mandatory positional arguments of the benchmark
        self.executable = (
            './gemm '
            f'{self.precision_A} '
            f'{self.precision_B} '
            f'{self.precision_C} '
            f'{self.compute_precision} '
            f'{self.op_A} '
            f'{self.op_B} '
            f'{self.M} '
            f'{self.N} '
            f'{self.K} '
            f'{self.lda} '
            f'{self.ldb} '
            f'{self.ldc} '
            f'{self.batch_count} '
            f'{self.duration}'
        )

        # Optional arguments: each boolean variable maps to a keyword
        # appended to the command line; order matters and is preserved.
        optional_args = (
            (self.batched, 'batched'),
            (self.strided, 'strided'),
            (self.ex_api, 'ex'),
            (self.hipBLASLt_api, 'lt'),
            (self.host_A, 'hostA'),
            (self.host_B, 'hostB'),
            (self.host_C, 'hostC'),
            (self.coherent_A, 'coherentA'),
            (self.coherent_B, 'coherentB'),
            (self.coherent_C, 'coherentC'),
            (self.shared_A, 'sharedA'),
            (self.shared_B, 'sharedB'),
            (self.zero_beta, 'zeroBeta'),
        )
        for enabled, keyword in optional_args:
            if enabled:
                self.executable += f' {keyword}'

        # Set the time limit with a padding of 2 minutes
        self.time_limit = self.duration + 120

    @sanity_function
    def assert_results(self):
        """Check that all (and only) the expected GPUs reported results."""
        # The binary automatically launches on all available GPUs
        # simultaneously, so we check that the output contains performance
        # results for all GPUs (devices are numbered from 0).
        s1 = sn.all([
            sn.assert_found(rf'device_{i}_\[GFLOPS\]', self.stdout)
            for i in range(self.num_gpus)
        ])

        # We also check that the output does not contain more GPUs than
        # the expected number. In case of misconfiguration, the node can
        # appear to have more GPUs than it actually has, with lower
        # performance. Since devices are numbered from 0, the first
        # unexpected device is ``device_{num_gpus}`` (the original check
        # used ``num_gpus + 1`` and missed a single extra device); the
        # ``_\[GFLOPS\]`` suffix anchors the match so that e.g.
        # ``device_1`` does not also match ``device_10``.
        s2 = sn.assert_not_found(
            rf'device_{self.num_gpus}_\[GFLOPS\]', self.stdout
        )

        return sn.all([s1, s2])

    @performance_function('GFlops')
    def min_gflops(self):
        """Minimum GFLOPS achieved across all GPUs over the whole run."""
        # Each output line holds one column per GPU and one for the
        # timestamp; match a float for each column. The dot must be
        # escaped (the original ``\d+.\d+`` matched any character
        # between the digit runs).
        regex = r'^'
        regex += ''.join(r'\s*(\d+\.\d+)' for _ in range(self.num_gpus + 1))
        regex += r'\s*$'
        # NOTE(review): group ``i + 1`` is assumed to be GPU ``i``'s
        # column; if the timestamp is the *first* column this skips the
        # last GPU — verify against the actual benchmark output format.
        return sn.min(
            sn.min(
                sn.extractall(regex, self.stdout, i + 1, float)
            ) for i in range(self.num_gpus)
        )

0 commit comments

Comments
 (0)