Skip to content

Commit 35f2422

Browse files
Esteban Padilla Cerdio authored and facebook-github-bot committed
Add buffer cacheline size metric (pytorch#4228)
Summary: Pull Request resolved: pytorch#4228 {F1753540374} Differential Revision: D59649561
1 parent 47260e4 commit 35f2422

File tree

3 files changed

+120
-1
lines changed

3 files changed

+120
-1
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core

#define PRECISION ${PRECISION}

layout(std430) buffer;

// Read-only input buffer sampled by the benchmark loop in main().
layout(set = 0, binding = 0) buffer PRECISION restrict readonly InBuffer {
  float data[];
}
source;

// Write-only output buffer; main() stores its accumulated sum in element 0.
layout(set = 0, binding = 1) buffer PRECISION restrict writeonly OutBuffer {
  float data[];
}
destination;

// Workgroup dimensions come from specialization constants 0, 1 and 2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Benchmark parameters, overridable through specialization constants 3-5.
// stride and pitch are used directly as indices into source.data, i.e. they
// are counted in float elements rather than bytes.
layout(constant_id = 3) const int niter = 1;
layout(constant_id = 4) const int stride = 1;
layout(constant_id = 5) const int pitch = 1;

/*
 * Each invocation reads two elements of source.data per iteration: one at
 * pitch * <global x id> and one `stride` elements after it. The reads are
 * summed and the total is written to destination.data[0] so that the loads
 * contribute to an observable result.
 */
void main() {
  float acc = 0.0;
  for (int iter = 0; iter < niter; ++iter) {
    // Same two addresses on every iteration; only niter changes the amount
    // of work done.
    uint first = pitch * gl_GlobalInvocationID.x;
    uint second = stride + first;
    acc += source.data[first];
    acc += source.data[second];
  }
  destination.data[0] = acc;
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
buf_cacheline_size:
8+
parameter_names_with_default_values:
9+
DTYPE: float
10+
STORAGE: buffer
11+
shader_variants:
12+
- NAME: buf_cacheline_size

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class App {
4444
const uint32_t NREG_MAX = 512;
4545
const uint32_t NREG_STEP = 1;
4646

47+
// TODO: Make these values configurable
4748
const double COMPENSATE = 0.01;
4849
const double THRESHOLD = 3;
4950

@@ -146,11 +147,78 @@ class App {
146147
<< std::endl;
147148
std::cout << "Register type," << reg_ty << std::endl;
148149
}
150+
151+
void buf_cacheline_size() {
152+
std::cout << "\n------ Buffer Cacheline Size ------" << std::endl;
153+
154+
// TODO: Make these values configurable
155+
const double COMPENSATE = 0.01;
156+
const double THRESHOLD = 2;
157+
158+
const uint32_t PITCH = buf_cache_size_ * 2 / nthread_logic_;
159+
const uint32_t BUF_SIZE = PITCH * nthread_logic_;
160+
const uint32_t MAX_STRIDE = PITCH / 2;
161+
162+
uint32_t NITER;
163+
164+
auto bench = [&](int stride) {
165+
size_t len = sizeof(float);
166+
StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
167+
StorageBuffer out_buf(context(), vkapi::kFloat, len);
168+
vkapi::PipelineBarrier pipeline_barrier{};
169+
170+
auto shader_name = "buf_cacheline_size";
171+
172+
uint32_t stride_div = stride / sizeof(float);
173+
uint32_t pitch_div = PITCH / sizeof(float);
174+
175+
auto time = benchmark_on_gpu(shader_name, 100, [&]() {
176+
context()->submit_compute_job(
177+
VK_KERNEL_FROM_STR(shader_name),
178+
pipeline_barrier,
179+
{nthread_logic_, 1, 1},
180+
{nthread_logic_, 1, 1},
181+
{SV(NITER), SV(stride_div), SV(pitch_div)},
182+
VK_NULL_HANDLE,
183+
0,
184+
in_buf.buffer(),
185+
out_buf.buffer());
186+
});
187+
return time;
188+
};
189+
190+
ensure_min_niter(1000, NITER, [&]() { return bench(sizeof(float)); });
191+
192+
uint32_t cacheline_size;
193+
194+
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
195+
uint32_t stride = sizeof(float);
196+
for (; stride <= MAX_STRIDE; stride += sizeof(float)) {
197+
double time = bench(stride);
198+
std::cout << "Testing stride=" << stride << ", time=\t" << time
199+
<< std::endl;
200+
201+
if (dj.push(time)) {
202+
cacheline_size = stride;
203+
break;
204+
}
205+
}
206+
if (stride >= MAX_STRIDE) {
207+
std::cout
208+
<< "Reached end of memory, line size must be equal to global memory cache size available across all threads"
209+
<< std::endl;
210+
cacheline_size = MAX_STRIDE;
211+
}
212+
213+
std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
214+
}
149215
};
150216

151217
int main(int argc, const char** argv) {
152218
App app;
153219

154-
app.reg_count();
220+
// app.reg_count();
221+
app.buf_cacheline_size();
222+
155223
return 0;
156224
}

0 commit comments

Comments
 (0)