Skip to content

Commit ae0624f

Browse files
Esteban Padilla Cerdio and facebook-github-bot
authored and committed
Add buffer cacheline size metric (#4228)
Summary: Pull Request resolved: #4228 This diff introduces a metric to GPUInfo that calculates the cacheline size of the buffer data pathway. In this experiment, all threads read from the cache with a varying stride. Reading two values from the same cacheline is cheap because the whole line is fetched as a block, regardless of which data we actually want. As the separation between the addresses of these two values grows, there will be a point where the shader is forced to fetch two separate cachelines, which has an effect on latency that we can detect. [This article](https://igoro.com/archive/gallery-of-processor-cache-effects/) has more information on the topic. The experiment first calculates the number of iterations (NITER) that the smallest stride needs in order to run for 1000 microseconds. All experiments then run this number of iterations. This provides a timing baseline and avoids timing errors. Each run of the shader fetches the two values from different points in memory. The shader also has a seemingly redundant variable `zero` that keeps the compiler from optimizing the for loop away. The experiment will look like this: {F1754670481} Differential Revision: D59649561
1 parent 24aef7a commit ae0624f

File tree

3 files changed

+120
-0
lines changed

3 files changed

+120
-0
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#version 450 core
10+
11+
#define PRECISION ${PRECISION}
12+
13+
layout(std430) buffer;
14+
15+
// Input storage buffer (set 0, binding 0), declared readonly; main() reads
// pairs of floats from it.
layout(set = 0, binding = 0) buffer PRECISION restrict readonly InBuffer {
16+
float data[];
17+
}
18+
source;
19+
20+
// Output storage buffer (set 0, binding 1), declared writeonly; main() stores
// its accumulated sum into element 0.
layout(set = 0, binding = 1) buffer PRECISION restrict writeonly OutBuffer {
21+
float data[];
22+
}
23+
destination;
24+
25+
// Workgroup dimensions come from specialization constants 0, 1 and 2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
26+
27+
// Specialization constants (each defaults to 1 when not overridden):
//   niter  - number of loop iterations executed by main().
//   stride - offset, in float elements, between the two reads of an iteration.
//   pitch  - offset, in float elements, between the base indices of
//            consecutive invocations along x.
layout(constant_id = 3) const int niter = 1;
28+
layout(constant_id = 4) const int stride = 1;
29+
layout(constant_id = 5) const int pitch = 1;
30+
31+
32+
// Benchmark kernel: each invocation performs `niter` iterations, and in every
// iteration reads two floats from `source` that are `stride` elements apart,
// starting at this invocation's base index `pitch * gl_GlobalInvocationID[0]`.
void main() {
33+
// Running sum of everything read, so the loads feed a value that is stored.
float c = 0;
34+
for (int i = 0; i < niter; ++i) {
35+
// `i` is never negative inside this loop, so `i >> 31` evaluates to 0 and
// does not move the address. Per the commit summary, this seemingly
// redundant value is there to stop the compiler from optimizing the loop.
const int zero = i >> 31;
36+
// First read: this invocation's base element.
c += source.data[zero + pitch * gl_GlobalInvocationID[0]];
37+
// Second read: `stride` elements past the base element.
c += source.data[zero + stride + pitch * gl_GlobalInvocationID[0]];
38+
}
39+
// Every invocation stores its sum to element 0 of the output buffer.
// NOTE(review): presumably only the store matters (to keep the reads live),
// not which invocation's value ends up there — confirm.
destination.data[0] = c;
40+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
buf_cacheline_size:
8+
parameter_names_with_default_values:
9+
DTYPE: float
10+
STORAGE: buffer
11+
shader_variants:
12+
- NAME: buf_cacheline_size

backends/vulkan/tools/gpuinfo/src/app.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class App {
4444
const uint32_t NREG_MAX = 512;
4545
const uint32_t NREG_STEP = 1;
4646

47+
// TODO: Make these values configurable
4748
const double COMPENSATE = 0.01;
4849
const double THRESHOLD = 3;
4950

@@ -146,11 +147,78 @@ class App {
146147
<< std::endl;
147148
std::cout << "Register type," << reg_ty << std::endl;
148149
}
150+
151+
void buf_cacheline_size() {
152+
std::cout << "\n------ Buffer Cacheline Size ------" << std::endl;
153+
154+
// TODO: Make these values configurable
155+
const double COMPENSATE = 0.01;
156+
const double THRESHOLD = 2;
157+
158+
const uint32_t PITCH = buf_cache_size_ * 2 / nthread_logic_;
159+
const uint32_t BUF_SIZE = PITCH * nthread_logic_;
160+
const uint32_t MAX_STRIDE = PITCH / 2;
161+
162+
uint32_t NITER;
163+
164+
auto bench = [&](int stride) {
165+
size_t len = sizeof(float);
166+
StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
167+
StorageBuffer out_buf(context(), vkapi::kFloat, len);
168+
vkapi::PipelineBarrier pipeline_barrier{};
169+
170+
auto shader_name = "buf_cacheline_size";
171+
172+
uint32_t stride_div = stride / sizeof(float);
173+
uint32_t pitch_div = PITCH / sizeof(float);
174+
175+
auto time = benchmark_on_gpu(shader_name, 100, [&]() {
176+
context()->submit_compute_job(
177+
VK_KERNEL_FROM_STR(shader_name),
178+
pipeline_barrier,
179+
{nthread_logic_, 1, 1},
180+
{nthread_logic_, 1, 1},
181+
{SV(NITER), SV(stride_div), SV(pitch_div)},
182+
VK_NULL_HANDLE,
183+
0,
184+
in_buf.buffer(),
185+
out_buf.buffer());
186+
});
187+
return time;
188+
};
189+
190+
ensure_min_niter(1000, NITER, [&]() { return bench(sizeof(float)); });
191+
192+
uint32_t cacheline_size;
193+
194+
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
195+
uint32_t stride = sizeof(float);
196+
for (; stride <= MAX_STRIDE; stride += sizeof(float)) {
197+
double time = bench(stride);
198+
std::cout << "Testing stride=" << stride << ", time=\t" << time
199+
<< std::endl;
200+
201+
if (dj.push(time)) {
202+
cacheline_size = stride;
203+
break;
204+
}
205+
}
206+
if (stride >= MAX_STRIDE) {
207+
std::cout << "Unable to conclude a top level buffer cacheline size."
208+
<< std::endl;
209+
cacheline_size = MAX_STRIDE;
210+
}
211+
212+
std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
213+
}
149214
};
150215

151216
int main(int argc, const char** argv) {
152217
App app;
153218

219+
// TODO: Allow user to skip tests
154220
app.reg_count();
221+
app.buf_cacheline_size();
222+
155223
return 0;
156224
}

0 commit comments

Comments
 (0)