Add buffer cacheline size metric (#4228)

Esteban Padilla Cerdio · facebook-github-bot · commit ae0624f21372 · 2024-07-12T11:56:28.000-07:00
Summary: Pull Request resolved: #4228. This diff introduces a metric to GPUInfo that calculates the cacheline size of the buffer data pathway. In this experiment, all threads read from the cache with a varying stride. Reading two values from the same cacheline is cheap because the whole line is fetched as a block, regardless of which data we actually want. As the separation between the addresses of these two values grows, there is a point at which the shader is forced to fetch two separate cachelines, which has an effect on latency that we can detect. [This article](https://igoro.com/archive/gallery-of-processor-cache-effects/) has more information on the topic. The experiment first calculates the number of iterations (NITER) that the smallest stride needs in order to run for 1000 microseconds. Every experiment then runs this number of iterations. This establishes a timing baseline and avoids timing errors. Each run of the shader fetches the two values from different points in memory. The shader also has a seemingly redundant variable `zero` that keeps the compiler from optimizing away the for loop. The experiment will look like this: {F1754670481} Differential Revision: D59649561
diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+// NOTE(review): the PRECISION value below looks like a build-time template placeholder; confirm.
+#define PRECISION ${PRECISION}
+
+layout(std430) buffer;
+// Input: read-only float array that main() loads from.
+layout(set = 0, binding = 0) buffer PRECISION restrict readonly InBuffer {
+  float data[];
+}
+source;
+// Output: write-only float array; main() stores only to element 0.
+layout(set = 0, binding = 1) buffer PRECISION restrict writeonly OutBuffer {
+  float data[];
+}
+destination;
+// Workgroup dimensions are taken from specialization constants 0, 1 and 2.
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+// Specialization constants read by main(); each defaults to 1 when not overridden.
+layout(constant_id = 3) const int niter = 1; // iterations of the load loop
+layout(constant_id = 4) const int stride = 1; // gap between the two loads, in floats
+layout(constant_id = 5) const int pitch = 1; // gap between invocations' base elements, in floats
+
+// Each invocation sums 2 * niter loads from `source` and stores the sum.
+void main() {
+  float c = 0; // running sum of every value loaded below
+  for (int i = 0; i < niter; ++i) {
+    const int zero = i >> 31; // evaluates to 0 for every non-negative i
+    c += source.data[zero + pitch * gl_GlobalInvocationID[0]]; // base element
+    c += source.data[zero + stride + pitch * gl_GlobalInvocationID[0]]; // base + stride
+  }
+  destination.data[0] = c; // all invocations store to this same element
+}
diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+buf_cacheline_size: # NOTE(review): presumably pairs with buf_cacheline_size.glsl by name; confirm
+  parameter_names_with_default_values:
+    DTYPE: float # NOTE(review): not referenced by the visible .glsl template; confirm it is needed
+    STORAGE: buffer # NOTE(review): not referenced by the visible .glsl template; confirm it is needed
+  shader_variants:
+    - NAME: buf_cacheline_size # same string app.cpp passes to VK_KERNEL_FROM_STR
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -44,6 +44,7 @@ class App {
     const uint32_t NREG_MAX = 512;
     const uint32_t NREG_STEP = 1;
 
+    // TODO: Make these values configurable
     const double COMPENSATE = 0.01;
     const double THRESHOLD = 3;
 
@@ -146,11 +147,78 @@ class App {
               << std::endl;
     std::cout << "Register type," << reg_ty << std::endl;
   }
+
+  // Runs the "buf_cacheline_size" shader with a growing byte stride and prints
+  // the first stride that DtJumpFinder::push() flags (MAX_STRIDE if none does).
+  void buf_cacheline_size() {
+    std::cout << "\n------ Buffer Cacheline Size ------" << std::endl;
+
+    // TODO: Make these values configurable
+    const double COMPENSATE = 0.01;
+    const double THRESHOLD = 2;
+
+    // PITCH and MAX_STRIDE are bytes; bench() converts them to float indices.
+    const uint32_t PITCH = buf_cache_size_ * 2 / nthread_logic_;
+    const uint32_t BUF_SIZE = PITCH * nthread_logic_;
+    const uint32_t MAX_STRIDE = PITCH / 2;
+
+    // Pre-set so bench() can never hand an indeterminate value to the shader.
+    uint32_t NITER = 1;
+
+    // Times the shader for one stride (in bytes) and returns the measurement.
+    auto bench = [&](uint32_t stride) {
+      StorageBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE);
+      StorageBuffer out_buf(context(), vkapi::kFloat, sizeof(float));
+      vkapi::PipelineBarrier pipeline_barrier{};
+      auto shader_name = "buf_cacheline_size";
+      uint32_t stride_div = stride / sizeof(float);
+      uint32_t pitch_div = PITCH / sizeof(float);
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {nthread_logic_, 1, 1},
+            {nthread_logic_, 1, 1},
+            {SV(NITER), SV(stride_div), SV(pitch_div)},
+            VK_NULL_HANDLE,
+            0,
+            in_buf.buffer(),
+            out_buf.buffer());
+      });
+      return time;
+    };
+
+    ensure_min_niter(1000, NITER, [&]() { return bench(sizeof(float)); });
+
+    // `found`, not the final `stride`, decides the outcome: a jump detected
+    // exactly at MAX_STRIDE is a valid result, not an inconclusive run.
+    uint32_t cacheline_size = MAX_STRIDE;
+    bool found = false;
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    for (uint32_t stride = sizeof(float); stride <= MAX_STRIDE && !found;
+         stride += sizeof(float)) {
+      double time = bench(stride);
+      std::cout << "Testing stride=" << stride << ", time=\t" << time
+                << std::endl;
+      if (dj.push(time)) {
+        cacheline_size = stride;
+        found = true;
+      }
+    }
+    if (!found) {
+      std::cout << "Unable to conclude a top level buffer cacheline size."
+                << std::endl;
+    }
+    std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl;
+  }
 };
 
 int main(int argc, const char** argv) {
   App app;
 
+  // TODO: Allow user to skip tests; for now both benchmarks always run
   app.reg_count();
+  app.buf_cacheline_size(); // prints the buffer cacheline-size report to stdout
+
   return 0;
 }