From 4b45264d31a0904dc343898475b380c8afc6a66f Mon Sep 17 00:00:00 2001
From: Esteban Padilla Cerdio <estebanpadilla@meta.com>
Date: Mon, 15 Jul 2024 04:56:09 -0700
Subject: [PATCH] Get device information from OpenCL (#4210)

Summary:
Pull Request resolved: https://github.com/pytorch/executorch/pull/4210

Some of the device's properties, like the number of available Streaming Processors or the total size of the cache, cannot be queried directly through Vulkan. These values are essential for ArchProbe's algorithms, so we need to do some preprocessing with OpenCL to extract them before moving to the Vulkan implementation.

This diff moves the GPUInfo implementation into a class to have better control of internal variables like the device's properties, and obtains important limits that will be used in the following diffs, like the device's SM Count, Cache size and number of logic threads.

Reviewed By: jorgep31415

Differential Revision: D59636879

fbshipit-source-id: 1b9aa3a4ce48f360526e09576864514b2ac35429
---
 backends/vulkan/tools/gpuinfo/include/utils.h |  30 +++
 backends/vulkan/tools/gpuinfo/src/app.cpp     | 229 ++++++++++--------
 2 files changed, 160 insertions(+), 99 deletions(-)

diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h
index 7fa67f2463..231fb32c5a 100644
--- a/backends/vulkan/tools/gpuinfo/include/utils.h
+++ b/backends/vulkan/tools/gpuinfo/include/utils.h
@@ -10,6 +10,10 @@
 
 #include <executorch/backends/vulkan/runtime/api/api.h>
 
+#define CL_TARGET_OPENCL_VERSION 200
+#define CL_HPP_TARGET_OPENCL_VERSION CL_TARGET_OPENCL_VERSION
+#include <CL/opencl.hpp>
+
 using namespace vkcompute;
 using namespace api;
 
@@ -49,3 +53,29 @@ void ensure_min_niter(
     niter = uint32_t(niter * min_time_us / t);
   }
 }
+
+// Returns the first OpenCL platform, or nullptr when none is reported.
+inline cl_platform_id get_cl_platform_id() {
+  cl_uint nplatform_id = 0;  // defined even if the count query fails
+  clGetPlatformIDs(0, nullptr, &nplatform_id);
+  std::vector<cl_platform_id> platform_ids(nplatform_id, nullptr);
+  clGetPlatformIDs(nplatform_id, platform_ids.data(), nullptr);
+  return platform_ids.empty() ? nullptr : platform_ids[0];
+}
+
+// Returns the platform's first OpenCL device, or nullptr when none is found.
+inline cl_device_id get_cl_dev_id(cl_platform_id platform_id) {
+  cl_uint ndev_id = 0;  // defined even if the count query fails
+  clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, nullptr, &ndev_id);
+  std::vector<cl_device_id> dev_ids(ndev_id, nullptr);
+  clGetDeviceIDs(
+      platform_id, CL_DEVICE_TYPE_ALL, ndev_id, dev_ids.data(), nullptr);
+  return dev_ids.empty() ? nullptr : dev_ids[0];
+}
+
+// Wraps the id returned by get_cl_dev_id(get_cl_platform_id()) in a cl::Device.
+inline cl::Device get_cl_device() {
+  const cl_platform_id platform_id = get_cl_platform_id();
+  const cl_device_id dev_id = get_cl_dev_id(platform_id);
+  return cl::Device(dev_id);
+}
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
index 2949104328..a0ffcaa746 100644
--- a/backends/vulkan/tools/gpuinfo/src/app.cpp
+++ b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -13,117 +13,148 @@
 #include "stats.h"
 #include "utils.h"
 
-void reg_count() {
-  const uint32_t NREG_MIN = 1;
-  const uint32_t NREG_MAX = 512;
-  const uint32_t NREG_STEP = 1;
-
-  const double COMPENSATE = 0.01;
-  const double THRESHOLD = 3;
-
-  const uint32_t NGRP_MIN = 1;
-  const uint32_t NGRP_MAX = 64;
-  const uint32_t NGRP_STEP = 1;
-
-  uint32_t NITER;
-
-  auto bench = [&](uint32_t ngrp, uint32_t nreg) {
-    size_t len = sizeof(float);
-    StorageBuffer buffer(context(), vkapi::kFloat, len);
-    ParamsBuffer params(context(), int32_t(len));
-    vkapi::PipelineBarrier pipeline_barrier{};
-
-    auto shader_name = "reg_count_" + std::to_string(nreg);
-
-    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
-      context()->submit_compute_job(
-          VK_KERNEL_FROM_STR(shader_name),
-          pipeline_barrier,
-          {1, ngrp, 1},
-          {1, 1, 1},
-          {SV(NITER)},
-          VK_NULL_HANDLE,
-          0,
-          buffer.buffer(),
-          params.buffer());
-    });
-    return time;
-  };
-
-  std::cout << "Calculating NITER..." << std::endl;
-  ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
-  std::cout << "NITER," << NITER << std::endl;
-
-  uint32_t nreg_max;
-
-  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
-  uint32_t nreg = NREG_MIN;
-  for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
-    double time = bench(1, nreg);
-    std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
-    if (dj.push(time)) {
-      nreg -= NREG_STEP;
-      nreg_max = nreg;
-      break;
-    }
-  }
-  if (nreg >= NREG_MAX) {
-    std::cout << "Unable to conclude a maximal register count" << std::endl;
-    nreg_max = NREG_STEP;
-  } else {
-    std::cout << nreg_max << " registers are available at most" << std::endl;
+using namespace vkapi;
+
+class App {  // Holds device limits read via OpenCL; runs the register probe.
+ private:
+  size_t buf_cache_size_;  // CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, set in App()
+  uint32_t sm_count_;  // CL_DEVICE_MAX_COMPUTE_UNITS, set in App()
+  uint32_t nthread_logic_;  // CL_DEVICE_MAX_WORK_GROUP_SIZE, set in App()
+
+ public:
+  App() {  // Initializes the query pool, then queries and prints device limits.
+    context()->initialize_querypool();
+
+    std::cout << context()->adapter_ptr()->stringize() << std::endl
+              << std::endl;
+
+    auto cl_device = get_cl_device();  // OpenCL device helper from utils.h
+
+    sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
+
+    std::cout << std::endl;
+    std::cout << "SM count," << sm_count_ << std::endl;  // "name,value" rows
+    std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
+    std::cout << "Cache Size," << buf_cache_size_ << std::endl;
   }
 
-  auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
-    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
-    for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
-      auto time = bench(ngrp, nreg);
-      std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
-                << ", time=" << time << " us" << std::endl;
+  void reg_count() {  // Times reg_count_<N> shaders and prints register limits.
+    std::cout << std::endl;
+    std::cout << "------ Register Count ------" << std::endl;
+    const uint32_t NREG_MIN = 1;  // scan range for the nreg loop below
+    const uint32_t NREG_MAX = 512;
+    const uint32_t NREG_STEP = 1;
+
+    const double COMPENSATE = 0.01;  // DtJumpFinder constructor arguments
+    const double THRESHOLD = 3;
+
+    const uint32_t NGRP_MIN = 1;  // scan range used by find_ngrp_by_nreg
+    const uint32_t NGRP_MAX = 64;
+    const uint32_t NGRP_STEP = 1;
+
+    uint32_t NITER;  // tuned by ensure_min_niter() below; passed as SV(NITER)
+
+    auto bench = [&](uint32_t ngrp, uint32_t nreg) {  // returns the GPU time
+      size_t len = sizeof(float);
+      StorageBuffer buffer(context(), vkapi::kFloat, len);
+      ParamsBuffer params(context(), int32_t(len));
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "reg_count_" + std::to_string(nreg);  // picked by nreg
+
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {1, ngrp, 1},  // presumably the global work size - TODO confirm
+            {1, 1, 1},  // presumably the local work size - TODO confirm
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            buffer.buffer(),
+            params.buffer());
+      });
+      return time;
+    };
+
+    std::cout << "Calculating NITER..." << std::endl;
+    ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
+    std::cout << "NITER," << NITER << std::endl;
+
+    uint32_t nreg_max;  // assigned on both paths below before it is read
 
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);  // push() == true ends the scan
+    uint32_t nreg = NREG_MIN;
+    for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
+      double time = bench(1, nreg);
+      std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time
+                << std::endl;
       if (dj.push(time)) {
-        ngrp -= NGRP_STEP;
-        std::cout << "Using " << nreg << " registers can have " << ngrp
-                  << " concurrent single-thread workgroups" << std::endl;
-        return ngrp;
+        nreg -= NREG_STEP;  // step back to the count before the flagged one
+        nreg_max = nreg;
+        break;
       }
     }
-    std::cout
-        << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
-        << nreg << " registers are occupied" << std::endl;
-    return (uint32_t)1;
-  };
-
-  uint32_t ngrp_full, ngrp_half;
-  ngrp_full = find_ngrp_by_nreg(nreg_max);
-  ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
-
-  std::string reg_ty;
+    if (nreg >= NREG_MAX) {  // scan finished without push() returning true
+      std::cout << "Unable to conclude a maximal register count" << std::endl;
+      nreg_max = NREG_STEP;  // fallback value
+    } else {
+      std::cout << nreg_max << " registers are available at most" << std::endl;
+    }
 
-  if (ngrp_full * 1.5 < ngrp_half) {
-    std::cout << "All physical threads in an sm share " << nreg_max
-              << " registers" << std::endl;
-    reg_ty = "Pooled";
+    auto find_ngrp_by_nreg = [&](const uint32_t nreg) {  // same scan over ngrp
+      DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+      for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
+        auto time = bench(ngrp, nreg);
+        std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
+                  << ", time=" << time << " us" << std::endl;
+
+        if (dj.push(time)) {
+          ngrp -= NGRP_STEP;  // last group count before push() fired
+          std::cout << "Using " << nreg << " registers can have " << ngrp
+                    << " concurrent single-thread workgroups" << std::endl;
+          return ngrp;
+        }
+      }
+      std::cout
+          << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
+          << nreg << " registers are occupied" << std::endl;
+      return (uint32_t)1;  // fallback when push() never fires
+    };
+
+    uint32_t ngrp_full, ngrp_half;
+    ngrp_full = find_ngrp_by_nreg(nreg_max);
+    ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
+
+    std::string reg_ty;
+
+    if (ngrp_full * 1.5 < ngrp_half) {  // half the registers gave >1.5x groups
+      std::cout << "All physical threads in an sm share " << nreg_max
+                << " registers" << std::endl;
+      reg_ty = "Pooled";
+
+    } else {
+      std::cout << "Each physical thread has " << nreg_max << " registers"
+                << std::endl;
+      reg_ty = "Dedicated";
+    }
 
-  } else {
-    std::cout << "Each physical thread has " << nreg_max << " registers"
+    std::cout << std::endl << std::endl;  // blank lines before summary rows
+    std::cout << "NITER," << NITER << std::endl;
+    std::cout << "Max registers," << nreg_max << std::endl;
+    std::cout << "Concurrent full single thread workgroups," << ngrp_full
+              << std::endl;
+    std::cout << "Concurrent half single thread workgroups," << ngrp_half
               << std::endl;
-    reg_ty = "Dedicated";
+    std::cout << "Register type," << reg_ty << std::endl;
   }
-
-  std::cout << "\n\nNITER," << NITER << std::endl;
-  std::cout << "Max registers," << nreg_max << std::endl;
-  std::cout << "Concurrent full single thread workgroups," << ngrp_full
-            << std::endl;
-  std::cout << "Concurrent half single thread workgroups," << ngrp_half
-            << std::endl;
-  std::cout << "Register type," << reg_ty << std::endl;
-}
+};
 
 int main(int argc, const char** argv) {
-  context()->initialize_querypool();
-
-  reg_count();
+  App app;  // constructor initializes the query pool and prints device info
 
+  app.reg_count();  // run the register-count probe
   return 0;
 }