Get device information from OpenCL (#4210)

Summary: Pull Request resolved: #4210 Some of the device's properties, like the number of available Streaming Processors, or the total size of the cache, are not supported by Vulkan as direct queries. These variables are essential for ArchProbe's algorithms, so we need to do some preprocessing with OpenCL to extract these values, before moving to the Vulkan implementation. This diff moves the GPUInfo implementation into a class to have better control of internal variables like the device's properties, and obtains important limits that will be used in the following diffs, like the device's SM Count, Cache size and number of logic threads. Reviewed By: jorgep31415 Differential Revision: D59636879 fbshipit-source-id: 1b9aa3a4ce48f360526e09576864514b2ac35429
pytorch · Jul 15, 2024 · 4b45264 · 4b45264
1 parent 93a7725
commit 4b45264
Show file tree

Hide file tree

Showing 2 changed files with 160 additions and 99 deletions.
diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h
@@ -10,6 +10,10 @@
 
 #include <executorch/backends/vulkan/runtime/api/api.h>
 
+#include <stdexcept>
+#include <vector>
+
+#define CL_TARGET_OPENCL_VERSION 200
+#define CL_HPP_TARGET_OPENCL_VERSION CL_TARGET_OPENCL_VERSION
+#include <CL/opencl.hpp>
+
 using namespace vkcompute;
 using namespace api;
 
@@ -49,3 +53,29 @@ void ensure_min_niter(
     niter = uint32_t(niter * min_time_us / t);
   }
 }
+
+// Returns the first OpenCL platform reported by clGetPlatformIDs().
+//
+// Throws std::runtime_error when the query fails or reports zero platforms,
+// so the function never reads element 0 of an empty vector.
+// Declared inline because this definition lives in a header.
+inline cl_platform_id get_cl_platform_id() {
+  cl_uint nplatform_id = 0;
+  const cl_int count_status = clGetPlatformIDs(0, nullptr, &nplatform_id);
+  if (count_status != CL_SUCCESS || nplatform_id == 0) {
+    throw std::runtime_error("No OpenCL platform is available");
+  }
+
+  std::vector<cl_platform_id> platform_ids(nplatform_id);
+  const cl_int list_status =
+      clGetPlatformIDs(nplatform_id, platform_ids.data(), nullptr);
+  if (list_status != CL_SUCCESS) {
+    throw std::runtime_error("Failed to enumerate OpenCL platforms");
+  }
+  return platform_ids[0];
+}
+
+// Returns the first OpenCL device (of any type) exposed by `platform_id`.
+//
+// Throws std::runtime_error when clGetDeviceIDs() fails or reports zero
+// devices, so the function never reads element 0 of an empty vector.
+// Declared inline because this definition lives in a header.
+inline cl_device_id get_cl_dev_id(cl_platform_id platform_id) {
+  cl_uint ndev_id = 0;
+  const cl_int count_status =
+      clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, nullptr, &ndev_id);
+  if (count_status != CL_SUCCESS || ndev_id == 0) {
+    throw std::runtime_error("No OpenCL device is available");
+  }
+
+  std::vector<cl_device_id> dev_ids(ndev_id);
+  const cl_int list_status = clGetDeviceIDs(
+      platform_id, CL_DEVICE_TYPE_ALL, ndev_id, dev_ids.data(), nullptr);
+  if (list_status != CL_SUCCESS) {
+    throw std::runtime_error("Failed to enumerate OpenCL devices");
+  }
+  return dev_ids[0];
+}
+
+// Returns a cl::Device wrapping the device chosen by get_cl_dev_id() on the
+// platform chosen by get_cl_platform_id().
+// Declared inline because this definition lives in a header.
+inline cl::Device get_cl_device() {
+  const cl_platform_id platform_id = get_cl_platform_id();
+  const cl_device_id dev_id = get_cl_dev_id(platform_id);
+  return cl::Device(dev_id);
+}
diff --git a/backends/vulkan/tools/gpuinfo/src/app.cpp b/backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -13,117 +13,148 @@
 #include "stats.h"
 #include "utils.h"
 
-void reg_count() {
-  const uint32_t NREG_MIN = 1;
-  const uint32_t NREG_MAX = 512;
-  const uint32_t NREG_STEP = 1;
-
-  const double COMPENSATE = 0.01;
-  const double THRESHOLD = 3;
-
-  const uint32_t NGRP_MIN = 1;
-  const uint32_t NGRP_MAX = 64;
-  const uint32_t NGRP_STEP = 1;
-
-  uint32_t NITER;
-
-  auto bench = [&](uint32_t ngrp, uint32_t nreg) {
-    size_t len = sizeof(float);
-    StorageBuffer buffer(context(), vkapi::kFloat, len);
-    ParamsBuffer params(context(), int32_t(len));
-    vkapi::PipelineBarrier pipeline_barrier{};
-
-    auto shader_name = "reg_count_" + std::to_string(nreg);
-
-    auto time = benchmark_on_gpu(shader_name, 100, [&]() {
-      context()->submit_compute_job(
-          VK_KERNEL_FROM_STR(shader_name),
-          pipeline_barrier,
-          {1, ngrp, 1},
-          {1, 1, 1},
-          {SV(NITER)},
-          VK_NULL_HANDLE,
-          0,
-          buffer.buffer(),
-          params.buffer());
-    });
-    return time;
-  };
-
-  std::cout << "Calculating NITER..." << std::endl;
-  ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
-  std::cout << "NITER," << NITER << std::endl;
-
-  uint32_t nreg_max;
-
-  DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
-  uint32_t nreg = NREG_MIN;
-  for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
-    double time = bench(1, nreg);
-    std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
-    if (dj.push(time)) {
-      nreg -= NREG_STEP;
-      nreg_max = nreg;
-      break;
-    }
-  }
-  if (nreg >= NREG_MAX) {
-    std::cout << "Unable to conclude a maximal register count" << std::endl;
-    nreg_max = NREG_STEP;
-  } else {
-    std::cout << nreg_max << " registers are available at most" << std::endl;
+using namespace vkapi;
+
+class App {
+ private:
+  size_t buf_cache_size_;
+  uint32_t sm_count_;
+  uint32_t nthread_logic_;
+
+ public:
+  // Sets up the query pool, prints the adapter, and caches OpenCL limits.
+  App() {
+    context()->initialize_querypool();
+    // Print the Vulkan adapter description followed by an empty line.
+    std::cout << context()->adapter_ptr()->stringize() << std::endl
+              << std::endl;
+    // Vulkan has no direct query for these limits, so read them via OpenCL.
+    const cl::Device device = get_cl_device();
+    sm_count_ = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    nthread_logic_ = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    buf_cache_size_ = device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();
+    // Report the cached limits as "name,value" lines after an empty line.
+    std::cout << std::endl
+              << "SM count," << sm_count_ << std::endl
+              << "Logic Thread Count," << nthread_logic_ << std::endl
+              << "Cache Size," << buf_cache_size_ << std::endl;
+  }
 
-  auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
-    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
-    for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
-      auto time = bench(ngrp, nreg);
-      std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
-                << ", time=" << time << " us" << std::endl;
+  void reg_count() {  // Times "reg_count_<nreg>" shaders and prints register statistics.
+    std::cout << std::endl;
+    std::cout << "------ Register Count ------" << std::endl;
+    const uint32_t NREG_MIN = 1;
+    const uint32_t NREG_MAX = 512;
+    const uint32_t NREG_STEP = 1;
+
+    const double COMPENSATE = 0.01;
+    const double THRESHOLD = 3;
+
+    const uint32_t NGRP_MIN = 1;
+    const uint32_t NGRP_MAX = 64;
+    const uint32_t NGRP_STEP = 1;
+
+    uint32_t NITER;  // set through ensure_min_niter() below
+
+    auto bench = [&](uint32_t ngrp, uint32_t nreg) {  // returns the benchmark_on_gpu() result for one (ngrp, nreg) pair
+      size_t len = sizeof(float);
+      StorageBuffer buffer(context(), vkapi::kFloat, len);
+      ParamsBuffer params(context(), int32_t(len));
+      vkapi::PipelineBarrier pipeline_barrier{};
+
+      auto shader_name = "reg_count_" + std::to_string(nreg);
+
+      auto time = benchmark_on_gpu(shader_name, 100, [&]() {
+        context()->submit_compute_job(
+            VK_KERNEL_FROM_STR(shader_name),
+            pipeline_barrier,
+            {1, ngrp, 1},  // NOTE(review): presumably the global workgroup size - confirm
+            {1, 1, 1},  // NOTE(review): presumably the local workgroup size - confirm
+            {SV(NITER)},
+            VK_NULL_HANDLE,
+            0,
+            buffer.buffer(),
+            params.buffer());
+      });
+      return time;
+    };
+
+    std::cout << "Calculating NITER..." << std::endl;
+    ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
+    std::cout << "NITER," << NITER << std::endl;
+
+    uint32_t nreg_max;  // assigned in the loop on break, otherwise in the fallback branch below
 
+    DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+    uint32_t nreg = NREG_MIN;
+    for (; nreg <= NREG_MAX; nreg += NREG_STEP) {  // sweep nreg until dj.push() returns true
+      double time = bench(1, nreg);
+      std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time
+                << std::endl;
       if (dj.push(time)) {
-        ngrp -= NGRP_STEP;
-        std::cout << "Using " << nreg << " registers can have " << ngrp
-                  << " concurrent single-thread workgroups" << std::endl;
-        return ngrp;
+        nreg -= NREG_STEP;  // step back to the value tested before push() returned true
+        nreg_max = nreg;
+        break;
       }
     }
-    std::cout
-        << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
-        << nreg << " registers are occupied" << std::endl;
-    return (uint32_t)1;
-  };
-
-  uint32_t ngrp_full, ngrp_half;
-  ngrp_full = find_ngrp_by_nreg(nreg_max);
-  ngrp_half = find_ngrp_by_nreg(nreg_max / 2);
-
-  std::string reg_ty;
+    if (nreg >= NREG_MAX) {  // true only when the loop above finished without break
+      std::cout << "Unable to conclude a maximal register count" << std::endl;
+      nreg_max = NREG_STEP;  // fallback value used by the rest of the method
+    } else {
+      std::cout << nreg_max << " registers are available at most" << std::endl;
+    }
 
-  if (ngrp_full * 1.5 < ngrp_half) {
-    std::cout << "All physical threads in an sm share " << nreg_max
-              << " registers" << std::endl;
-    reg_ty = "Pooled";
+    auto find_ngrp_by_nreg = [&](const uint32_t nreg) {  // sweeps ngrp for a fixed nreg; returns 1 if push() never fires
+      DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
+      for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
+        auto time = bench(ngrp, nreg);
+        std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
+                  << ", time=" << time << " us" << std::endl;
+
+        if (dj.push(time)) {
+          ngrp -= NGRP_STEP;  // step back to the value tested before push() returned true
+          std::cout << "Using " << nreg << " registers can have " << ngrp
+                    << " concurrent single-thread workgroups" << std::endl;
+          return ngrp;
+        }
+      }
+      std::cout
+          << "Unable to conclude a maximum number of concurrent single-thread workgroups when "
+          << nreg << " registers are occupied" << std::endl;
+      return (uint32_t)1;
+    };
+
+    uint32_t ngrp_full, ngrp_half;
+    ngrp_full = find_ngrp_by_nreg(nreg_max);
+    ngrp_half = find_ngrp_by_nreg(nreg_max / 2);  // same sweep with half the register count
+
+    std::string reg_ty;
+
+    if (ngrp_full * 1.5 < ngrp_half) {  // halving nreg gave more than 1.5x the workgroups
+      std::cout << "All physical threads in an sm share " << nreg_max
+                << " registers" << std::endl;
+      reg_ty = "Pooled";
+
+    } else {
+      std::cout << "Each physical thread has " << nreg_max << " registers"
+                << std::endl;
+      reg_ty = "Dedicated";
+    }
 
-  } else {
-    std::cout << "Each physical thread has " << nreg_max << " registers"
+    std::cout << std::endl << std::endl;  // summary follows as "name,value" lines
+    std::cout << "NITER," << NITER << std::endl;
+    std::cout << "Max registers," << nreg_max << std::endl;
+    std::cout << "Concurrent full single thread workgroups," << ngrp_full
+              << std::endl;
+    std::cout << "Concurrent half single thread workgroups," << ngrp_half
               << std::endl;
-    reg_ty = "Dedicated";
+    std::cout << "Register type," << reg_ty << std::endl;
   }
-
-  std::cout << "\n\nNITER," << NITER << std::endl;
-  std::cout << "Max registers," << nreg_max << std::endl;
-  std::cout << "Concurrent full single thread workgroups," << ngrp_full
-            << std::endl;
-  std::cout << "Concurrent half single thread workgroups," << ngrp_half
-            << std::endl;
-  std::cout << "Register type," << reg_ty << std::endl;
-}
+};
 
 int main(int argc, const char** argv) {
-  context()->initialize_querypool();
-
-  reg_count();
+  App app;  // App() initializes the query pool and prints the device limits
 
+  app.reg_count();  // runs the register-count benchmark
   return 0;
 }