Skip to content

Commit

Permalink
Get device information from OpenCL (#4210)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #4210

Some of the device's properties, like the number of available Streaming Processors, or the total size of the cache, are not supported by Vulkan as direct queries. These variables are essential for ArchProbe's algorithms, so we need to do some preprocessing with OpenCL to extract these values, before moving to the Vulkan implementation.

This diff moves the GPUInfo implementation into a class to have better control of internal variables like the device's properties, and obtains important limits that will be used in the following diffs, like the device's SM Count, Cache size and number of logic threads.

Reviewed By: jorgep31415

Differential Revision: D59636879

fbshipit-source-id: 1b9aa3a4ce48f360526e09576864514b2ac35429
  • Loading branch information
Esteban Padilla Cerdio authored and facebook-github-bot committed Jul 15, 2024
1 parent 93a7725 commit 4b45264
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 99 deletions.
30 changes: 30 additions & 0 deletions backends/vulkan/tools/gpuinfo/include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@

#include <executorch/backends/vulkan/runtime/api/api.h>

#define CL_TARGET_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION CL_TARGET_OPENCL_VERSION
#include <CL/opencl.hpp>

#include <stdexcept>
#include <vector>

using namespace vkcompute;
using namespace api;

Expand Down Expand Up @@ -49,3 +53,29 @@ void ensure_min_niter(
niter = uint32_t(niter * min_time_us / t);
}
}

// Returns the first platform id reported by clGetPlatformIDs.
//
// Throws std::runtime_error when the platform count cannot be queried, when
// no platform is reported, or when the id list cannot be retrieved. Without
// these checks an empty result would be indexed out of bounds.
//
// Marked inline because this definition lives in a header.
inline cl_platform_id get_cl_platform_id() {
  cl_uint nplatform_id = 0;
  // First call only asks how many platforms exist.
  if (clGetPlatformIDs(0, nullptr, &nplatform_id) != CL_SUCCESS ||
      nplatform_id == 0) {
    throw std::runtime_error("No OpenCL platform is available");
  }

  std::vector<cl_platform_id> platform_ids(nplatform_id);
  // Second call fills the vector with the actual ids.
  if (clGetPlatformIDs(nplatform_id, platform_ids.data(), nullptr) !=
      CL_SUCCESS) {
    throw std::runtime_error("Failed to enumerate OpenCL platforms");
  }
  return platform_ids.front();
}

// Returns the first device id (of any device type) that clGetDeviceIDs
// reports for `platform_id`.
//
// Throws std::runtime_error when the device count cannot be queried, when the
// platform reports no device, or when the id list cannot be retrieved.
// Without these checks an empty result would be indexed out of bounds.
//
// Marked inline because this definition lives in a header.
inline cl_device_id get_cl_dev_id(cl_platform_id platform_id) {
  cl_uint ndev_id = 0;
  // First call only asks how many devices the platform exposes.
  if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, nullptr, &ndev_id) !=
          CL_SUCCESS ||
      ndev_id == 0) {
    throw std::runtime_error("No OpenCL device is available on the platform");
  }

  std::vector<cl_device_id> dev_ids(ndev_id);
  // Second call fills the vector with the actual ids.
  if (clGetDeviceIDs(
          platform_id, CL_DEVICE_TYPE_ALL, ndev_id, dev_ids.data(), nullptr) !=
      CL_SUCCESS) {
    throw std::runtime_error("Failed to enumerate OpenCL devices");
  }
  return dev_ids.front();
}

// Wraps the device id chosen by get_cl_dev_id(get_cl_platform_id()) in a
// cl::Device so its properties can be read through the C++ bindings.
//
// Marked inline because this definition lives in a header.
inline cl::Device get_cl_device() {
  const cl_platform_id platform_id = get_cl_platform_id();
  const cl_device_id dev_id = get_cl_dev_id(platform_id);
  return cl::Device(dev_id);
}
229 changes: 130 additions & 99 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,117 +13,148 @@
#include "stats.h"
#include "utils.h"

void reg_count() {
const uint32_t NREG_MIN = 1;
const uint32_t NREG_MAX = 512;
const uint32_t NREG_STEP = 1;

const double COMPENSATE = 0.01;
const double THRESHOLD = 3;

const uint32_t NGRP_MIN = 1;
const uint32_t NGRP_MAX = 64;
const uint32_t NGRP_STEP = 1;

uint32_t NITER;

auto bench = [&](uint32_t ngrp, uint32_t nreg) {
size_t len = sizeof(float);
StorageBuffer buffer(context(), vkapi::kFloat, len);
ParamsBuffer params(context(), int32_t(len));
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "reg_count_" + std::to_string(nreg);

auto time = benchmark_on_gpu(shader_name, 100, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{1, ngrp, 1},
{1, 1, 1},
{SV(NITER)},
VK_NULL_HANDLE,
0,
buffer.buffer(),
params.buffer());
});
return time;
};

std::cout << "Calculating NITER..." << std::endl;
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
std::cout << "NITER," << NITER << std::endl;

uint32_t nreg_max;

DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
uint32_t nreg = NREG_MIN;
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
double time = bench(1, nreg);
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << std::endl;
if (dj.push(time)) {
nreg -= NREG_STEP;
nreg_max = nreg;
break;
}
}
if (nreg >= NREG_MAX) {
std::cout << "Unable to conclude a maximal register count" << std::endl;
nreg_max = NREG_STEP;
} else {
std::cout << nreg_max << " registers are available at most" << std::endl;
using namespace vkapi;

class App {
private:
size_t buf_cache_size_;
uint32_t sm_count_;
uint32_t nthread_logic_;

public:
App() {
context()->initialize_querypool();

std::cout << context()->adapter_ptr()->stringize() << std::endl
<< std::endl;

auto cl_device = get_cl_device();

sm_count_ = cl_device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
nthread_logic_ = cl_device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
buf_cache_size_ = cl_device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>();

std::cout << std::endl;
std::cout << "SM count," << sm_count_ << std::endl;
std::cout << "Logic Thread Count," << nthread_logic_ << std::endl;
std::cout << "Cache Size," << buf_cache_size_ << std::endl;
}

auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
auto time = bench(ngrp, nreg);
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
<< ", time=" << time << " us" << std::endl;
void reg_count() {
std::cout << std::endl;
std::cout << "------ Register Count ------" << std::endl;
const uint32_t NREG_MIN = 1;
const uint32_t NREG_MAX = 512;
const uint32_t NREG_STEP = 1;

const double COMPENSATE = 0.01;
const double THRESHOLD = 3;

const uint32_t NGRP_MIN = 1;
const uint32_t NGRP_MAX = 64;
const uint32_t NGRP_STEP = 1;

uint32_t NITER;

auto bench = [&](uint32_t ngrp, uint32_t nreg) {
size_t len = sizeof(float);
StorageBuffer buffer(context(), vkapi::kFloat, len);
ParamsBuffer params(context(), int32_t(len));
vkapi::PipelineBarrier pipeline_barrier{};

auto shader_name = "reg_count_" + std::to_string(nreg);

auto time = benchmark_on_gpu(shader_name, 100, [&]() {
context()->submit_compute_job(
VK_KERNEL_FROM_STR(shader_name),
pipeline_barrier,
{1, ngrp, 1},
{1, 1, 1},
{SV(NITER)},
VK_NULL_HANDLE,
0,
buffer.buffer(),
params.buffer());
});
return time;
};

std::cout << "Calculating NITER..." << std::endl;
ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); });
std::cout << "NITER," << NITER << std::endl;

uint32_t nreg_max;

DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
uint32_t nreg = NREG_MIN;
for (; nreg <= NREG_MAX; nreg += NREG_STEP) {
double time = bench(1, nreg);
std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time
<< std::endl;
if (dj.push(time)) {
ngrp -= NGRP_STEP;
std::cout << "Using " << nreg << " registers can have " << ngrp
<< " concurrent single-thread workgroups" << std::endl;
return ngrp;
nreg -= NREG_STEP;
nreg_max = nreg;
break;
}
}
std::cout
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
<< nreg << " registers are occupied" << std::endl;
return (uint32_t)1;
};

uint32_t ngrp_full, ngrp_half;
ngrp_full = find_ngrp_by_nreg(nreg_max);
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);

std::string reg_ty;
if (nreg >= NREG_MAX) {
std::cout << "Unable to conclude a maximal register count" << std::endl;
nreg_max = NREG_STEP;
} else {
std::cout << nreg_max << " registers are available at most" << std::endl;
}

if (ngrp_full * 1.5 < ngrp_half) {
std::cout << "All physical threads in an sm share " << nreg_max
<< " registers" << std::endl;
reg_ty = "Pooled";
auto find_ngrp_by_nreg = [&](const uint32_t nreg) {
DtJumpFinder<5> dj(COMPENSATE, THRESHOLD);
for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) {
auto time = bench(ngrp, nreg);
std::cout << "Testing occupation (nreg=" << nreg << "); ngrp=" << ngrp
<< ", time=" << time << " us" << std::endl;

if (dj.push(time)) {
ngrp -= NGRP_STEP;
std::cout << "Using " << nreg << " registers can have " << ngrp
<< " concurrent single-thread workgroups" << std::endl;
return ngrp;
}
}
std::cout
<< "Unable to conclude a maximum number of concurrent single-thread workgroups when "
<< nreg << " registers are occupied" << std::endl;
return (uint32_t)1;
};

uint32_t ngrp_full, ngrp_half;
ngrp_full = find_ngrp_by_nreg(nreg_max);
ngrp_half = find_ngrp_by_nreg(nreg_max / 2);

std::string reg_ty;

if (ngrp_full * 1.5 < ngrp_half) {
std::cout << "All physical threads in an sm share " << nreg_max
<< " registers" << std::endl;
reg_ty = "Pooled";

} else {
std::cout << "Each physical thread has " << nreg_max << " registers"
<< std::endl;
reg_ty = "Dedicated";
}

} else {
std::cout << "Each physical thread has " << nreg_max << " registers"
std::cout << std::endl << std::endl;
std::cout << "NITER," << NITER << std::endl;
std::cout << "Max registers," << nreg_max << std::endl;
std::cout << "Concurrent full single thread workgroups," << ngrp_full
<< std::endl;
std::cout << "Concurrent half single thread workgroups," << ngrp_half
<< std::endl;
reg_ty = "Dedicated";
std::cout << "Register type," << reg_ty << std::endl;
}

std::cout << "\n\nNITER," << NITER << std::endl;
std::cout << "Max registers," << nreg_max << std::endl;
std::cout << "Concurrent full single thread workgroups," << ngrp_full
<< std::endl;
std::cout << "Concurrent half single thread workgroups," << ngrp_half
<< std::endl;
std::cout << "Register type," << reg_ty << std::endl;
}
};

// Entry point. Constructing App initializes the context's query pool and
// prints the device information; reg_count() is then run through the App
// instance. The earlier direct calls to initialize_querypool() and the free
// reg_count() are dropped so the setup and the probe are not executed twice.
int main(int argc, const char** argv) {
  App app;

  app.reg_count();
  return 0;
}

0 comments on commit 4b45264

Please sign in to comment.