Skip to content

Commit 1352dd0

Browse files
[GpuOclRuntime] Add DLTI attributes from the device info
1 parent 1377cc3 commit 1352dd0

File tree

3 files changed

+120
-24
lines changed

3 files changed

+120
-24
lines changed

lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp

Lines changed: 118 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
1717
#include "llvm/Support/Error.h"
1818

19+
#include "mlir/Dialect/DLTI/DLTI.h"
1920
#include "mlir/Dialect/Func/IR/FuncOps.h"
2021
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
22+
#include "mlir/Interfaces/DataLayoutInterfaces.h"
2123
#include "mlir/Pass/PassManager.h"
2224

2325
namespace mlir::gc::gpu {
@@ -655,7 +657,8 @@ OclModule::~OclModule() {
655657
// buffers. The function will call the original function with the context,
656658
// buffers and the offset/shape/strides, statically created from the
657659
// memref descriptor.
658-
StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
660+
StringRef createStaticMain(OpBuilder &builder, ModuleOp &module,
661+
const StringRef &funcName,
659662
const ArrayRef<Type> argTypes) {
660663
auto mainFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(funcName);
661664
if (!mainFunc) {
@@ -670,11 +673,8 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
670673
"' must have an least 3 arguments.");
671674
}
672675

673-
auto ctx = module.getContext();
674-
ctx->getOrLoadDialect<LLVM::LLVMDialect>();
675-
OpBuilder builder(ctx);
676676
auto i64Type = builder.getI64Type();
677-
auto ptrType = LLVM::LLVMPointerType::get(ctx);
677+
auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());
678678

679679
if (mainArgTypes[nargs - 3] != ptrType ||
680680
mainArgTypes[nargs - 2] != ptrType ||
@@ -722,7 +722,7 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
722722
auto loc = mainFunc.getLoc();
723723
auto newFuncType = LLVM::LLVMFunctionType::get(
724724
mainFunc.getNumResults() ? mainFunc->getResult(0).getType()
725-
: LLVM::LLVMVoidType::get(ctx),
725+
: LLVM::LLVMVoidType::get(builder.getContext()),
726726
{ptrType, ptrType});
727727
auto newFunc =
728728
OpBuilder::atBlockEnd(module.getBody())
@@ -848,17 +848,58 @@ OclModuleBuilder::build(cl_device_id device, cl_context context) {
848848

849849
llvm::Expected<std::shared_ptr<const OclModule>>
850850
OclModuleBuilder::build(const OclRuntime::Ext &ext) {
851-
auto mod = mlirModule.clone();
852-
PassManager pm{mod.getContext()};
853-
pipeline(pm);
854-
CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
851+
auto ctx = mlirModule.getContext();
852+
ctx->getOrLoadDialect<DLTIDialect>();
853+
ctx->getOrLoadDialect<LLVM::LLVMDialect>();
854+
OpBuilder builder(ctx);
855+
DataLayoutEntryInterface dltiAttrs[6];
855856

856-
auto staticMain = createStaticMain(mod, funcName, argTypes);
857+
{
858+
struct DevInfo {
859+
cl_device_info key;
860+
const char *attrName;
861+
};
862+
DevInfo devInfo[]{
863+
{CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
864+
{CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
865+
{CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
866+
// Assuming the cache size is equal to the local mem
867+
{CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
868+
};
857869

858-
if (printIr) {
859-
mod.dump();
860-
}
870+
unsigned i = 0;
871+
for (auto &[key, attrName] : devInfo) {
872+
int64_t value = 0;
873+
CL_CHECK(
874+
clGetDeviceInfo(ext.device, key, sizeof(cl_ulong), &value, nullptr),
875+
"Failed to get the device property ", attrName);
876+
gcLogD("Device property ", attrName, "=", value);
877+
dltiAttrs[i++] =
878+
DataLayoutEntryAttr::get(ctx, builder.getStringAttr(attrName),
879+
builder.getI64IntegerAttr(value));
880+
}
861881

882+
// There is no corresponding property in the OpenCL API, using the
883+
// hardcoded value.
884+
// TODO: Get the real value.
885+
dltiAttrs[i] = DataLayoutEntryAttr::get(
886+
ctx, builder.getStringAttr("max_vector_op_width"),
887+
builder.getI64IntegerAttr(512));
888+
}
889+
890+
OclRuntime rt(ext);
891+
auto expectedQueue = rt.createQueue();
892+
CHECKE(expectedQueue, "Failed to create queue!");
893+
struct OclQueue {
894+
cl_command_queue queue;
895+
~OclQueue() { clReleaseCommandQueue(queue); }
896+
} queue{*expectedQueue};
897+
OclContext oclCtx{rt, queue.queue, false};
898+
899+
ModuleOp mod;
900+
StringRef staticMain;
901+
std::unique_ptr<ExecutionEngine> eng;
902+
auto devStr = builder.getStringAttr("GPU" /* device ID*/);
862903
ExecutionEngineOptions opts;
863904
opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
864905
opts.enableObjectDump = enableObjectDump;
@@ -868,18 +909,75 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
868909
opts.enablePerfNotificationListener = false;
869910
#endif
870911

871-
auto eng = ExecutionEngine::create(mod, opts);
872-
CHECKE(eng, "Failed to create ExecutionEngine!");
873-
eng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
912+
// Build the module and check the kernels' workgroup size. If the workgroup
913+
// size is different, rebuild the module with the new size.
914+
for (size_t wgSize = 64;;) {
915+
dltiAttrs[sizeof(dltiAttrs) / sizeof(DataLayoutEntryInterface) - 1] =
916+
DataLayoutEntryAttr::get(
917+
ctx, builder.getStringAttr("max_work_group_size"),
918+
builder.getI64IntegerAttr(static_cast<int64_t>(wgSize)));
919+
TargetDeviceSpecInterface devSpec =
920+
TargetDeviceSpecAttr::get(ctx, dltiAttrs);
921+
auto sysSpec =
922+
TargetSystemSpecAttr::get(ctx, ArrayRef(std::pair(devStr, devSpec)));
923+
mod = mlirModule.clone();
924+
mod.getOperation()->setAttr("#dlti.sys_spec", sysSpec);
925+
PassManager pm{ctx};
926+
pipeline(pm);
927+
CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
928+
staticMain = createStaticMain(builder, mod, funcName, argTypes);
929+
auto expectedEng = ExecutionEngine::create(mod, opts);
930+
CHECKE(expectedEng, "Failed to create ExecutionEngine!");
931+
expectedEng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
932+
933+
// Find all kernels and query the workgroup size
934+
size_t minSize = std::numeric_limits<size_t>::max();
935+
mod.walk<>([&](LLVM::LLVMFuncOp func) {
936+
auto name = func.getName();
937+
if (!name.starts_with("createGcGpuOclKernel_")) {
938+
return WalkResult::skip();
939+
}
940+
auto fn = expectedEng.get()->lookup(name);
941+
if (!fn) {
942+
gcLogE("Function not found: ", name.data());
943+
return WalkResult::skip();
944+
}
945+
946+
Kernel *kernel =
947+
reinterpret_cast<Kernel *(*)(OclContext *)>(fn.get())(&oclCtx);
948+
size_t s = 0;
949+
auto err = clGetKernelWorkGroupInfo(kernel->kernel, ext.device,
950+
CL_KERNEL_WORK_GROUP_SIZE,
951+
sizeof(size_t), &s, nullptr);
952+
if (err == CL_SUCCESS) {
953+
minSize = std::min(minSize, s);
954+
} else {
955+
gcLogE("Failed to get the kernel workgroup size: ", err);
956+
}
957+
return WalkResult::skip();
958+
});
959+
960+
if (minSize == std::numeric_limits<size_t>::max() || minSize == wgSize) {
961+
eng = std::move(*expectedEng);
962+
break;
963+
}
964+
965+
gcLogD("Changing the workgroup size to ", minSize);
966+
wgSize = minSize;
967+
}
968+
969+
if (printIr) {
970+
mod.dump();
971+
}
874972

875973
OclModule::MainFunc main = {nullptr};
876974

877975
if (staticMain.empty()) {
878-
auto expect = eng.get()->lookupPacked(funcName);
976+
auto expect = eng->lookupPacked(funcName);
879977
CHECKE(expect, "Packed function '", funcName.begin(), "' not found!");
880978
main.wrappedMain = *expect;
881979
} else {
882-
auto expect = eng.get()->lookup(staticMain);
980+
auto expect = eng->lookup(staticMain);
883981
CHECKE(expect, "Compiled function '", staticMain.begin(), "' not found!");
884982
main.staticMain = reinterpret_cast<OclModule::StaticMainFunc>(*expect);
885983
}
@@ -889,8 +987,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
889987
return it->second;
890988
}
891989
std::shared_ptr<const OclModule> ptr(
892-
new OclModule(OclRuntime(ext), !staticMain.empty(), main, argTypes,
893-
std::move(eng.get())));
990+
new OclModule(rt, !staticMain.empty(), main, argTypes, std::move(eng)));
894991
return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
895992
.first->second;
896993
}

lib/gc/Transforms/GPU/GpuToGpuOcl.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,8 +381,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
381381

382382
auto function = rewriter.create<LLVM::LLVMFuncOp>(
383383
loc, funcName,
384-
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
385-
LLVM::Linkage::Internal);
384+
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}));
386385
rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));
387386

388387
auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));

test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ module @test attributes {gpu.container_module} {
3636
// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
3737
// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
3838

39-
// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
39+
// CHECK: llvm.func @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
4040
// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
4141
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
4242
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr

0 commit comments

Comments
 (0)