[mlir][gpu] Change GPU modules to globals #135478

Merged: 3 commits, Apr 22, 2025
Changes from 1 commit
Remove explicit eager loading from runtime wrappers and replace with CUDA_MODULE_LOADING=EAGER in test.

Format test file.
chsigg committed Apr 13, 2025
commit d38f98f2012053d95d406a5aba79bd354b278b9f
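
Background for the change below: since CUDA 11.7 the driver itself supports lazy vs. eager module loading, selected through the CUDA_MODULE_LOADING environment variable, so the hand-rolled preloading in the runtime wrappers becomes redundant once the test sets the variable. As an illustration (not part of this PR), a host program can query which mode the driver picked via the driver API's cuModuleGetLoadingMode:

```cuda
// Illustrative only, not part of this PR: query which module loading mode
// the driver selected (CUDA driver API, available since CUDA 11.7).
#include <cstdio>
#include "cuda.h"

int main() {
  if (cuInit(0) != CUDA_SUCCESS) {
    fprintf(stderr, "cuInit failed\n");
    return 1;
  }
  CUmoduleLoadingMode mode;
  if (cuModuleGetLoadingMode(&mode) != CUDA_SUCCESS) {
    fprintf(stderr, "cuModuleGetLoadingMode failed\n");
    return 1;
  }
  // CU_MODULE_EAGER_LOADING when CUDA_MODULE_LOADING=EAGER is in the
  // environment; otherwise the driver may default to lazy loading.
  printf("module loading: %s\n",
         mode == CU_MODULE_EAGER_LOADING ? "eager" : "lazy");
  return 0;
}
```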
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp (0 additions & 11 deletions)
@@ -15,7 +15,6 @@
 #include "mlir/ExecutionEngine/CRunnerUtils.h"
 
 #include <cstdio>
-#include <vector>
 
 #include "cuda.h"
 #include "cuda_bf16.h"
@@ -122,16 +121,6 @@ mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
   CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
-  // Preload functions in the module so that the first call to
-  // cuModuleGetFunction below doesn't synchronize context.
-  unsigned numFunctions = 0;
-  CUDA_REPORT_IF_ERROR(cuModuleGetFunctionCount(&numFunctions, module));
-  std::vector<CUfunction> functions(numFunctions);
-  CUDA_REPORT_IF_ERROR(
-      cuModuleEnumerateFunctions(functions.data(), numFunctions, module));
-  for (CUfunction function : functions) {
-    CUDA_REPORT_IF_ERROR(cuFuncLoad(function));
-  }
   return module;
 }
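
The deleted loop forced every function in the module to load at cuModuleLoadData time. A sketch of the process-wide replacement, under the assumption that the driver samples CUDA_MODULE_LOADING when CUDA is first initialized in the process (initCudaWithEagerLoading is a hypothetical helper, not part of this PR):

```cuda
// Sketch only, assuming the driver reads CUDA_MODULE_LOADING at first
// CUDA initialization; initCudaWithEagerLoading is a hypothetical helper.
#include <cstdlib>
#include "cuda.h"

void initCudaWithEagerLoading() {
  // Same effect as launching the process with CUDA_MODULE_LOADING=EAGER,
  // as the updated RUN line in the test below does; must happen before
  // the first CUDA call.
  setenv("CUDA_MODULE_LOADING", "EAGER", /*overwrite=*/1);
  cuInit(0);
  // Later cuModuleLoadData calls now load kernels eagerly, so the first
  // cuModuleGetFunction/launch no longer lazily loads (and synchronizes).
}
```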

mlir/test/Integration/GPU/CUDA/concurrent-kernels.mlir (41 additions & 36 deletions)

@@ -1,48 +1,53 @@
-// Tests that we can run multiple kernels concurrently. Runs two kernels, which
-// increment a global atomic counter, then wait for the counter to reach 2.
+// Tests multiple kernels running concurrently. Runs two kernels, which
+// increment a global atomic counter and wait for the counter to reach 2.
 //
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=%gpu_compilation_format" \
-// RUN: | mlir-runner \
+// RUN: | CUDA_MODULE_LOADING=EAGER mlir-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
 // RUN:   --entry-point-result=void
 
+// CUDA_MODULE_LOADING=EAGER avoids an implicit context synchronization on first
+// use of each kernel. It is technically not needed for this test, because
+// there is only one kernel.
+
 module attributes {gpu.container_module} {
-gpu.module @kernels {
-  gpu.func @kernel(%memref: memref<i32>) kernel {
-    %c0 = arith.constant 0 : i32
-    %c1 = arith.constant 1 : i32
-    %c2 = arith.constant 2 : i32
-    %block = memref.atomic_rmw addi %c1, %memref[] : (i32, memref<i32>) -> i32
-    scf.while: () -> () {
-      %value = memref.atomic_rmw addi %c0, %memref[] : (i32, memref<i32>) -> i32
-      %cond = arith.cmpi slt, %value, %c2 : i32
-      scf.condition(%cond)
-    } do {
-      scf.yield
-    }
-    gpu.return
-  }
-}
+  gpu.module @kernels {
+    gpu.func @kernel(%memref: memref<i32>) kernel {
+      %c0 = arith.constant 0 : i32
+      %c1 = arith.constant 1 : i32
+      %c2 = arith.constant 2 : i32
+      %block = memref.atomic_rmw addi %c1, %memref[] : (i32, memref<i32>) -> i32
+      scf.while: () -> () {
+        %value = memref.atomic_rmw addi %c0, %memref[] : (i32, memref<i32>) -> i32
+        %cond = arith.cmpi slt, %value, %c2 : i32
+        scf.condition(%cond)
+      } do {
+        scf.yield
+      }
+      gpu.return
+    }
+  }
 
-func.func @main() {
-  %memref = gpu.alloc host_shared () : memref<i32>
-  %c0 = arith.constant 0 : i32
-  memref.store %c0, %memref[] : memref<i32>
+  func.func @main() {
+    %c0 = arith.constant 0 : i32
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc host_shared () : memref<i32>
+    memref.store %c0, %memref[] : memref<i32>
+    %0 = gpu.wait async
+    %1 = gpu.wait async
+    %2 = gpu.launch_func async [%0] @kernels::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+        args(%memref: memref<i32>)
+    %3 = gpu.launch_func async [%1] @kernels::@kernel
+        blocks in (%c1, %c1, %c1)
+        threads in (%c1, %c1, %c1)
+        args(%memref: memref<i32>)
+    gpu.wait [%2, %3]
+    return
+  }
 
-  %0 = gpu.wait async
-  %1 = gpu.wait async
-  %c1 = arith.constant 1 : index
-  %2 = gpu.launch_func async [%0] @kernels::@kernel
-      blocks in (%c1, %c1, %c1)
-      threads in (%c1, %c1, %c1)
-      args(%memref: memref<i32>)
-  %3 = gpu.launch_func async [%1] @kernels::@kernel
-      blocks in (%c1, %c1, %c1)
-      threads in (%c1, %c1, %c1)
-      args(%memref: memref<i32>)
-  gpu.wait [%2, %3]
-  return
-}
 }
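
For readers without an MLIR setup, here is a rough standalone CUDA analogue of what this test exercises; the kernel body, stream setup, and names are invented for illustration, not taken from the PR. Two launches of the same spin-wait kernel must run concurrently, because each one parks until the shared counter reaches 2:

```cuda
// Illustrative analogue of concurrent-kernels.mlir, assuming the CUDA
// runtime API; compile with: nvcc concurrent_kernels.cu
#include <cstdio>
#include <cuda_runtime.h>

__global__ void kernel(int *counter) {
  atomicAdd(counter, 1);              // announce this launch's arrival
  while (atomicAdd(counter, 0) < 2) { // spin until both launches arrived
  }
}

int main() {
  int *counter = nullptr;
  // Host-visible device allocation, akin to gpu.alloc host_shared.
  cudaMallocManaged(&counter, sizeof(int));
  *counter = 0;

  cudaStream_t s0, s1;
  cudaStreamCreate(&s0);
  cudaStreamCreate(&s1);
  // The two launches must make progress concurrently (like the two async
  // gpu.launch_func ops); if they were serialized, both would spin forever.
  kernel<<<1, 1, 0, s0>>>(counter);
  kernel<<<1, 1, 0, s1>>>(counter);
  cudaDeviceSynchronize();            // akin to gpu.wait [%2, %3]
  printf("counter = %d\n", *counter); // expect 2
  return 0;
}
```

As the new comment in the test notes, CUDA_MODULE_LOADING=EAGER avoids an implicit context synchronization on the first use of each kernel; with only one distinct kernel here (and in the test) it is not strictly required, but it keeps the loading behavior deterministic.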