Skip to content

Commit 37de695

Browse files
authored
[flang][cuda] Make sure global device descriptor is allocated in managed memory (#160596)
When the descriptor of a global device variable is re-materialized to be passed to a kernel, make sure it is allocated in managed memory otherwise the kernel launch will fail.
1 parent 5d511b6 commit 37de695

File tree

2 files changed

+45
-0
lines changed

2 files changed

+45
-0
lines changed

flang/lib/Optimizer/CodeGen/CodeGen.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include "flang/Optimizer/CodeGen/CodeGen.h"
1414

15+
#include "flang/Optimizer/Builder/CUFCommon.h"
1516
#include "flang/Optimizer/CodeGen/CodeGenOpenMP.h"
1617
#include "flang/Optimizer/CodeGen/FIROpPatterns.h"
1718
#include "flang/Optimizer/CodeGen/LLVMInsertChainFolder.h"
@@ -1846,6 +1847,15 @@ struct EmboxOpConversion : public EmboxCommonConversion<fir::EmboxOp> {
18461847
};
18471848

18481849
static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) {
1850+
// Check if the global symbol is in the device module.
1851+
if (auto addr = mlir::dyn_cast_or_null<fir::AddrOfOp>(val.getDefiningOp()))
1852+
if (auto gpuMod =
1853+
addr->getParentOfType<mlir::ModuleOp>()
1854+
.lookupSymbol<mlir::gpu::GPUModuleOp>(cudaDeviceModuleName))
1855+
if (gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(addr.getSymbol()) ||
1856+
gpuMod.lookupSymbol<fir::GlobalOp>(addr.getSymbol()))
1857+
return true;
1858+
18491859
if (auto loadOp = mlir::dyn_cast_or_null<fir::LoadOp>(val.getDefiningOp()))
18501860
return isDeviceAllocation(loadOp.getMemref(), {});
18511861
if (auto boxAddrOp =

flang/test/Fir/CUDA/cuda-code-gen.mlir

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> :
221221

222222
func.func private @__tgt_acc_get_deviceptr() -> !fir.ref<!fir.box<none>>
223223
}
224+
225+
// -----
226+
227+
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
228+
fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
229+
%c0 = arith.constant 0 : index
230+
%0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
231+
%1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
232+
fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
233+
}
234+
func.func @_QQmain() attributes {fir.bindc_name = "P", target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+mmx", "+sse", "+sse2", "+cx8", "+x87", "+fxsr"]>} {
235+
%c64 = arith.constant 64 : index
236+
%c1 = arith.constant 1 : index
237+
%c0_i32 = arith.constant 0 : i32
238+
%0 = fir.address_of(@_QMm1Eda) : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
239+
%8 = fir.load %0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
240+
%9 = fircg.ext_rebox %8 : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<!fir.array<?x?xf32>>
241+
gpu.launch_func @cuda_device_mod::@_QMm1Psub2 blocks in (%c1, %c1, %c1) threads in (%c64, %c1, %c1) dynamic_shared_memory_size %c0_i32 args(%9 : !fir.box<!fir.array<?x?xf32>>) {cuf.proc_attr = #cuf.cuda_proc<global>}
242+
return
243+
}
244+
gpu.module @cuda_device_mod [#nvvm.target<chip = "sm_90", features = "+ptx75", link = ["/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_builtin_intrinsics_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_utils_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_cpp_builtins.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12/libdevice_nvhpc_cuda_runtime.10.bc", "/proj/ng/Linux_x86_64/dev/compilers/lib/nvvm-next/12//libdevice_nvhpc_cuda_runtime_builtins_cc90.10.bc", "/proj/ng/Linux_x86_64/dev/cuda/12.9/nvvm/libdevice/libdevice.10.bc"]>] attributes {llvm.data_layout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"} {
245+
fir.global @_QMm1Eda {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
246+
%c0 = arith.constant 0 : index
247+
%0 = fir.zero_bits !fir.heap<!fir.array<?x?xf32>>
248+
%1 = fircg.ext_embox %0(%c0, %c0) {allocator_idx = 2 : i32} : (!fir.heap<!fir.array<?x?xf32>>, index, index) -> !fir.box<!fir.heap<!fir.array<?x?xf32>>>
249+
fir.has_value %1 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>
250+
}
251+
gpu.func @_QMm1Psub2(%arg0: !fir.box<!fir.array<?x?xf32>>) kernel {
252+
gpu.return
253+
}
254+
}
255+
}
256+
257+
// CHECK-LABEL: llvm.func @_QQmain()
258+
// CHECK: llvm.call @_FortranACUFAllocDescriptor

0 commit comments

Comments
 (0)