-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[flang][cuda] Carry over the stream information to kernel launch #136217
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: In CUDA Fortran the stream is encoded in an INTEGER(cuda_stream_kind) variable. This information is carried over to the GPU dialect through the `cuf.stream_cast` op and the async token on the GPU ops. When converting the `gpu.launch_func` op to a runtime call, the `cuf.stream_cast` op becomes a no-op and the reference to the stream variable is passed to the runtime. The runtime is adapted to take an integer reference instead of a value for the stream. Full diff: https://github.com/llvm/llvm-project/pull/136217.diff 9 Files Affected:
diff --git a/flang-rt/lib/cuda/kernel.cpp b/flang-rt/lib/cuda/kernel.cpp
index 73b4e24bf701f..e299a114ed7eb 100644
--- a/flang-rt/lib/cuda/kernel.cpp
+++ b/flang-rt/lib/cuda/kernel.cpp
@@ -17,7 +17,7 @@ extern "C" {
void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
- intptr_t stream, int32_t smem, void **params, void **extra) {
+ int64_t *stream, int32_t smem, void **params, void **extra) {
dim3 gridDim;
gridDim.x = gridX;
gridDim.y = gridY;
@@ -77,13 +77,13 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
}
cudaStream_t defaultStream = 0;
CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem,
- stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
+ stream != nullptr ? (cudaStream_t)(*stream) : defaultStream));
}
void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
- intptr_t stream, int32_t smem, void **params, void **extra) {
+ int64_t *stream, int32_t smem, void **params, void **extra) {
cudaLaunchConfig_t config;
config.gridDim.x = gridX;
config.gridDim.y = gridY;
@@ -141,8 +141,8 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
terminator.Crash("Too many invalid grid dimensions");
}
config.dynamicSmemBytes = smem;
- if (stream != kNoAsyncId) {
- config.stream = (cudaStream_t)stream;
+ if (stream != nullptr) {
+ config.stream = (cudaStream_t)(*stream);
} else {
config.stream = 0;
}
@@ -158,7 +158,7 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
- intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+ intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
void **extra) {
dim3 gridDim;
gridDim.x = gridX;
@@ -218,9 +218,8 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
terminator.Crash("Too many invalid grid dimensions");
}
cudaStream_t defaultStream = 0;
- CUDA_REPORT_IF_ERROR(
- cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, params, smem,
- stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
+ CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim,
+ params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream));
}
} // extern "C"
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index ccf9969e73a8e..926983d364ed1 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -383,7 +383,7 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
Later in the lowering this will become a no op.
}];
- let arguments = (ins fir_ReferenceType:$stream);
+ let arguments = (ins AnyTypeOf<[fir_ReferenceType, LLVM_AnyPointer]>:$stream);
let results = (outs GPU_AsyncToken:$token);
diff --git a/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
index 7d76c1f4e5218..f40f0049e9085 100644
--- a/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
+++ b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
@@ -19,9 +19,9 @@ class LLVMTypeConverter;
namespace cuf {
-void populateCUFGPUToLLVMConversionPatterns(
- const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
- mlir::PatternBenefit benefit = 1);
+void populateCUFGPUToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
+ mlir::RewritePatternSet &patterns,
+ mlir::PatternBenefit benefit = 1);
} // namespace cuf
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index eb9135868fdee..70eb74bb79554 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -21,17 +21,17 @@ extern "C" {
void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
- intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+ intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
void **extra);
void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
- intptr_t stream, int32_t smem, void **params, void **extra);
+ int64_t *stream, int32_t smem, void **params, void **extra);
void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
- intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+ intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
void **extra);
} // extern "C"
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 2c6d22f6f6c7d..7afbbf83e7077 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -147,9 +147,9 @@ template <typename OpTy>
static llvm::LogicalResult checkStreamType(OpTy op) {
if (!op.getStream())
return mlir::success();
- auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType());
- if (!refTy.getEleTy().isInteger(64))
- return op.emitOpError("stream is expected to be a i64 reference");
+ if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
+ if (!refTy.getEleTy().isInteger(64))
+ return op.emitOpError("stream is expected to be a i64 reference");
return mlir::success();
}
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 205acbfea22b8..02b4e6a5a469c 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -121,7 +121,7 @@ struct GPULaunchKernelConversion
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
- llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+ llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
@@ -133,10 +133,15 @@ struct GPULaunchKernelConversion
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
- mlir::Value stream = adaptor.getAsyncObject();
- if (!stream)
- stream = rewriter.create<mlir::LLVM::ConstantOp>(
- loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+
+ mlir::Value stream = nullPtr;
+ if (!adaptor.getAsyncDependencies().empty()) {
+ if (adaptor.getAsyncDependencies().size() != 1)
+ return rewriter.notifyMatchFailure(
+ op, "Can only convert with exactly one stream dependency.");
+ stream = adaptor.getAsyncDependencies().front();
+ }
+
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchClusterKernel,
mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
@@ -157,8 +162,8 @@ struct GPULaunchKernelConversion
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
- llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
- i32Ty, ptrTy, ptrTy},
+ llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
+ ptrTy},
/*isVarArg=*/false);
auto cufLaunchKernel =
mlir::SymbolRefAttr::get(mod.getContext(), fctName);
@@ -171,10 +176,13 @@ struct GPULaunchKernelConversion
mlir::SymbolTable::Visibility::Private);
}
- mlir::Value stream = adaptor.getAsyncObject();
- if (!stream)
- stream = rewriter.create<mlir::LLVM::ConstantOp>(
- loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+ mlir::Value stream = nullPtr;
+ if (!adaptor.getAsyncDependencies().empty()) {
+ if (adaptor.getAsyncDependencies().size() != 1)
+ return rewriter.notifyMatchFailure(
+ op, "Can only convert with exactly one stream dependency.");
+ stream = adaptor.getAsyncDependencies().front();
+ }
rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchKernel,
@@ -251,6 +259,22 @@ struct CUFSharedMemoryOpConversion
}
};
+struct CUFStreamCastConversion
+ : public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
+ explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
+ mlir::PatternBenefit benefit)
+ : mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
+ benefit) {}
+ using OpAdaptor = typename cuf::StreamCastOp::Adaptor;
+
+ mlir::LogicalResult
+ matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const override {
+ rewriter.replaceOp(op, adaptor.getStream());
+ return mlir::success();
+ }
+};
+
class CUFGPUToLLVMConversion
: public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
public:
@@ -283,8 +307,11 @@ class CUFGPUToLLVMConversion
} // namespace
void cuf::populateCUFGPUToLLVMConversionPatterns(
- const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+ fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
mlir::PatternBenefit benefit) {
- patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion>(
- converter, benefit);
+ converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
+ return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
+ });
+ patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
+ CUFStreamCastConversion>(converter, benefit);
}
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 85266f17bb67a..0319213016e45 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -113,7 +113,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
// -----
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
- llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+ llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
llvm.return
}
@@ -166,3 +166,66 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
+ llvm.return
+ }
+ llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
+ %0 = llvm.mlir.constant(1 : index) : i64
+ %stream = llvm.alloca %0 x i64 : (i64) -> !llvm.ptr
+ %1 = llvm.mlir.constant(2 : index) : i64
+ %2 = llvm.mlir.constant(0 : i32) : i32
+ %3 = llvm.mlir.constant(10 : index) : i64
+ %token = cuf.stream_cast %stream : !llvm.ptr
+ gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+ llvm.return
+ }
+ gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
+// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ llvm.func @_QMmod1Phost_sub() {
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ %one = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+ %stream = llvm.alloca %one x i64 : (i64) -> !llvm.ptr
+ %2 = llvm.mlir.constant(40 : i64) : i64
+ %3 = llvm.mlir.constant(16 : i32) : i32
+ %4 = llvm.mlir.constant(25 : i32) : i32
+ %5 = llvm.mlir.constant(21 : i32) : i32
+ %6 = llvm.mlir.constant(17 : i32) : i32
+ %7 = llvm.mlir.constant(1 : index) : i64
+ %8 = llvm.mlir.constant(27 : i32) : i32
+ %9 = llvm.mlir.constant(6 : i32) : i32
+ %10 = llvm.mlir.constant(1 : i32) : i32
+ %11 = llvm.mlir.constant(0 : i32) : i32
+ %12 = llvm.mlir.constant(10 : index) : i64
+ %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+ %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+ %token = cuf.stream_cast %stream : !llvm.ptr
+ gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+ llvm.return
+ }
+ llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+ llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+ %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+ llvm.return %0 : !llvm.array<2 x i8>
+ }
+ llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+ llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+ gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 319991546d3fe..028279832c703 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -154,5 +154,5 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK-LABEL: func.func @_QQmain()
// CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
// CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : <i64>
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref<i64>
// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
diff --git a/flang/test/Fir/CUDA/cuda-stream.mlir b/flang/test/Fir/CUDA/cuda-stream.mlir
index 50f230467854b..a501603fd35d1 100644
--- a/flang/test/Fir/CUDA/cuda-stream.mlir
+++ b/flang/test/Fir/CUDA/cuda-stream.mlir
@@ -17,5 +17,5 @@ module attributes {gpu.container_module} {
// CHECK-LABEL: func.func @_QMmod1Phost_sub()
// CHECK: %[[STREAM:.*]] = fir.alloca i64
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : <i64>
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : !fir.ref<i64>
// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMmod1Psub1
|
In CUDA Fortran the stream is encoded in an INTEGER(cuda_stream_kind) variable.
This information is carried over to the GPU dialect through the
`cuf.stream_cast` op and the async token on the GPU ops. When converting the
`gpu.launch_func` op to a runtime call, the `cuf.stream_cast` op becomes a
no-op and the reference to the stream variable is passed to the runtime. The
runtime is adapted to take an integer reference instead of a value for the
stream.