Skip to content

[flang][cuda] Carry over the stream information to kernel launch #136217

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 18, 2025

Conversation

clementval
Copy link
Contributor

In CUDA Fortran the stream is encoded in an INTEGER(cuda_stream_kind) variable.

This information is carried over the GPU dialect through the cuf.stream_cast and the token in the GPU ops.

When converting the gpu.launch_func to runtime call, the cuf.stream_cast becomes a no-op and the reference to the stream is passed to the runtime.

The runtime is adapted to take integer references instead of value for stream.

@clementval clementval requested a review from wangzpgi April 17, 2025 22:34
@llvmbot llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Apr 17, 2025
@llvmbot
Copy link
Member

llvmbot commented Apr 17, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

In CUDA Fortran the stream is encoded in an INTEGER(cuda_stream_kind) variable.

This information is carried over the GPU dialect through the cuf.stream_cast and the token in the GPU ops.

When converting the gpu.launch_func to runtime call, the cuf.stream_cast becomes a no-op and the reference to the stream is passed to the runtime.

The runtime is adapted to take integer references instead of value for stream.


Full diff: https://github.com/llvm/llvm-project/pull/136217.diff

9 Files Affected:

  • (modified) flang-rt/lib/cuda/kernel.cpp (+8-9)
  • (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+1-1)
  • (modified) flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h (+3-3)
  • (modified) flang/include/flang/Runtime/CUDA/kernel.h (+3-3)
  • (modified) flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp (+3-3)
  • (modified) flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp (+41-14)
  • (modified) flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir (+64-1)
  • (modified) flang/test/Fir/CUDA/cuda-launch.fir (+1-1)
  • (modified) flang/test/Fir/CUDA/cuda-stream.mlir (+1-1)
diff --git a/flang-rt/lib/cuda/kernel.cpp b/flang-rt/lib/cuda/kernel.cpp
index 73b4e24bf701f..e299a114ed7eb 100644
--- a/flang-rt/lib/cuda/kernel.cpp
+++ b/flang-rt/lib/cuda/kernel.cpp
@@ -17,7 +17,7 @@ extern "C" {
 
 void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    intptr_t stream, int32_t smem, void **params, void **extra) {
+    int64_t *stream, int32_t smem, void **params, void **extra) {
   dim3 gridDim;
   gridDim.x = gridX;
   gridDim.y = gridY;
@@ -77,13 +77,13 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
   }
   cudaStream_t defaultStream = 0;
   CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem,
-      stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
+      stream != nullptr ? (cudaStream_t)(*stream) : defaultStream));
 }
 
 void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    intptr_t stream, int32_t smem, void **params, void **extra) {
+    int64_t *stream, int32_t smem, void **params, void **extra) {
   cudaLaunchConfig_t config;
   config.gridDim.x = gridX;
   config.gridDim.y = gridY;
@@ -141,8 +141,8 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
     terminator.Crash("Too many invalid grid dimensions");
   }
   config.dynamicSmemBytes = smem;
-  if (stream != kNoAsyncId) {
-    config.stream = (cudaStream_t)stream;
+  if (stream != nullptr) {
+    config.stream = (cudaStream_t)(*stream);
   } else {
     config.stream = 0;
   }
@@ -158,7 +158,7 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
 
 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
     void **extra) {
   dim3 gridDim;
   gridDim.x = gridX;
@@ -218,9 +218,8 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
     terminator.Crash("Too many invalid grid dimensions");
   }
   cudaStream_t defaultStream = 0;
-  CUDA_REPORT_IF_ERROR(
-      cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, params, smem,
-          stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
+  CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim,
+      params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream));
 }
 
 } // extern "C"
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index ccf9969e73a8e..926983d364ed1 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -383,7 +383,7 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
     Later in the lowering this will become a no op.
   }];
 
-  let arguments = (ins fir_ReferenceType:$stream);
+  let arguments = (ins AnyTypeOf<[fir_ReferenceType, LLVM_AnyPointer]>:$stream);
 
   let results = (outs GPU_AsyncToken:$token);
 
diff --git a/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
index 7d76c1f4e5218..f40f0049e9085 100644
--- a/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
+++ b/flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h
@@ -19,9 +19,9 @@ class LLVMTypeConverter;
 
 namespace cuf {
 
-void populateCUFGPUToLLVMConversionPatterns(
-    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
-    mlir::PatternBenefit benefit = 1);
+void populateCUFGPUToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
+                                            mlir::RewritePatternSet &patterns,
+                                            mlir::PatternBenefit benefit = 1);
 
 } // namespace cuf
 
diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h
index eb9135868fdee..70eb74bb79554 100644
--- a/flang/include/flang/Runtime/CUDA/kernel.h
+++ b/flang/include/flang/Runtime/CUDA/kernel.h
@@ -21,17 +21,17 @@ extern "C" {
 
 void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
     void **extra);
 
 void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    intptr_t stream, int32_t smem, void **params, void **extra);
+    int64_t *stream, int32_t smem, void **params, void **extra);
 
 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
     void **extra);
 
 } // extern "C"
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 2c6d22f6f6c7d..7afbbf83e7077 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -147,9 +147,9 @@ template <typename OpTy>
 static llvm::LogicalResult checkStreamType(OpTy op) {
   if (!op.getStream())
     return mlir::success();
-  auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType());
-  if (!refTy.getEleTy().isInteger(64))
-    return op.emitOpError("stream is expected to be a i64 reference");
+  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
+    if (!refTy.getEleTy().isInteger(64))
+      return op.emitOpError("stream is expected to be a i64 reference");
   return mlir::success();
 }
 
diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
index 205acbfea22b8..02b4e6a5a469c 100644
--- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -121,7 +121,7 @@ struct GPULaunchKernelConversion
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
            llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+           llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
       auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
           mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
@@ -133,10 +133,15 @@ struct GPULaunchKernelConversion
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
-      mlir::Value stream = adaptor.getAsyncObject();
-      if (!stream)
-        stream = rewriter.create<mlir::LLVM::ConstantOp>(
-            loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+
+      mlir::Value stream = nullPtr;
+      if (!adaptor.getAsyncDependencies().empty()) {
+        if (adaptor.getAsyncDependencies().size() != 1)
+          return rewriter.notifyMatchFailure(
+              op, "Can only convert with exactly one stream dependency.");
+        stream = adaptor.getAsyncDependencies().front();
+      }
+
       rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
           op, funcTy, cufLaunchClusterKernel,
           mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
@@ -157,8 +162,8 @@ struct GPULaunchKernelConversion
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           i32Ty, ptrTy, ptrTy},
+           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
+           ptrTy},
           /*isVarArg=*/false);
       auto cufLaunchKernel =
           mlir::SymbolRefAttr::get(mod.getContext(), fctName);
@@ -171,10 +176,13 @@ struct GPULaunchKernelConversion
             mlir::SymbolTable::Visibility::Private);
       }
 
-      mlir::Value stream = adaptor.getAsyncObject();
-      if (!stream)
-        stream = rewriter.create<mlir::LLVM::ConstantOp>(
-            loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+      mlir::Value stream = nullPtr;
+      if (!adaptor.getAsyncDependencies().empty()) {
+        if (adaptor.getAsyncDependencies().size() != 1)
+          return rewriter.notifyMatchFailure(
+              op, "Can only convert with exactly one stream dependency.");
+        stream = adaptor.getAsyncDependencies().front();
+      }
 
       rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
           op, funcTy, cufLaunchKernel,
@@ -251,6 +259,22 @@ struct CUFSharedMemoryOpConversion
   }
 };
 
+struct CUFStreamCastConversion
+    : public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
+  explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
+                                   mlir::PatternBenefit benefit)
+      : mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
+                                                        benefit) {}
+  using OpAdaptor = typename cuf::StreamCastOp::Adaptor;
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOp(op, adaptor.getStream());
+    return mlir::success();
+  }
+};
+
 class CUFGPUToLLVMConversion
     : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
 public:
@@ -283,8 +307,11 @@ class CUFGPUToLLVMConversion
 } // namespace
 
 void cuf::populateCUFGPUToLLVMConversionPatterns(
-    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+    fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
     mlir::PatternBenefit benefit) {
-  patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion>(
-      converter, benefit);
+  converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
+    return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
+  });
+  patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
+               CUFStreamCastConversion>(converter, benefit);
 }
diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
index 85266f17bb67a..0319213016e45 100644
--- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
+++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -113,7 +113,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
 // -----
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
-  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
   llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
     llvm.return
   }
@@ -166,3 +166,66 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
 
 // CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
 // CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
+    llvm.return
+  }
+  llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
+    %0 = llvm.mlir.constant(1 : index) : i64
+    %stream = llvm.alloca %0 x i64 : (i64) -> !llvm.ptr
+    %1 = llvm.mlir.constant(2 : index) : i64
+    %2 = llvm.mlir.constant(0 : i32) : i32
+    %3 = llvm.mlir.constant(10 : index) : i64
+    %token = cuf.stream_cast %stream : !llvm.ptr
+    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    llvm.return
+  }
+  gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
+// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %one = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %stream = llvm.alloca %one x i64 : (i64) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    %token = cuf.stream_cast %stream : !llvm.ptr
+    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5()  {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod  [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 319991546d3fe..028279832c703 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -154,5 +154,5 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL: func.func @_QQmain()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
 // CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : <i64>
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref<i64>
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
diff --git a/flang/test/Fir/CUDA/cuda-stream.mlir b/flang/test/Fir/CUDA/cuda-stream.mlir
index 50f230467854b..a501603fd35d1 100644
--- a/flang/test/Fir/CUDA/cuda-stream.mlir
+++ b/flang/test/Fir/CUDA/cuda-stream.mlir
@@ -17,5 +17,5 @@ module attributes {gpu.container_module} {
 
 // CHECK-LABEL: func.func @_QMmod1Phost_sub()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : <i64>
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : !fir.ref<i64>
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMmod1Psub1

@clementval clementval merged commit d79bb93 into llvm:main Apr 18, 2025
11 checks passed
@clementval clementval deleted the cuf_async_token branch April 18, 2025 17:44
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
flang:fir-hlfir flang Flang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants