Skip to content

Conversation

@clementval
Copy link
Contributor

No description provided.

@clementval clementval requested a review from wangzpgi July 14, 2025 20:43
@llvmbot llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Jul 14, 2025
@llvmbot
Copy link
Member

llvmbot commented Jul 14, 2025

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/148717.diff

9 Files Affected:

  • (modified) flang-rt/lib/cuda/descriptor.cpp (+9)
  • (modified) flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp (+10)
  • (modified) flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h (+4)
  • (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+21)
  • (modified) flang/include/flang/Runtime/CUDA/descriptor.h (+4)
  • (modified) flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp (+15)
  • (modified) flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp (+11)
  • (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+31-2)
  • (modified) flang/test/Fir/CUDA/cuda-alloc-free.fir (+15)
diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index aa75d4eff0511..f81316cc01730 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -62,6 +62,15 @@ void RTDEF(CUFDescriptorCheckSection)(
   }
 }
 
+// Store `index` as the allocator index of the descriptor `desc`.
+// Crashes, reporting `sourceFile`/`sourceLine`, when `desc` is null.
+void RTDEF(CUFSetAllocatorIndex)(
+    Descriptor *desc, int index, const char *sourceFile, int sourceLine) {
+  if (!desc) {
+    Terminator terminator{sourceFile, sourceLine};
+    terminator.Crash("descriptor is null");
+  }
+  desc->SetAllocIdx(index);
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
index f1f931e87a86e..83aa37f8d06f3 100644
--- a/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
+++ b/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp
@@ -72,3 +72,13 @@ TEST(AllocatableCUFTest, DescriptorAllocationTest) {
   EXPECT_TRUE(desc != nullptr);
   RTNAME(CUFFreeDescriptor)(desc);
 }
+
+TEST(AllocatableCUFTest, CUFSetAllocatorIndex) {
+  using Fortran::common::TypeCategory;
+  RTNAME(CUFRegisterAllocator)();
+  // REAL(4), DEVICE, ALLOCATABLE :: a(:)
+  auto a{createAllocatable(TypeCategory::Real, 4)};
+  // Before the call the descriptor is expected to use the default allocator.
+  EXPECT_EQ((int)kDefaultAllocator, a->GetAllocIdx());
+  // The entry point takes a `Descriptor *`, so pass the address of the
+  // descriptor rather than the dereferenced object.
+  RTNAME(CUFSetAllocatorIndex)(&*a, kDeviceAllocatorPos, __FILE__, __LINE__);
+  EXPECT_EQ((int)kDeviceAllocatorPos, a->GetAllocIdx());
+}
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
index bdeb7574012c6..43dca65322a62 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
@@ -31,6 +31,10 @@ void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
 void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc,
                                mlir::Value desc);
 
+/// Generate runtime call to set the allocator index in the descriptor.
+/// \p desc is the descriptor to update and \p index is the allocator index
+/// value passed to the runtime.
+void genSetAllocatorIndex(fir::FirOpBuilder &builder, mlir::Location loc,
+                          mlir::Value desc, mlir::Value index);
+
 } // namespace fir::runtime::cuda
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index e38738230ffbc..4eb35b477b0b0 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -388,4 +388,25 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
   let hasVerifier = 1;
 }
 
+def cuf_SetAllocatorIndexOp : cuf_Op<"set_allocator_idx", []> {
+  let summary = "Set the allocator index in a descriptor";
+
+  let description = [{
+    The allocator index in the Fortran descriptor is used to retrieve the
+    correct CUDA allocator to allocate the memory on the device.
+    In many cases the allocator index is set when the descriptor is created.
+    For device components, the descriptor is part of the derived-type itself,
+    so its allocator index needs to be set after the derived-type is allocated
+    in managed memory.
+  }];
+
+  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
+      cuf_DataAttributeAttr:$data_attr);
+
+  let assemblyFormat = [{
+    $box `:` qualified(type($box)) attr-dict
+  }];
+
+  let hasVerifier = 1;
+}
+
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index 06e4a4649db1b..7555f276ac1de 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -41,6 +41,10 @@ void RTDECL(CUFSyncGlobalDescriptor)(
 void RTDECL(CUFDescriptorCheckSection)(
     const Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Set the allocator index with the provided value.
+/// The runtime crashes if the descriptor is null; \p sourceFile and
+/// \p sourceLine are handed to the terminator used for that crash.
+void RTDECL(CUFSetAllocatorIndex)(Descriptor *, int index,
+    const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
index a943469a76728..62a0652cc2e5d 100644
--- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
@@ -47,3 +47,18 @@ void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder,
       builder, loc, fTy, desc, sourceFile, sourceLine)};
   builder.create<fir::CallOp>(loc, func, args);
 }
+
+/// Emit a call to the CUFSetAllocatorIndex runtime function, forwarding
+/// \p desc and \p index together with the file name and line number derived
+/// from \p loc.
+void fir::runtime::cuda::genSetAllocatorIndex(fir::FirOpBuilder &builder,
+                                              mlir::Location loc,
+                                              mlir::Value desc,
+                                              mlir::Value index) {
+  mlir::func::FuncOp callee =
+      fir::runtime::getRuntimeFunc<mkRTKey(CUFSetAllocatorIndex)>(loc, builder);
+  mlir::FunctionType calleeTy = callee.getFunctionType();
+  mlir::Value fileName = fir::factory::locationToFilename(builder, loc);
+  // Input #3 of the runtime signature is the source line argument.
+  mlir::Value lineNo =
+      fir::factory::locationToLineNo(builder, loc, calleeTy.getInput(3));
+  llvm::SmallVector<mlir::Value> callArgs{fir::runtime::createArguments(
+      builder, loc, calleeTy, desc, index, fileName, lineNo)};
+  builder.create<fir::CallOp>(loc, callee, callArgs);
+}
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 687007d957225..ade80716f2561 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -345,6 +345,17 @@ llvm::LogicalResult cuf::StreamCastOp::verify() {
   return checkStreamType(*this);
 }
 
+//===----------------------------------------------------------------------===//
+// SetAllocatorIndexOp
+//===----------------------------------------------------------------------===//
+
+/// Accept the op only when its operand, once reference types are unwrapped,
+/// is a box (or class) type.
+llvm::LogicalResult cuf::SetAllocatorIndexOp::verify() {
+  mlir::Type unwrappedTy = fir::unwrapRefType(getBox().getType());
+  if (mlir::isa<fir::BaseBoxType>(unwrappedTy))
+    return mlir::success();
+  return emitOpError(
+      "expect box to be a reference to class or box type value");
+}
+
 // Tablegen operators
 
 #define GET_OP_CLASSES
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 0fff06033b73d..750569c126642 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -22,6 +22,7 @@
 #include "flang/Runtime/CUDA/memory.h"
 #include "flang/Runtime/CUDA/pointer.h"
 #include "flang/Runtime/allocatable.h"
+#include "flang/Runtime/allocator-registry-consts.h"
 #include "flang/Support/Fortran.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
@@ -923,6 +924,34 @@ struct CUFSyncDescriptorOpConversion
   }
 };
 
+/// Lower cuf.set_allocator_idx to a call to the CUFSetAllocatorIndex runtime
+/// function, translating the op's data attribute into an allocator index.
+struct CUFSetAllocatorIndexOpConversion
+    : public mlir::OpRewritePattern<cuf::SetAllocatorIndexOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::SetAllocatorIndexOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    auto mod = op->getParentOfType<mlir::ModuleOp>();
+    fir::FirOpBuilder builder(rewriter, mod);
+    mlir::Location loc = op.getLoc();
+    // Data attributes not listed below keep the default allocator index.
+    int idx = kDefaultAllocator;
+    const auto dataAttr = op.getDataAttr();
+    if (dataAttr == cuf::DataAttribute::Device) {
+      idx = kDeviceAllocatorPos;
+    } else if (dataAttr == cuf::DataAttribute::Managed) {
+      idx = kManagedAllocatorPos;
+    } else if (dataAttr == cuf::DataAttribute::Unified) {
+      idx = kUnifiedAllocatorPos;
+    } else if (dataAttr == cuf::DataAttribute::Pinned) {
+      idx = kPinnedAllocatorPos;
+    }
+    mlir::Value index =
+        builder.createIntegerConstant(loc, builder.getI32Type(), idx);
+    fir::runtime::cuda::genSetAllocatorIndex(builder, loc, op.getBox(), index);
+    // Erase through the rewriter, not op.erase(), so the pattern driver is
+    // informed of the IR mutation.
+    rewriter.eraseOp(op);
+    return mlir::success();
+  }
+};
+
 class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
 public:
   void runOnOperation() override {
@@ -984,8 +1013,8 @@ void cuf::populateCUFToFIRConversionPatterns(
     const mlir::SymbolTable &symtab, mlir::RewritePatternSet &patterns) {
   patterns.insert<CUFAllocOpConversion>(patterns.getContext(), &dl, &converter);
   patterns.insert<CUFAllocateOpConversion, CUFDeallocateOpConversion,
-                  CUFFreeOpConversion, CUFSyncDescriptorOpConversion>(
-      patterns.getContext());
+                  CUFFreeOpConversion, CUFSyncDescriptorOpConversion,
+                  CUFSetAllocatorIndexOpConversion>(patterns.getContext());
   patterns.insert<CUFDataTransferOpConversion>(patterns.getContext(), symtab,
                                                &dl, &converter);
   patterns.insert<CUFLaunchOpConversion, CUFDeviceAddressOpConversion>(
diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir
index 31f2ed022b6c4..8b6e7d67931df 100644
--- a/flang/test/Fir/CUDA/cuda-alloc-free.fir
+++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir
@@ -94,4 +94,19 @@ func.func @_QQalloc_char() attributes {fir.bindc_name = "alloc_char"} {
 // CHECK: %[[BYTES_CONV:.*]] = fir.convert %[[BYTES]] : (index) -> i64
 // CHECK: fir.call @_FortranACUFMemAlloc(%[[BYTES_CONV]], %c0{{.*}}, %{{.*}}, %{{.*}}) {cuf.data_attr = #cuf.cuda<device>} : (i64, i32, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
 
+
+// Test the conversion of cuf.set_allocator_idx applied to the box component
+// of a derived type allocated with the managed data attribute.
+func.func @_QQsetalloc() {
+  %0 = cuf.alloc !fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}> {bindc_name = "d1", data_attr = #cuf.cuda<managed>, uniq_name = "_QFEd1"} -> !fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>
+  %1 = fir.coordinate_of %0, a2 : (!fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+  cuf.set_allocator_idx %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>}
+  return
+}
+
+// CHECK-LABEL:   func.func @_QQsetalloc() {
+// CHECK: %[[DT:.*]] = fir.call @_FortranACUFMemAlloc
+// CHECK: %[[CONV:.*]] = fir.convert %[[DT]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>
+// CHECK: %[[COMP:.*]] = fir.coordinate_of %[[CONV]], a2 : (!fir.ref<!fir.type<_QMm1Tdt1{a2:!fir.box<!fir.heap<!fir.array<?xf32>>>}>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK: %[[DESC:.*]] = fir.convert %[[COMP]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
+// CHECK: fir.call @_FortranACUFSetAllocatorIndex(%[[DESC]], %c2{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i8>, i32) -> ()
+
 } // end module

let description = [{
Allocator index in the Fortran descriptor is used to retrived the correct
CUDA allocator to allocate the memory on the device.
In many case the allocator index is set when the descriptor is created. For
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
In many case the allocator index is set when the descriptor is created. For
In many cases the allocator index is set when the descriptor is created. For

CUDA allocator to allocate the memory on the device.
In many case the allocator index is set when the descriptor is created. For
device components, the descriptor is part of the derived-type itself and
need to be set after the derived-type is allocated in managed memory.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
need to be set after the derived-type is allocated in managed memory.
needs to be set after the derived-type is allocated in managed memory.

@clementval clementval merged commit 2c67718 into llvm:main Jul 15, 2025
5 of 9 checks passed
@clementval clementval deleted the cuf_set_allocator_op branch July 15, 2025 00:23
// REAL(4), DEVICE, ALLOCATABLE :: a(:)
auto a{createAllocatable(TypeCategory::Real, 4)};
EXPECT_EQ((int)kDefaultAllocator, a->GetAllocIdx());
RTNAME(CUFSetAllocatorIndex)(*a, kDeviceAllocatorPos, __FILE__, __LINE__);
Copy link
Member

@Meinersbur Meinersbur Jul 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get a compilation error here (even after 9e9fdd4):

../../../../_src/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp:82:3: error: no matching function for call to '_FortranACUFSetAllocatorIndex'
   82 |   RTNAME(CUFSetAllocatorIndex)(*a, kDeviceAllocatorPos, __FILE__, __LINE__);
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/entry-names.h:27:22: note: expanded from macro 'RTNAME'
   27 | #define RTNAME(name) NAME_WITH_PREFIX_AND_REVISION(_Fortran, A, name)
      |                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/entry-names.h:26:3: note: expanded from macro 'NAME_WITH_PREFIX_AND_REVISION'
   26 |   prefix##revision##name
      |   ^~~~~~~~~~~~~~~~~~~~~~
<scratch space>:42:1: note: expanded from here
   42 | _FortranACUFSetAllocatorIndex
      | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/CUDA/descriptor.h:45:6: note: candidate function not viable: no known conversion from 'Fortran::runtime::Descriptor' to 'Descriptor *' for 1st argument; take the address of the argument with &
   45 | void RTDECL(CUFSetAllocatorIndex)(Descriptor *, int index,
      |      ^                            ~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/entry-names.h:31:35: note: expanded from macro 'RTDECL'
   31 | #define RTDECL(name) RT_API_ATTRS RTNAME(name)
      |                                   ^
../../../../_src/flang/include/flang/Runtime/entry-names.h:27:22: note: expanded from macro 'RTNAME'
   27 | #define RTNAME(name) NAME_WITH_PREFIX_AND_REVISION(_Fortran, A, name)
      |                      ^
../../../../_src/flang/include/flang/Runtime/entry-names.h:26:3: note: expanded from macro 'NAME_WITH_PREFIX_AND_REVISION'
   26 |   prefix##revision##name
      |   ^
<scratch space>:16:1: note: expanded from here
   16 | _FortranACUFSetAllocatorIndex
      | ^
1 error generated.

Did you mean

Suggested change
RTNAME(CUFSetAllocatorIndex)(*a, kDeviceAllocatorPos, __FILE__, __LINE__);
RTNAME(CUFSetAllocatorIndex)(a, kDeviceAllocatorPos, __FILE__, __LINE__);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get a compilation error here (even after 9e9fdd4):

../../../../_src/flang-rt/unittests/Runtime/CUDA/AllocatorCUF.cpp:82:3: error: no matching function for call to '_FortranACUFSetAllocatorIndex'
   82 |   RTNAME(CUFSetAllocatorIndex)(*a, kDeviceAllocatorPos, __FILE__, __LINE__);
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/entry-names.h:27:22: note: expanded from macro 'RTNAME'
   27 | #define RTNAME(name) NAME_WITH_PREFIX_AND_REVISION(_Fortran, A, name)
      |                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/entry-names.h:26:3: note: expanded from macro 'NAME_WITH_PREFIX_AND_REVISION'
   26 |   prefix##revision##name
      |   ^~~~~~~~~~~~~~~~~~~~~~
<scratch space>:42:1: note: expanded from here
   42 | _FortranACUFSetAllocatorIndex
      | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/CUDA/descriptor.h:45:6: note: candidate function not viable: no known conversion from 'Fortran::runtime::Descriptor' to 'Descriptor *' for 1st argument; take the address of the argument with &
   45 | void RTDECL(CUFSetAllocatorIndex)(Descriptor *, int index,
      |      ^                            ~~~~~~~~~~~~
../../../../_src/flang/include/flang/Runtime/entry-names.h:31:35: note: expanded from macro 'RTDECL'
   31 | #define RTDECL(name) RT_API_ATTRS RTNAME(name)
      |                                   ^
../../../../_src/flang/include/flang/Runtime/entry-names.h:27:22: note: expanded from macro 'RTNAME'
   27 | #define RTNAME(name) NAME_WITH_PREFIX_AND_REVISION(_Fortran, A, name)
      |                      ^
../../../../_src/flang/include/flang/Runtime/entry-names.h:26:3: note: expanded from macro 'NAME_WITH_PREFIX_AND_REVISION'
   26 |   prefix##revision##name
      |   ^
<scratch space>:16:1: note: expanded from here
   16 | _FortranACUFSetAllocatorIndex
      | ^
1 error generated.

Did you mean

Yeah, that's the correct form. I opened a small PR #150136. I'm wondering why it doesn't fail on my end. Anyway, thanks for reporting this.

clementval added a commit that referenced this pull request Jul 22, 2025
Fix building issue reported on #148717
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

flang:fir-hlfir flang Flang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants