Skip to content

[flang][cuda] Lower ALLOCATE for device variable #88980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 17, 2024

Conversation

clementval
Copy link
Contributor

Replace the runtime call to AllocatableAllocate for CUDA device variables with the newly added fir.cuda_allocate operation.

@llvmbot llvmbot added flang Flang issues not falling into any other category flang:fir-hlfir labels Apr 16, 2024
@llvmbot
Copy link
Member

llvmbot commented Apr 16, 2024

@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Replace the runtime call to AllocatableAllocate for CUDA device variables with the newly added fir.cuda_allocate operation.


Full diff: https://github.com/llvm/llvm-project/pull/88980.diff

2 Files Affected:

  • (modified) flang/lib/Lower/Allocatable.cpp (+47-10)
  • (added) flang/test/Lower/CUDA/cuda-allocatable.cuf (+107)
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 42e78fc96e4445..1d434d512d0c5c 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -14,6 +14,7 @@
 #include "flang/Evaluate/tools.h"
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/ConvertType.h"
+#include "flang/Lower/ConvertVariable.h"
 #include "flang/Lower/IterationSpace.h"
 #include "flang/Lower/Mangler.h"
 #include "flang/Lower/OpenACC.h"
@@ -368,20 +369,17 @@ class AllocateStmtHelper {
               [&](const Fortran::parser::AllocOpt::Mold &mold) {
                 moldExpr = Fortran::semantics::GetExpr(mold.v.value());
               },
-              [&](const Fortran::parser::AllocOpt::Stream &) {
-                TODO(loc, "CUDA ALLOCATE(STREAM=)");
+              [&](const Fortran::parser::AllocOpt::Stream &stream) {
+                streamExpr = Fortran::semantics::GetExpr(stream.v.value());
               },
-              [&](const Fortran::parser::AllocOpt::Pinned &) {
-                TODO(loc, "CUDA ALLOCATE(PINNED=)");
+              [&](const Fortran::parser::AllocOpt::Pinned &pinned) {
+                pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value());
               },
           },
           allocOption.u);
   }
 
   void lowerAllocation(const Allocation &alloc) {
-    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
-      TODO(loc, "Allocation of variable with CUDA attributes");
-
     fir::MutableBoxValue boxAddr =
         genMutableBoxValue(converter, loc, alloc.getAllocObj());
 
@@ -456,7 +454,8 @@ class AllocateStmtHelper {
                            const fir::MutableBoxValue &box) {
     if (!box.isDerived() && !errorManager.hasStatSpec() &&
         !alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
-        !useAllocateRuntime && !box.isPointer()) {
+        !useAllocateRuntime && !box.isPointer() &&
+        !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
       // Pointers must use PointerAllocate so that their deallocations
       // can be validated.
       genInlinedAllocation(alloc, box);
@@ -472,7 +471,12 @@ class AllocateStmtHelper {
       genSetType(alloc, box, loc);
     genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
-    mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager);
+    mlir::Value stat;
+    if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+      stat = genRuntimeAllocate(builder, loc, box, errorManager);
+    else
+      stat =
+          genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
     fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
     postAllocationAction(alloc);
     errorManager.assignStat(builder, loc, stat);
@@ -602,7 +606,10 @@ class AllocateStmtHelper {
       genSetDeferredLengthParameters(alloc, box);
     genAllocateObjectBounds(alloc, box);
     mlir::Value stat;
-    if (isSource)
+    if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+      stat =
+          genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
+    else if (isSource)
       stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
     else
       stat = genRuntimeAllocate(builder, loc, box, errorManager);
@@ -717,6 +724,34 @@ class AllocateStmtHelper {
     return nullptr;
   }
 
+  /// Emit a fir::CUDAAllocateOp for \p box and return the op result.
+  /// The CUDA data attribute attached to the op is translated from \p sym.
+  /// The errmsg, stream, pinned and source operands are passed only when the
+  /// matching specifier expression was recorded on this helper; otherwise a
+  /// null value is passed for that operand.
+  mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc,
+                              const fir::MutableBoxValue &box,
+                              ErrorManager &errorManager,
+                              const Fortran::semantics::Symbol &sym) {
+    // Statement context handed to the STREAM=/PINNED= expression lowering.
+    Fortran::lower::StatementContext stmtCtx;
+    fir::CUDADataAttributeAttr cudaAttr =
+        Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+                                                         sym);
+    mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
+    // STREAM= is lowered to a value.
+    mlir::Value stream =
+        streamExpr
+            ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
+            : nullptr;
+    // PINNED= is lowered to an address rather than a value.
+    // NOTE(review): presumably so the operation can store into it - confirm.
+    mlir::Value pinned =
+        pinnedExpr
+            ? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx))
+            : nullptr;
+    mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr;
+
+    // Keep return type the same as a standard AllocatableAllocate call.
+    mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+    // The trailing unit attribute is only set when errorManager reports a
+    // STAT= specifier; it is left null otherwise.
+    return builder
+        .create<fir::CUDAAllocateOp>(
+            loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
+            errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+        .getResult();
+  }
+
   Fortran::lower::AbstractConverter &converter;
   fir::FirOpBuilder &builder;
   const Fortran::parser::AllocateStmt &stmt;
@@ -724,6 +759,8 @@ class AllocateStmtHelper {
   const Fortran::lower::SomeExpr *moldExpr{nullptr};
   const Fortran::lower::SomeExpr *statExpr{nullptr};
   const Fortran::lower::SomeExpr *errMsgExpr{nullptr};
+  const Fortran::lower::SomeExpr *pinnedExpr{nullptr};
+  const Fortran::lower::SomeExpr *streamExpr{nullptr};
   // If the allocate has a type spec, lenParams contains the
   // value of the length parameters that were specified inside.
   llvm::SmallVector<mlir::Value> lenParams;
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
new file mode 100644
index 00000000000000..55223011e8d9e9
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -0,0 +1,107 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test lowering of CUDA allocatable allocate/deallocate statements.
+
+subroutine sub1()  ! case: DEVICE allocatable, ALLOCATE with bounds only
+  real, allocatable, device :: a(:)
+  allocate(a(10))  ! no STAT=, ERRMSG=, STREAM=, PINNED=, SOURCE= or MOLD=
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub1()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub2()  ! case: MANAGED allocatable with STAT=
+  real, allocatable, managed :: a(:)
+  integer :: istat
+  allocate(a(10), stat=istat)  ! the op result is expected to be stored to istat
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub2()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+
+subroutine sub3()  ! case: rank-2 PINNED allocatable with PINNED=
+  integer, allocatable, pinned :: a(:,:)
+  logical :: plog
+  allocate(a(20,30), pinned = plog)  ! plog is expected to reach the op by reference
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub3()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %[[PLOG]] {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK-COUNT-2: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
+
+subroutine sub4()  ! case: UNIFIED allocatable with STREAM=
+  real, allocatable, unified :: a(:)
+  integer :: istream
+  allocate(a(10), stream=istream)  ! istream is expected to be loaded and passed by value
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub4()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
+! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<unified>} -> i32
+
+subroutine sub5()  ! case: DEVICE allocatable with SOURCE= from a host allocatable
+  real, allocatable, device :: a(:)
+  real, allocatable :: b(:)
+  allocate(a, source=b)  ! the loaded box of b is expected as the op's source operand
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub5()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX_A]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub6()  ! case: DEVICE allocatable with MOLD= from a host allocatable
+  real, allocatable, device :: a(:)
+  real, allocatable :: b(:)
+  allocate(a, mold=b)  ! the expected op has no source operand; ApplyMold is called first
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub6()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX_A]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: fir.call @_FortranAAllocatableApplyMold
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub7()  ! case: DEVICE allocatable with both STAT= and ERRMSG=
+  real, allocatable, device :: a(:)
+  integer :: istat
+  character(50) :: err
+  allocate(a(100), stat=istat, errmsg=err)  ! err is expected to be emboxed and passed as errmsg
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub7()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
+! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>

Copy link
Contributor

@jeanPerier jeanPerier left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks great

@clementval clementval merged commit da70f2c into llvm:main Apr 17, 2024
7 checks passed
clementval added a commit that referenced this pull request Apr 17, 2024
Replace the runtime call to `AllocatableDeallocate` for CUDA device
variables with the newly added `fir.cuda_deallocate` operation.

This is similar to #88980.

A third patch will handle the case of automatic deallocation of device
allocatable variables.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
flang:fir-hlfir flang Flang issues not falling into any other category
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants