-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[flang][cuda] Lower ALLOCATE for device variable #88980
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: Replace the runtime call to `AllocatableAllocate` for CUDA device variables with the newly added `fir.cuda_allocate` operation. Full diff: https://github.com/llvm/llvm-project/pull/88980.diff 2 Files Affected:
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 42e78fc96e4445..1d434d512d0c5c 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -14,6 +14,7 @@
#include "flang/Evaluate/tools.h"
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/ConvertType.h"
+#include "flang/Lower/ConvertVariable.h"
#include "flang/Lower/IterationSpace.h"
#include "flang/Lower/Mangler.h"
#include "flang/Lower/OpenACC.h"
@@ -368,20 +369,17 @@ class AllocateStmtHelper {
[&](const Fortran::parser::AllocOpt::Mold &mold) {
moldExpr = Fortran::semantics::GetExpr(mold.v.value());
},
- [&](const Fortran::parser::AllocOpt::Stream &) {
- TODO(loc, "CUDA ALLOCATE(STREAM=)");
+ [&](const Fortran::parser::AllocOpt::Stream &stream) {
+ streamExpr = Fortran::semantics::GetExpr(stream.v.value());
},
- [&](const Fortran::parser::AllocOpt::Pinned &) {
- TODO(loc, "CUDA ALLOCATE(PINNED=)");
+ [&](const Fortran::parser::AllocOpt::Pinned &pinned) {
+ pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value());
},
},
allocOption.u);
}
void lowerAllocation(const Allocation &alloc) {
- if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
- TODO(loc, "Allocation of variable with CUDA attributes");
-
fir::MutableBoxValue boxAddr =
genMutableBoxValue(converter, loc, alloc.getAllocObj());
@@ -456,7 +454,8 @@ class AllocateStmtHelper {
const fir::MutableBoxValue &box) {
if (!box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
- !useAllocateRuntime && !box.isPointer()) {
+ !useAllocateRuntime && !box.isPointer() &&
+ !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
// Pointers must use PointerAllocate so that their deallocations
// can be validated.
genInlinedAllocation(alloc, box);
@@ -472,7 +471,12 @@ class AllocateStmtHelper {
genSetType(alloc, box, loc);
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
- mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager);
+ mlir::Value stat;
+ if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+ stat = genRuntimeAllocate(builder, loc, box, errorManager);
+ else
+ stat =
+ genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
postAllocationAction(alloc);
errorManager.assignStat(builder, loc, stat);
@@ -602,7 +606,10 @@ class AllocateStmtHelper {
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
- if (isSource)
+ if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
+ stat =
+ genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
+ else if (isSource)
stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
else
stat = genRuntimeAllocate(builder, loc, box, errorManager);
@@ -717,6 +724,34 @@ class AllocateStmtHelper {
return nullptr;
}
+ mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc,
+ const fir::MutableBoxValue &box,
+ ErrorManager &errorManager,
+ const Fortran::semantics::Symbol &sym) {
+ Fortran::lower::StatementContext stmtCtx;
+ fir::CUDADataAttributeAttr cudaAttr =
+ Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
+ sym);
+ mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
+ mlir::Value stream =
+ streamExpr
+ ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
+ : nullptr;
+ mlir::Value pinned =
+ pinnedExpr
+ ? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx))
+ : nullptr;
+ mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr;
+
+ // Keep return type the same as a standard AllocatableAllocate call.
+ mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+ return builder
+ .create<fir::CUDAAllocateOp>(
+ loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
+ errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+ .getResult();
+ }
+
Fortran::lower::AbstractConverter &converter;
fir::FirOpBuilder &builder;
const Fortran::parser::AllocateStmt &stmt;
@@ -724,6 +759,8 @@ class AllocateStmtHelper {
const Fortran::lower::SomeExpr *moldExpr{nullptr};
const Fortran::lower::SomeExpr *statExpr{nullptr};
const Fortran::lower::SomeExpr *errMsgExpr{nullptr};
+ const Fortran::lower::SomeExpr *pinnedExpr{nullptr};
+ const Fortran::lower::SomeExpr *streamExpr{nullptr};
// If the allocate has a type spec, lenParams contains the
// value of the length parameters that were specified inside.
llvm::SmallVector<mlir::Value> lenParams;
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
new file mode 100644
index 00000000000000..55223011e8d9e9
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -0,0 +1,107 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test lowering of CUDA allocatable allocate/deallocate statements.
+
+subroutine sub1()
+ real, allocatable, device :: a(:)
+ allocate(a(10))
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub1()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub2()
+ real, allocatable, managed :: a(:)
+ integer :: istat
+ allocate(a(10), stat=istat)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub2()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+
+subroutine sub3()
+ integer, allocatable, pinned :: a(:,:)
+ logical :: plog
+ allocate(a(20,30), pinned = plog)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub3()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
+! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK-2: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
+
+subroutine sub4()
+ real, allocatable, unified :: a(:)
+ integer :: istream
+ allocate(a(10), stream=istream)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub4()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
+! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<unified>} -> i32
+
+subroutine sub5()
+ real, allocatable, device :: a(:)
+ real, allocatable :: b(:)
+ allocate(a, source=b)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub5()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub6()
+ real, allocatable, device :: a(:)
+ real, allocatable :: b(:)
+ allocate(a, mold=b)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub6()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: fir.call @_FortranAAllocatableApplyMold
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+
+subroutine sub7()
+ real, allocatable, device :: a(:)
+ integer :: istat
+ character(50) :: err
+ allocate(a(100), stat=istat, errmsg=err)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub7()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
+! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks great
Replace the runtime call to `AllocatableDeallocate` for CUDA device variables with the newly added `fir.cuda_deallocate` operation. This is similar to #88980. A third patch will handle the case of automatic deallocation of device allocatable variables.
Replace the runtime call to `AllocatableAllocate` for CUDA device variables with the newly added `fir.cuda_allocate` operation.