Skip to content

[flang][cuda] Lower ALLOCATE for device variable #88980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 47 additions & 10 deletions flang/lib/Lower/Allocatable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "flang/Evaluate/tools.h"
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
#include "flang/Lower/IterationSpace.h"
#include "flang/Lower/Mangler.h"
#include "flang/Lower/OpenACC.h"
Expand Down Expand Up @@ -368,20 +369,17 @@ class AllocateStmtHelper {
[&](const Fortran::parser::AllocOpt::Mold &mold) {
moldExpr = Fortran::semantics::GetExpr(mold.v.value());
},
[&](const Fortran::parser::AllocOpt::Stream &) {
TODO(loc, "CUDA ALLOCATE(STREAM=)");
[&](const Fortran::parser::AllocOpt::Stream &stream) {
streamExpr = Fortran::semantics::GetExpr(stream.v.value());
},
[&](const Fortran::parser::AllocOpt::Pinned &) {
TODO(loc, "CUDA ALLOCATE(PINNED=)");
[&](const Fortran::parser::AllocOpt::Pinned &pinned) {
pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value());
},
},
allocOption.u);
}

void lowerAllocation(const Allocation &alloc) {
if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
TODO(loc, "Allocation of variable with CUDA attributes");

fir::MutableBoxValue boxAddr =
genMutableBoxValue(converter, loc, alloc.getAllocObj());

Expand Down Expand Up @@ -456,7 +454,8 @@ class AllocateStmtHelper {
const fir::MutableBoxValue &box) {
if (!box.isDerived() && !errorManager.hasStatSpec() &&
!alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() &&
!useAllocateRuntime && !box.isPointer()) {
!useAllocateRuntime && !box.isPointer() &&
!Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
// Pointers must use PointerAllocate so that their deallocations
// can be validated.
genInlinedAllocation(alloc, box);
Expand All @@ -472,7 +471,12 @@ class AllocateStmtHelper {
genSetType(alloc, box, loc);
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager);
mlir::Value stat;
if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
stat = genRuntimeAllocate(builder, loc, box, errorManager);
else
stat =
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
fir::factory::syncMutableBoxFromIRBox(builder, loc, box);
postAllocationAction(alloc);
errorManager.assignStat(builder, loc, stat);
Expand Down Expand Up @@ -602,7 +606,10 @@ class AllocateStmtHelper {
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
if (isSource)
if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()))
stat =
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
else if (isSource)
stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager);
else
stat = genRuntimeAllocate(builder, loc, box, errorManager);
Expand Down Expand Up @@ -717,13 +724,43 @@ class AllocateStmtHelper {
return nullptr;
}

mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc,
const fir::MutableBoxValue &box,
ErrorManager &errorManager,
const Fortran::semantics::Symbol &sym) {
Fortran::lower::StatementContext stmtCtx;
fir::CUDADataAttributeAttr cudaAttr =
Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
sym);
mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
mlir::Value stream =
streamExpr
? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx))
: nullptr;
mlir::Value pinned =
pinnedExpr
? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx))
: nullptr;
mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr;

// Keep return type the same as a standard AllocatableAllocate call.
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
return builder
.create<fir::CUDAAllocateOp>(
loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
.getResult();
}

Fortran::lower::AbstractConverter &converter;
fir::FirOpBuilder &builder;
const Fortran::parser::AllocateStmt &stmt;
const Fortran::lower::SomeExpr *sourceExpr{nullptr};
const Fortran::lower::SomeExpr *moldExpr{nullptr};
const Fortran::lower::SomeExpr *statExpr{nullptr};
const Fortran::lower::SomeExpr *errMsgExpr{nullptr};
const Fortran::lower::SomeExpr *pinnedExpr{nullptr};
const Fortran::lower::SomeExpr *streamExpr{nullptr};
// If the allocate has a type spec, lenParams contains the
// value of the length parameters that were specified inside.
llvm::SmallVector<mlir::Value> lenParams;
Expand Down
107 changes: 107 additions & 0 deletions flang/test/Lower/CUDA/cuda-allocatable.cuf
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s

! Test lowering of CUDA allocatable allocate/deallocate statements.

subroutine sub1()
real, allocatable, device :: a(:)
allocate(a(10))
end subroutine

! CHECK-LABEL: func.func @_QPsub1()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32

subroutine sub2()
real, allocatable, managed :: a(:)
integer :: istat
allocate(a(10), stat=istat)
end subroutine

! CHECK-LABEL: func.func @_QPsub2()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>

subroutine sub3()
integer, allocatable, pinned :: a(:,:)
logical :: plog
allocate(a(20,30), pinned = plog)
end subroutine

! CHECK-LABEL: func.func @_QPsub3()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK-2: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32

subroutine sub4()
real, allocatable, unified :: a(:)
integer :: istream
allocate(a(10), stream=istream)
end subroutine

! CHECK-LABEL: func.func @_QPsub4()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<unified>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<unified>} -> i32

subroutine sub5()
real, allocatable, device :: a(:)
real, allocatable :: b(:)
allocate(a, source=b)
end subroutine

! CHECK-LABEL: func.func @_QPsub5()
! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32

subroutine sub6()
real, allocatable, device :: a(:)
real, allocatable :: b(:)
allocate(a, mold=b)
end subroutine

! CHECK-LABEL: func.func @_QPsub6()
! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableApplyMold
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32

subroutine sub7()
real, allocatable, device :: a(:)
integer :: istat
character(50) :: err
allocate(a(100), stat=istat, errmsg=err)
end subroutine

! CHECK-LABEL: func.func @_QPsub7()
! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
Loading