Skip to content

[Flang][OpenMP] Restructure recursive lowering in createBodyOfOp #77761

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions flang/include/flang/Lower/OpenMP.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ struct Variable;
} // namespace pft

// Generate the OpenMP terminator for Operation at Location.
void genOpenMPTerminator(fir::FirOpBuilder &, mlir::Operation *,
mlir::Location);
mlir::Operation *genOpenMPTerminator(fir::FirOpBuilder &, mlir::Operation *,
mlir::Location);

void genOpenMPConstruct(AbstractConverter &, Fortran::lower::SymMap &,
semantics::SemanticsContext &, pft::Evaluation &,
Expand Down
144 changes: 103 additions & 41 deletions flang/lib/Lower/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "flang/Parser/parse-tree.h"
#include "flang/Semantics/openmp-directive-sets.h"
#include "flang/Semantics/tools.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Transforms/RegionUtils.h"
Expand Down Expand Up @@ -385,7 +386,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
// construct
mlir::OpBuilder::InsertPoint unstructuredSectionsIP =
firOpBuilder.saveInsertionPoint();
firOpBuilder.setInsertionPointToStart(&op->getRegion(0).back());
mlir::Operation *lastOper = op->getRegion(0).back().getTerminator();
firOpBuilder.setInsertionPoint(lastOper);
lastPrivIP = firOpBuilder.saveInsertionPoint();
firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP);
}
Expand Down Expand Up @@ -2135,15 +2137,6 @@ static mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
return converter.getFirOpBuilder().getIntegerType(loopVarTypeSize);
}

static void resetBeforeTerminator(fir::FirOpBuilder &firOpBuilder,
mlir::Operation *storeOp,
mlir::Block &block) {
if (storeOp)
firOpBuilder.setInsertionPointAfter(storeOp);
else
firOpBuilder.setInsertionPointToStart(&block);
}

static mlir::Operation *
createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter,
mlir::Location loc, mlir::Value indexVal,
Expand Down Expand Up @@ -2186,11 +2179,17 @@ static void createBodyOfOp(
const llvm::SmallVector<const Fortran::semantics::Symbol *> &args = {},
bool outerCombined = false, DataSharingProcessor *dsp = nullptr) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();

auto insertMarker = [](fir::FirOpBuilder &builder) {
mlir::Value undef = builder.create<fir::UndefOp>(builder.getUnknownLoc(),
builder.getIndexType());
return undef.getDefiningOp();
};

// If an argument for the region is provided then create the block with that
// argument. Also update the symbol's address with the mlir argument value.
// e.g. For loops the argument is the induction variable. And all further
// uses of the induction variable should use this mlir value.
mlir::Operation *storeOp = nullptr;
if (args.size()) {
std::size_t loopVarTypeSize = 0;
for (const Fortran::semantics::Symbol *arg : args)
Expand All @@ -2201,20 +2200,20 @@ static void createBodyOfOp(
firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs);
// The argument is not currently in memory, so make a temporary for the
// argument, and store it there, then bind that location to the argument.
mlir::Operation *storeOp = nullptr;
for (auto [argIndex, argSymbol] : llvm::enumerate(args)) {
mlir::Value indexVal =
fir::getBase(op.getRegion().front().getArgument(argIndex));
storeOp =
createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
}
firOpBuilder.setInsertionPointAfter(storeOp);
} else {
firOpBuilder.createBlock(&op.getRegion());
}
// Set the insert for the terminator operation to go at the end of the
// block - this is either empty or the block with the stores above,
// the end of the block works for both.
mlir::Block &block = op.getRegion().back();
firOpBuilder.setInsertionPointToEnd(&block);

// Mark the earliest insertion point.
mlir::Operation *marker = insertMarker(firOpBuilder);

// If it is an unstructured region and is not the outer region of a combined
// construct, create empty blocks for all evaluations.
Expand All @@ -2223,37 +2222,100 @@ static void createBodyOfOp(
mlir::omp::YieldOp>(
firOpBuilder, eval.getNestedEvaluations());

// Insert the terminator.
Fortran::lower::genOpenMPTerminator(firOpBuilder, op.getOperation(), loc);
// Reset the insert point to before the terminator.
resetBeforeTerminator(firOpBuilder, storeOp, block);
// Start with privatization, so that the lowering of the nested
// code will use the right symbols.
constexpr bool isLoop = std::is_same_v<Op, mlir::omp::WsLoopOp> ||
std::is_same_v<Op, mlir::omp::SimdLoopOp>;
bool privatize = clauses && !outerCombined;

// Handle privatization. Do not privatize if this is the outer operation.
if (clauses && !outerCombined) {
constexpr bool isLoop = std::is_same_v<Op, mlir::omp::WsLoopOp> ||
std::is_same_v<Op, mlir::omp::SimdLoopOp>;
firOpBuilder.setInsertionPoint(marker);
std::optional<DataSharingProcessor> tempDsp;
if (privatize) {
if (!dsp) {
DataSharingProcessor proc(converter, *clauses, eval);
proc.processStep1();
proc.processStep2(op, isLoop);
} else {
if (isLoop && args.size() > 0)
dsp->setLoopIV(converter.getSymbolAddress(*args[0]));
dsp->processStep2(op, isLoop);
tempDsp.emplace(converter, *clauses, eval);
tempDsp->processStep1();
}

if (storeOp)
firOpBuilder.setInsertionPointAfter(storeOp);
}

if constexpr (std::is_same_v<Op, mlir::omp::ParallelOp>) {
threadPrivatizeVars(converter, eval);
if (clauses)
if (clauses) {
firOpBuilder.setInsertionPoint(marker);
ClauseProcessor(converter, *clauses).processCopyin();
}
}

if (genNested)
if (genNested) {
// genFIR(Evaluation&) tries to patch up unterminated blocks, causing
// a lot of trouble if the terminator generation is delayed past this
// point. Insert a temporary terminator here, then delete it.
firOpBuilder.setInsertionPointToEnd(&op.getRegion().back());
auto *temp = Fortran::lower::genOpenMPTerminator(firOpBuilder,
op.getOperation(), loc);
firOpBuilder.setInsertionPointAfter(marker);
genNestedEvaluations(converter, eval);
temp->erase();
}

// Get or create a unique exiting block from the given region, or
// return nullptr if there is no exiting block.
auto getUniqueExit = [&](mlir::Region &region) -> mlir::Block * {
// Find the blocks where the OMP terminator should go. In simple cases
// it is the single block in the operation's region. When the region
// is more complicated, especially with unstructured control flow, there
// may be multiple blocks, and some of them may have non-OMP terminators
// resulting from lowering of the code contained within the operation.
// All the remaining blocks are potential exit points from the op's region.
//
// Explicit control flow cannot exit any OpenMP region (other than via
// STOP), and that is enforced by semantic checks prior to lowering. STOP
// statements are lowered to a function call.

// Collect unterminated blocks.
llvm::SmallVector<mlir::Block *> exits;
for (mlir::Block &b : region) {
if (b.empty() || !b.back().hasTrait<mlir::OpTrait::IsTerminator>())
exits.push_back(&b);
}

if (exits.empty())
return nullptr;
// If there already is a unique exiting block, do not create another one.
// Additionally, some ops (e.g. omp.sections) require only 1 block in
// its region.
if (exits.size() == 1)
return exits[0];
mlir::Block *exit = firOpBuilder.createBlock(&region);
for (mlir::Block *b : exits) {
firOpBuilder.setInsertionPointToEnd(b);
firOpBuilder.create<mlir::cf::BranchOp>(loc, exit);
}
return exit;
};

if (auto *exitBlock = getUniqueExit(op.getRegion())) {
firOpBuilder.setInsertionPointToEnd(exitBlock);
auto *term = Fortran::lower::genOpenMPTerminator(firOpBuilder,
op.getOperation(), loc);
// Only insert lastprivate code when there actually is an exit block.
// Such a block may not exist if the nested code produced an infinite
// loop (this may not make sense in production code, but a user could
// write that and we should handle it).
Comment on lines +2300 to +2303
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it guaranteed that it is only when there is an infinite loop that no exit block exists?
Also who/what inserts the terminator for this case (when there is infinite loop)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We rely on the genEval for the nested code to do its thing. If we consider the nested code (PFT/Evaluation) on its own, it will have a single entry point, and zero or more exit points. If it creates multiple blocks, it will need to create its own branches to facilitate the control flow. After genEval we remove the temporary terminator, then we look at all the blocks created during genEval. We know that everything that is present inside of these blocks was created for the nested code (and not the enclosing construct we're lowering). Terminators for our op haven't been created yet, and they can only placed in blocks that don't yet have a terminator. If every block has a terminator then there is no way to leave the region by means of reaching the end of it (that is aside from function calls).

Based on that, the logical thing to do would be to insert a terminator into all non-terminated blocks. This would work even if there is always at most one such block.

The assertion that there can only be one comes mostly from two things:

  • the requirement from the OpenMP standard that a structured block exits at the bottom of the program text, and
  • the current lowering seems to provide a single FIR location that corresponds to the "bottom".

If we can't be sure that the assertion is correct, we should be able to remove it, and process all exiting blocks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I'm going to do is to create a unique exiting block: create a new block, and for each yet-unterminated block, will insert a branch to the new one. Then insert the OMP terminator into it, and process the lastprivate in there.

This will get rid of the assertion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Makes sense.

firOpBuilder.setInsertionPoint(term);
if (privatize) {
if (!dsp) {
assert(tempDsp.has_value());
tempDsp->processStep2(op, isLoop);
} else {
if (isLoop && args.size() > 0)
dsp->setLoopIV(converter.getSymbolAddress(*args[0]));
dsp->processStep2(op, isLoop);
}
}
}

firOpBuilder.setInsertionPointAfter(marker);
marker->erase();
}

static void genBodyOfTargetDataOp(
Expand Down Expand Up @@ -3685,14 +3747,14 @@ genOMP(Fortran::lower::AbstractConverter &converter,
// Public functions
//===----------------------------------------------------------------------===//

void Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder,
mlir::Operation *op,
mlir::Location loc) {
mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder,
mlir::Operation *op,
mlir::Location loc) {
if (mlir::isa<mlir::omp::WsLoopOp, mlir::omp::ReductionDeclareOp,
mlir::omp::AtomicUpdateOp, mlir::omp::SimdLoopOp>(op))
builder.create<mlir::omp::YieldOp>(loc);
return builder.create<mlir::omp::YieldOp>(loc);
else
builder.create<mlir::omp::TerminatorOp>(loc);
return builder.create<mlir::omp::TerminatorOp>(loc);
}

void Fortran::lower::genOpenMPConstruct(
Expand Down
6 changes: 3 additions & 3 deletions flang/test/Lower/OpenMP/FIR/sections.f90
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,11 @@ subroutine lastprivate()
x = x * 10
!CHECK: omp.section {
!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
!CHECK: %[[true:.*]] = arith.constant true
!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: %[[const:.*]] = arith.constant 1 : i32
!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: %[[true:.*]] = arith.constant true
!CHECK: fir.if %[[true]] {
!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
Expand Down Expand Up @@ -163,11 +163,11 @@ subroutine lastprivate()
!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: omp.barrier
!CHECK: %[[true:.*]] = arith.constant true
!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: %[[const:.*]] = arith.constant 1 : i32
!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: %[[true:.*]] = arith.constant true
!CHECK: fir.if %true {
!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
Expand Down Expand Up @@ -200,11 +200,11 @@ subroutine lastprivate()
!CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref<i32>
!CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: omp.barrier
!CHECK: %[[true:.*]] = arith.constant true
!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: %[[const:.*]] = arith.constant 1 : i32
!CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32
!CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: %[[true:.*]] = arith.constant true
!CHECK: fir.if %true {
!CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref<i32>
!CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref<i32>
Expand Down
26 changes: 26 additions & 0 deletions flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
! RUN: bbc -fopenmp -o - %s | FileCheck %s

! Check that this test can be lowered successfully.
! See https://github.com/llvm/llvm-project/issues/74348

! CHECK-LABEL: func.func @_QPsb
! CHECK: omp.parallel
! CHECK: cf.cond_br %{{[0-9]+}}, ^bb1, ^bb2
! CHECK-NEXT: ^bb1: // pred: ^bb0
! CHECK: cf.br ^bb2
! CHECK-NEXT: ^bb2: // 3 preds: ^bb0, ^bb1, ^bb2
! CHECK-NEXT: cf.br ^bb2
! CHECK-NEXT: }

subroutine sb(ninter, numnod)
integer :: ninter, numnod
integer, dimension(:), allocatable :: indx_nm

!$omp parallel
if (ninter>0) then
allocate(indx_nm(numnod))
endif
220 continue
goto 220
!$omp end parallel
end subroutine
6 changes: 3 additions & 3 deletions flang/test/Lower/OpenMP/sections.f90
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,11 @@ subroutine lastprivate()
!CHECK: omp.section {
!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"}
!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
!CHECK: %[[TRUE:.*]] = arith.constant true
!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[CONST:.*]] = arith.constant 1 : i32
!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
!CHECK: %[[TRUE:.*]] = arith.constant true
!CHECK: fir.if %[[TRUE]] {
!CHECK: %[[TEMP1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
!CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
Expand Down Expand Up @@ -180,11 +180,11 @@ subroutine lastprivate()
!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
!CHECK: omp.barrier
!CHECK: %[[TRUE:.*]] = arith.constant true
!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[CONST:.*]] = arith.constant 1 : i32
!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
!CHECK: %[[TRUE:.*]] = arith.constant true
!CHECK: fir.if %[[TRUE]] {
!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
Expand Down Expand Up @@ -219,11 +219,11 @@ subroutine lastprivate()
!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
!CHECK: omp.barrier
!CHECK: %[[TRUE:.*]] = arith.constant true
!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
!CHECK: %[[CONST:.*]] = arith.constant 1 : i32
!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
!CHECK: %[[TRUE:.*]] = arith.constant true
!CHECK: fir.if %[[TRUE]] {
!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
Expand Down
61 changes: 61 additions & 0 deletions flang/test/Lower/OpenMP/wsloop-unstructured.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s

subroutine sub(imax, jmax, x, y)
integer, intent(in) :: imax, jmax
real, intent(in), dimension(1:imax, 1:jmax) :: x, y

integer :: i, j, ii

! collapse(2) is needed to reproduce the issue
!$omp parallel do collapse(2)
do j = 1, jmax
do i = 1, imax
do ii = 1, imax ! note that this loop is not collapsed
if (x(i,j) < y(ii,j)) then
! exit needed to force unstructured control flow
exit
endif
enddo
enddo
enddo
end subroutine sub

! this is testing that we don't crash generating code for this: in particular
! that all blocks are terminated

! CHECK-LABEL: func.func @_QPsub(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "imax"},
! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "jmax"},
! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "x"},
! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref<!fir.array<?x?xf32>> {fir.bindc_name = "y"}) {
! [...]
! CHECK: omp.wsloop for (%[[VAL_53:.*]], %[[VAL_54:.*]]) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) {
! [...]
! CHECK: cf.br ^bb1
! CHECK: ^bb1:
! CHECK: cf.br ^bb2
! CHECK: ^bb2:
! [...]
! CHECK: cf.br ^bb3
! CHECK: ^bb3:
! [...]
! CHECK: %[[VAL_63:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} : i32
! CHECK: cf.cond_br %[[VAL_63]], ^bb4, ^bb7
! CHECK: ^bb4:
! [...]
! CHECK: %[[VAL_76:.*]] = arith.cmpf olt, %{{.*}}, %{{.*}} fastmath<contract> : f32
! CHECK: cf.cond_br %[[VAL_76]], ^bb5, ^bb6
! CHECK: ^bb5:
! CHECK: cf.br ^bb7
! CHECK: ^bb6:
! [...]
! CHECK: cf.br ^bb3
! CHECK: ^bb7:
! CHECK: omp.yield
! CHECK: }
! CHECK: omp.terminator
! CHECK: }
! CHECK: cf.br ^bb1
! CHECK: ^bb1:
! CHECK: return
! CHECK: }