Skip to content

Commit ef56b53

Browse files
authored
[flang][OpenMP] Extend do concurrent mapping to multi-range loops (#127634)
Adds support for converting multi-range loops to OpenMP (on the host only for now). The changes here "prepare" a loop nest for collapsing by sinking iteration variables to the innermost `fir.do_loop` op in the nest. PR stack: - #126026 - #127595 - #127633 - #127634 (this PR) - #127635
1 parent 7d441d9 commit ef56b53

File tree

3 files changed

+259
-54
lines changed

3 files changed

+259
-54
lines changed

flang/docs/DoConcurrentConversionToOpenMP.md

+29
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,35 @@ omp.parallel {
173173

174174
<!-- TODO -->
175175

176+
### Multi-range loops
177+
178+
The pass currently supports multi-range loops as well. Given the following
179+
example:
180+
181+
```fortran
182+
do concurrent(i=1:n, j=1:m)
183+
a(i,j) = i * j
184+
end do
185+
```
186+
187+
The generated `omp.loop_nest` operation looks like:
188+
189+
```
190+
omp.loop_nest (%arg0, %arg1)
191+
: index = (%17, %19) to (%18, %20)
192+
inclusive step (%c1_2, %c1_4) {
193+
fir.store %arg0 to %private_i#1 : !fir.ref<i32>
194+
fir.store %arg1 to %private_j#1 : !fir.ref<i32>
195+
...
196+
omp.yield
197+
}
198+
```
199+
200+
It is worth noting that we have privatized versions for both iteration
201+
variables: `i` and `j`. These are locally allocated inside the parallel/target
202+
OpenMP region similar to what the single-range example in the previous section
203+
shows.
204+
176205
<!--
177206
More details about current status will be added along with relevant parts of the
178207
implementation in later upstreaming patches.

flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

+158-54
Original file line numberDiff line numberDiff line change
@@ -28,64 +28,80 @@ namespace looputils {
2828
/// Stores info needed about the induction/iteration variable for each `do
2929
/// concurrent` in a loop nest.
3030
struct InductionVariableInfo {
31+
/// Populates both members by scanning the body of \p doLoop (see
/// `populateInfo`).
InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }
32+
3133
/// The operation allocating memory for the iteration variable.
3234
mlir::Operation *iterVarMemDef;
33-
};
35+
/// The operation(s) updating the iteration variable with the current
36+
/// iteration number.
37+
llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;
3438

35-
using LoopNestToIndVarMap =
36-
llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
37-
38-
/// For the \p doLoop parameter, find the operation that declares its iteration
39-
/// variable or allocates memory for it.
40-
///
41-
/// For example, give the following loop:
42-
/// ```
43-
/// ...
44-
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
45-
/// ...
46-
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
47-
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
48-
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
49-
/// ...
50-
/// }
51-
/// ```
52-
///
53-
/// This function returns the `hlfir.declare` op for `%i`.
54-
///
55-
/// Note: The current implementation is dependent on how flang emits loop
56-
/// bodies; which is sufficient for the current simple test/use cases. If this
57-
/// proves to be insufficient, this should be made more generic.
58-
mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
59-
mlir::Value result = nullptr;
60-
61-
// Checks if a StoreOp is updating the memref of the loop's iteration
62-
// variable.
63-
auto isStoringIV = [&](fir::StoreOp storeOp) {
64-
// Direct store into the IV memref.
65-
if (storeOp.getValue() == doLoop.getInductionVar())
66-
return true;
67-
68-
// Indirect store into the IV memref.
69-
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
70-
storeOp.getValue().getDefiningOp())) {
71-
if (convertOp.getOperand() == doLoop.getInductionVar())
39+
private:
40+
/// For the \p doLoop parameter, find the following:
41+
///
42+
/// 1. The operation that declares its iteration variable or allocates memory
43+
/// for it. For example, given the following loop:
44+
/// ```
45+
/// ...
46+
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
47+
/// ...
48+
/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
49+
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
50+
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
51+
/// ...
52+
/// }
53+
/// ```
54+
///
55+
/// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
56+
/// for `%i`.
57+
///
58+
/// 2. The operation(s) that update the loop's iteration variable from its
59+
/// induction variable. For the above example, the `indVarUpdateOps` is
60+
/// populated with the first 2 ops in the loop's body.
61+
///
62+
/// Note: The current implementation is dependent on how flang emits loop
63+
/// bodies; which is sufficient for the current simple test/use cases. If this
64+
/// proves to be insufficient, this should be made more generic.
///
/// An assertion at the end of the function checks that a matching store was
/// found and that the memref it stores to has a defining operation.
65+
void populateInfo(fir::DoLoopOp doLoop) {
66+
mlir::Value result = nullptr;
67+
68+
// Checks if a StoreOp is updating the memref of the loop's iteration
69+
// variable. As a side effect, a match records the op(s) performing the
// update in `indVarUpdateOps`.
70+
auto isStoringIV = [&](fir::StoreOp storeOp) {
71+
// Direct store into the IV memref.
72+
if (storeOp.getValue() == doLoop.getInductionVar()) {
73+
indVarUpdateOps.push_back(storeOp);
7274
return true;
73-
}
74-
75-
return false;
76-
};
75+
}
7776

78-
for (mlir::Operation &op : doLoop) {
79-
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
80-
if (isStoringIV(storeOp)) {
81-
result = storeOp.getMemref();
82-
break;
77+
// Indirect store into the IV memref.
// NOTE(review): `getDefiningOp()` presumably returns null when the stored
// value is a block argument other than the IV, and `mlir::dyn_cast` on a
// null pointer would then assert -- confirm, and consider
// `getDefiningOp<fir::ConvertOp>()` instead.
78+
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
79+
storeOp.getValue().getDefiningOp())) {
80+
if (convertOp.getOperand() == doLoop.getInductionVar()) {
// Record the convert before the store that consumes it.
81+
indVarUpdateOps.push_back(convertOp);
82+
indVarUpdateOps.push_back(storeOp);
83+
return true;
84+
}
8385
}
86+
87+
return false;
88+
};
89+
90+
// Only the first matching store is used; the scan stops as soon as it is
// found.
for (mlir::Operation &op : doLoop) {
91+
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
92+
if (isStoringIV(storeOp)) {
93+
result = storeOp.getMemref();
94+
break;
95+
}
96+
}
97+
98+
assert(result != nullptr && result.getDefiningOp() != nullptr);
99+
iterVarMemDef = result.getDefiningOp();
84100
}
101+
};
85102

86-
assert(result != nullptr && result.getDefiningOp() != nullptr);
87-
return result.getDefiningOp();
88-
}
103+
/// Maps each `fir.do_loop` of a nest to the info collected about its
/// iteration variable. `llvm::MapVector` iterates in insertion order;
/// presumably loops are inserted outermost first -- confirm against
/// `collectLoopNest`.
using LoopNestToIndVarMap =
104+
llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
89105

90106
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
91107
/// there are no operations in \p outerloop's body other than:
@@ -181,10 +197,7 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
181197
assert(currentLoop.getUnordered());
182198

183199
while (true) {
184-
loopNest.insert(
185-
{currentLoop,
186-
InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});
187-
200+
loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});
188201
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
189202

190203
for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())
@@ -210,6 +223,96 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
210223

211224
return mlir::success();
212225
}
226+
227+
/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
228+
/// particular, this function would take this input IR:
229+
/// ```
230+
/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
231+
/// fir.store %i_iv to %i#1 : !fir.ref<i32>
232+
/// %j_lb = arith.constant 1 : i32
233+
/// %j_ub = arith.constant 10 : i32
234+
/// %j_step = arith.constant 1 : index
235+
///
236+
/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
237+
/// fir.store %j_iv to %j#1 : !fir.ref<i32>
238+
/// ...
239+
/// }
240+
/// }
241+
/// ```
242+
///
243+
/// into the following form (using generic op form since the result is
244+
/// technically an invalid `fir.do_loop` op):
245+
///
246+
/// ```
247+
/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
248+
/// ^bb0(%i_iv: index):
249+
/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
250+
/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
251+
/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
252+
///
253+
/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
254+
/// ^bb0(%new_i_iv: index, %new_j_iv: index):
255+
/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
256+
/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
257+
/// ...
258+
/// })
259+
/// ```
260+
///
261+
/// What happened to the loop nest is the following:
262+
///
263+
/// * the innermost loop's entry block was updated from having one argument to
264+
/// having `n` arguments where `n` is the number of loops in the nest,
265+
///
266+
/// * the outer loop(s)' ops that update the IVs were sunk inside the innermost
267+
/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
268+
///
269+
/// * the innermost loop's entry block's arguments were mapped in order from the
270+
/// outermost to the innermost IV.
271+
///
272+
/// With this IR change, we can directly inline the innermost loop's region into
273+
/// the newly generated `omp.loop_nest` op.
274+
///
275+
/// Note that this function has a pre-condition that \p loopNest consists of
276+
/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
277+
/// loops except for the ops to setup the inner loop's LB, UB, and step. These
278+
/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
///
/// \param rewriter Not referenced by the current body; ops are moved and uses
///        are replaced directly on the IR.
/// \param loopNest The loops of the nest together with their IV info. The
///        last entry is treated as the innermost loop. Nests with at most one
///        loop are left untouched.
279+
void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
280+
looputils::LoopNestToIndVarMap &loopNest) {
281+
// A nest of zero or one loop has no outer IVs to sink.
if (loopNest.size() <= 1)
282+
return;
283+
284+
fir::DoLoopOp innermostLoop = loopNest.back().first;
285+
// Insertion point: the sunk ops are placed before the op that is currently
// first in the innermost loop's entry block.
mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front();
286+
287+
// Type and location of each outer loop's IV, in nest order; used below to
// create the extra block arguments.
llvm::SmallVector<mlir::Type> argTypes;
288+
llvm::SmallVector<mlir::Location> argLocs;
289+
290+
for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
291+
// Sink the IV update ops to the innermost loop. We need to do this for all
292+
// loops except for the innermost one, hence the `drop_end` usage above.
293+
for (mlir::Operation *op : indVarInfo.indVarUpdateOps)
294+
op->moveBefore(&innermostFirstOp);
295+
296+
argTypes.push_back(doLoop.getInductionVar().getType());
297+
argLocs.push_back(doLoop.getInductionVar().getLoc());
298+
}
299+
300+
mlir::Region &innermmostRegion = innermostLoop.getRegion();
301+
// Extend the innermost entry block with arguments to represent the outer IVs.
302+
innermmostRegion.addArguments(argTypes, argLocs);
303+
304+
// Distance from the end of the argument list: 1 selects the last argument.
unsigned idx = 1;
305+
// In reverse, remap the IVs of the loop nest from the old values to the new
306+
// ones. We do that in reverse since the first argument before this loop is
307+
// the old IV for the innermost loop. Therefore, we want to replace it first
308+
// before the old value (1st argument in the block) is remapped to be the IV
309+
// of the outermost loop in the nest.
310+
for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
311+
doLoop.getInductionVar().replaceAllUsesWith(
312+
innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx));
313+
++idx;
314+
}
315+
}
213316
} // namespace looputils
214317

215318
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
@@ -236,6 +339,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
236339
"Some `do concurent` loops are not perfectly-nested. "
237340
"These will be serialized.");
238341

342+
looputils::sinkLoopIVArgs(rewriter, loopNest);
239343
mlir::IRMapping mapper;
240344
genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
241345
mlir::omp::LoopNestOperands loopNestClauseOps;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
2+
3+
! RUN: split-file %s %t
4+
5+
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \
6+
! RUN: | FileCheck %s
7+
8+
!--- multi_range.f90
9+
program main
10+
integer, parameter :: n = 20
11+
integer, parameter :: m = 40
12+
integer, parameter :: l = 60
13+
integer :: a(n, m, l)
14+
15+
do concurrent(i=3:n, j=5:m, k=7:l)
16+
a(i,j,k) = i * j + k
17+
end do
18+
end
19+
20+
! CHECK: func.func @_QQmain
21+
22+
! CHECK: %[[C3:.*]] = arith.constant 3 : i32
23+
! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index
24+
! CHECK: %[[C20:.*]] = arith.constant 20 : i32
25+
! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index
26+
! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index
27+
28+
! CHECK: %[[C5:.*]] = arith.constant 5 : i32
29+
! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index
30+
! CHECK: %[[C40:.*]] = arith.constant 40 : i32
31+
! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index
32+
! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index
33+
34+
! CHECK: %[[C7:.*]] = arith.constant 7 : i32
35+
! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index
36+
! CHECK: %[[C60:.*]] = arith.constant 60 : i32
37+
! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index
38+
! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index
39+
40+
! CHECK: omp.parallel {
41+
42+
! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
43+
! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"}
44+
45+
! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"}
46+
! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"}
47+
48+
! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
49+
! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
50+
51+
! CHECK: omp.wsloop {
52+
! CHECK-NEXT: omp.loop_nest
53+
! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
54+
! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
55+
! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
56+
! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
57+
58+
! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
59+
! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#0
60+
61+
! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]]
62+
! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#0
63+
64+
! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]]
65+
! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#0
66+
67+
! CHECK: omp.yield
68+
! CHECK-NEXT: }
69+
! CHECK-NEXT: }
70+
71+
! CHECK-NEXT: omp.terminator
72+
! CHECK-NEXT: }

0 commit comments

Comments
 (0)