Skip to content

Commit 090ea42

Browse files
committed
[flang][OpenMP] Extend do concurrent mapping to multi-range loops
Adds support for converting multi-range loops to OpenMP (on the host only for now). The changes here "prepare" a loop nest for collapsing by sinking iteration variables to the innermost `fir.do_loop` op in the nest.
1 parent b50be98 commit 090ea42

File tree

3 files changed

+239
-1
lines changed

3 files changed

+239
-1
lines changed

flang/docs/DoConcurrentConversionToOpenMP.md

+29
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,35 @@ omp.parallel {
173173

174174
<!-- TODO -->
175175

176+
### Multi-range loops
177+
178+
The pass currently supports multi-range loops as well. Given the following
179+
example:
180+
181+
```fortran
182+
do concurrent(i=1:n, j=1:m)
183+
a(i,j) = i * j
184+
end do
185+
```
186+
187+
The generated `omp.loop_nest` operation looks like:
188+
189+
```
190+
omp.loop_nest (%arg0, %arg1)
191+
: index = (%17, %19) to (%18, %20)
192+
inclusive step (%c1_2, %c1_4) {
193+
fir.store %arg0 to %private_i#1 : !fir.ref<i32>
194+
fir.store %arg1 to %private_j#1 : !fir.ref<i32>
195+
...
196+
omp.yield
197+
}
198+
```
199+
200+
It is worth noting that we have privatized versions for both iteration
201+
variables: `i` and `j`. These are locally allocated inside the parallel/target
202+
OpenMP region similar to what the single-range example in the previous section
203+
shows.
204+
176205
<!--
177206
More details about current status will be added along with relevant parts of the
178207
implementation in later upstreaming patches.

flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

+138-1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ namespace looputils {
3030
/// Bookkeeping recorded for a single loop's iteration variable.
struct InductionVariableInfo {
3131
/// The operation allocating memory for the iteration variable.
3232
mlir::Operation *iterVarMemDef;
33+
/// The operation(s) updating the iteration variable with the current
34+
/// iteration number, kept in insertion order without duplicates.
35+
llvm::SetVector<mlir::Operation *> indVarUpdateOps;
3336
};
3437

3538
using LoopNestToIndVarMap =
@@ -70,6 +73,47 @@ mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
7073
return result.getDefiningOp();
7174
}
7275

76+
/// Collects the op(s) responsible for updating a loop's iteration variable with
77+
/// the current iteration number. For example, for the input IR:
78+
/// ```
79+
/// %i = fir.alloca i32 {bindc_name = "i"}
80+
/// %i_decl:2 = hlfir.declare %i ...
81+
/// ...
82+
/// fir.do_loop %i_iv = %lb to %ub step %step unordered {
83+
/// %1 = fir.convert %i_iv : (index) -> i32
84+
/// fir.store %1 to %i_decl#1 : !fir.ref<i32>
85+
/// ...
86+
/// }
87+
/// ```
88+
/// this function would return the first 2 ops in the `fir.do_loop`'s region.
89+
llvm::SetVector<mlir::Operation *>
90+
extractIndVarUpdateOps(fir::DoLoopOp doLoop) {
91+
mlir::Value indVar = doLoop.getInductionVar();
92+
llvm::SetVector<mlir::Operation *> indVarUpdateOps;
93+
94+
llvm::SmallVector<mlir::Value> toProcess;
95+
toProcess.push_back(indVar);
96+
97+
llvm::DenseSet<mlir::Value> done;
98+
99+
while (!toProcess.empty()) {
100+
mlir::Value val = toProcess.back();
101+
toProcess.pop_back();
102+
103+
if (!done.insert(val).second)
104+
continue;
105+
106+
for (mlir::Operation *user : val.getUsers()) {
107+
indVarUpdateOps.insert(user);
108+
109+
for (mlir::Value result : user->getResults())
110+
toProcess.push_back(result);
111+
}
112+
}
113+
114+
return std::move(indVarUpdateOps);
115+
}
116+
73117
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
74118
/// there are no operations in \p outerloop's body other than:
75119
///
@@ -166,7 +210,9 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
166210
while (true) {
167211
loopNest.insert(
168212
{currentLoop,
169-
InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});
213+
InductionVariableInfo{
214+
findLoopIterationVarMemDecl(currentLoop),
215+
std::move(looputils::extractIndVarUpdateOps(currentLoop))}});
170216

171217
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
172218

@@ -193,6 +239,96 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
193239

194240
return mlir::success();
195241
}
242+
243+
/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
244+
/// particular, this function would take this input IR:
245+
/// ```
246+
/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
247+
/// fir.store %i_iv to %i#1 : !fir.ref<i32>
248+
/// %j_lb = arith.constant 1 : i32
249+
/// %j_ub = arith.constant 10 : i32
250+
/// %j_step = arith.constant 1 : index
251+
///
252+
/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
253+
/// fir.store %j_iv to %j#1 : !fir.ref<i32>
254+
/// ...
255+
/// }
256+
/// }
257+
/// ```
258+
///
259+
/// into the following form (using generic op form since the result is
260+
/// technically an invalid `fir.do_loop` op):
261+
///
262+
/// ```
263+
/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
264+
/// ^bb0(%i_iv: index):
265+
/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
266+
/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
267+
/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
268+
///
269+
/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
270+
/// ^bb0(%new_i_iv: index, %new_j_iv: index):
271+
/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
272+
/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
273+
/// ...
274+
/// })
275+
/// ```
276+
///
277+
/// What happened to the loop nest is the following:
278+
///
279+
/// * the innermost loop's entry block was updated from having one operand to
280+
/// having `n` operands where `n` is the number of loops in the nest,
281+
///
282+
/// * the outer loop(s)' ops that update the IVs were sunk inside the innermost
283+
/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
284+
///
285+
/// * the innermost loop's entry block's arguments were mapped in order from the
286+
/// outermost to the innermost IV.
287+
///
288+
/// With this IR change, we can directly inline the innermost loop's region into
289+
/// the newly generated `omp.loop_nest` op.
290+
///
291+
/// Note that this function has a pre-condition that \p loopNest consists of
292+
/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
293+
/// loops except for the ops to setup the inner loop's LB, UB, and step. These
294+
/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
295+
void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
                    looputils::LoopNestToIndVarMap &loopNest) {
  // NOTE(review): `rewriter` is not used below; the IR is mutated directly.

  // With at most one loop in the nest there are no outer IVs to sink.
  if (loopNest.size() <= 1)
    return;

  fir::DoLoopOp innermostLoop = loopNest.back().first;
  mlir::Region &innermostRegion = innermostLoop.getRegion();

  // Anchor for the sunk ops. It is captured once, before anything is moved,
  // so every sunk op lands in front of the op that was originally first in the
  // innermost loop and the relative order of the sunk ops is preserved.
  mlir::Operation &sinkAnchor = innermostRegion.front().front();

  llvm::SmallVector<mlir::Type> outerIVTypes;
  llvm::SmallVector<mlir::Location> outerIVLocs;

  // Visit every loop but the innermost one (hence `drop_end`): sink its IV
  // update ops into the innermost loop and record the type and location of its
  // IV for the block arguments created below.
  for (auto &[outerLoop, ivInfo] : llvm::drop_end(loopNest)) {
    for (mlir::Operation *updateOp : ivInfo.indVarUpdateOps)
      updateOp->moveBefore(&sinkAnchor);

    mlir::Value outerIV = outerLoop.getInductionVar();
    outerIVTypes.push_back(outerIV.getType());
    outerIVLocs.push_back(outerIV.getLoc());
  }

  // Append one entry-block argument to the innermost region per outer IV.
  innermostRegion.addArguments(outerIVTypes, outerIVLocs);

  // Remap the IVs starting from the innermost loop and the last argument,
  // walking outwards. The innermost loop's old IV is the block's first
  // argument, so its uses have to be redirected before that first argument is
  // handed over to the outermost loop's IV on the final iteration.
  unsigned argIdx = innermostRegion.getNumArguments();
  for (auto &[loop, _] : llvm::reverse(loopNest)) {
    --argIdx;
    loop.getInductionVar().replaceAllUsesWith(
        innermostRegion.getArgument(argIdx));
  }
}
196332
} // namespace looputils
197333

198334
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
@@ -219,6 +355,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
219355
"Some `do concurent` loops are not perfectly-nested. "
220356
"These will be serialized.");
221357

358+
looputils::sinkLoopIVArgs(rewriter, loopNest);
222359
mlir::IRMapping mapper;
223360
genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
224361
mlir::omp::LoopNestOperands loopNestClauseOps;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
2+
3+
! RUN: split-file %s %t
4+
5+
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \
6+
! RUN: | FileCheck %s
7+
8+
!--- multi_range.f90
9+
program main
10+
integer, parameter :: n = 20
11+
integer, parameter :: m = 40
12+
integer, parameter :: l = 60
13+
integer :: a(n, m, l)
14+
15+
do concurrent(i=3:n, j=5:m, k=7:l)
16+
a(i,j,k) = i * j + k
17+
end do
18+
end
19+
20+
! CHECK: func.func @_QQmain
21+
22+
! CHECK: %[[C3:.*]] = arith.constant 3 : i32
23+
! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index
24+
! CHECK: %[[C20:.*]] = arith.constant 20 : i32
25+
! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index
26+
! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index
27+
28+
! CHECK: %[[C5:.*]] = arith.constant 5 : i32
29+
! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index
30+
! CHECK: %[[C40:.*]] = arith.constant 40 : i32
31+
! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index
32+
! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index
33+
34+
! CHECK: %[[C7:.*]] = arith.constant 7 : i32
35+
! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index
36+
! CHECK: %[[C60:.*]] = arith.constant 60 : i32
37+
! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index
38+
! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index
39+
40+
! CHECK: omp.parallel {
41+
42+
! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
43+
! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"}
44+
45+
! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"}
46+
! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"}
47+
48+
! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
49+
! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
50+
51+
! CHECK: omp.wsloop {
52+
! CHECK-NEXT: omp.loop_nest
53+
! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
54+
! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
55+
! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
56+
! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
57+
58+
! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
59+
! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1
60+
61+
! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]]
62+
! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#1
63+
64+
! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]]
65+
! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#1
66+
67+
! CHECK: omp.yield
68+
! CHECK-NEXT: }
69+
! CHECK-NEXT: }
70+
71+
! CHECK-NEXT: omp.terminator
72+
! CHECK-NEXT: }

0 commit comments

Comments
 (0)