Skip to content

Commit 4c63b2a

Browse files
committed
[flang][OpenMP] Extend do concurrent mapping to multi-range loops
Adds support for converting multi-range loops to OpenMP (on the host only for now). The changes here "prepare" a loop nest for collapsing by sinking iteration variables to the innermost `fir.do_loop` op in the nest.
1 parent 06bf9bc commit 4c63b2a

File tree

3 files changed

+240
-2
lines changed

3 files changed

+240
-2
lines changed

flang/docs/DoConcurrentConversionToOpenMP.md

+29
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,35 @@ omp.parallel {
173173

174174
<!-- TODO -->
175175

176+
### Multi-range loops
177+
178+
The pass currently supports multi-range loops as well. Given the following
179+
example:
180+
181+
```fortran
182+
do concurrent(i=1:n, j=1:m)
183+
a(i,j) = i * j
184+
end do
185+
```
186+
187+
The generated `omp.loop_nest` operation looks like:
188+
189+
```
190+
omp.loop_nest (%arg0, %arg1)
191+
: index = (%17, %19) to (%18, %20)
192+
inclusive step (%c1_2, %c1_4) {
193+
fir.store %arg0 to %private_i#1 : !fir.ref<i32>
194+
fir.store %arg1 to %private_j#1 : !fir.ref<i32>
195+
...
196+
omp.yield
197+
}
198+
```
199+
200+
It is worth noting that we have privatized versions for both iteration
201+
variables: `i` and `j`. These are locally allocated inside the parallel/target
202+
OpenMP region similar to what the single-range example in the previous section
203+
shows.
204+
176205
<!--
177206
More details about current status will be added along with relevant parts of the
178207
implementation in later upstreaming patches.

flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp

+139-2
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@ namespace flangomp {
2626
namespace {
2727
namespace looputils {
2828
/// Stores info needed about the induction/iteration variable for each `do
29-
/// concurrent` in a loop nest. This includes only for now:
29+
/// concurrent` in a loop nest. This includes:
3030
/// * the operation allocating memory for iteration variable,
31+
/// * the operation(s) updating the iteration variable with the current
32+
/// iteration number.
3133
struct InductionVariableInfo {
3234
/// The operation allocating memory for the iteration variable.
mlir::Operation *iterVarMemDef;
35+
/// The operation(s) updating the iteration variable with the current
/// iteration number, kept in insertion order without duplicates.
llvm::SetVector<mlir::Operation *> indVarUpdateOps;
3336
};
3437

3538
using LoopNestToIndVarMap =
@@ -102,6 +105,47 @@ mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
102105
return result.getDefiningOp();
103106
}
104107

108+
/// Collects the op(s) responsible for updating a loop's iteration variable with
109+
/// the current iteration number. For example, for the input IR:
110+
/// ```
111+
/// %i = fir.alloca i32 {bindc_name = "i"}
112+
/// %i_decl:2 = hlfir.declare %i ...
113+
/// ...
114+
/// fir.do_loop %i_iv = %lb to %ub step %step unordered {
115+
/// %1 = fir.convert %i_iv : (index) -> i32
116+
/// fir.store %1 to %i_decl#1 : !fir.ref<i32>
117+
/// ...
118+
/// }
119+
/// ```
120+
/// this function would return the first 2 ops in the `fir.do_loop`'s region.
121+
llvm::SetVector<mlir::Operation *>
122+
extractIndVarUpdateOps(fir::DoLoopOp doLoop) {
123+
mlir::Value indVar = doLoop.getInductionVar();
124+
llvm::SetVector<mlir::Operation *> indVarUpdateOps;
125+
126+
llvm::SmallVector<mlir::Value> toProcess;
127+
toProcess.push_back(indVar);
128+
129+
llvm::DenseSet<mlir::Value> done;
130+
131+
while (!toProcess.empty()) {
132+
mlir::Value val = toProcess.back();
133+
toProcess.pop_back();
134+
135+
if (!done.insert(val).second)
136+
continue;
137+
138+
for (mlir::Operation *user : val.getUsers()) {
139+
indVarUpdateOps.insert(user);
140+
141+
for (mlir::Value result : user->getResults())
142+
toProcess.push_back(result);
143+
}
144+
}
145+
146+
return std::move(indVarUpdateOps);
147+
}
148+
105149
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
106150
/// there are no operations in \p outerloop's body other than:
107151
///
@@ -175,7 +219,9 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
175219
while (true) {
176220
loopNest.try_emplace(
177221
currentLoop,
178-
InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)});
222+
InductionVariableInfo{
223+
findLoopIterationVarMemDecl(currentLoop),
224+
std::move(looputils::extractIndVarUpdateOps(currentLoop))});
179225

180226
auto directlyNestedLoops = currentLoop.getRegion().getOps<fir::DoLoopOp>();
181227
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
@@ -200,6 +246,96 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
200246

201247
return mlir::success();
202248
}
249+
250+
/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
251+
/// particular, this function would take this input IR:
252+
/// ```
253+
/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
254+
/// fir.store %i_iv to %i#1 : !fir.ref<i32>
255+
/// %j_lb = arith.constant 1 : i32
256+
/// %j_ub = arith.constant 10 : i32
257+
/// %j_step = arith.constant 1 : index
258+
///
259+
/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
260+
/// fir.store %j_iv to %j#1 : !fir.ref<i32>
261+
/// ...
262+
/// }
263+
/// }
264+
/// ```
265+
///
266+
/// into the following form (using generic op form since the result is
267+
/// technically an invalid `fir.do_loop` op:
268+
///
269+
/// ```
270+
/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
271+
/// ^bb0(%i_iv: index):
272+
/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
273+
/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
274+
/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
275+
///
276+
/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
277+
/// ^bb0(%new_i_iv: index, %new_j_iv: index):
278+
/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
279+
/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
280+
/// ...
281+
/// })
282+
/// ```
283+
///
284+
/// What happened to the loop nest is the following:
285+
///
286+
/// * the innermost loop's entry block was updated from having one operand to
287+
/// having `n` operands where `n` is the number of loops in the nest,
288+
///
289+
/// * the outer loop(s)' ops that update the IVs were sank inside the innermost
290+
/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
291+
///
292+
/// * the innermost loop's entry block's arguments were mapped in order from the
293+
/// outermost to the innermost IV.
294+
///
295+
/// With this IR change, we can directly inline the innermost loop's region into
296+
/// the newly generated `omp.loop_nest` op.
297+
///
298+
/// Note that this function has a pre-condition that \p loopNest consists of
299+
/// perfectly nested loops; i.e. there are no in-between ops between 2 nested
300+
/// loops except for the ops to setup the inner loop's LB, UB, and step. These
301+
/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
302+
void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
303+
looputils::LoopNestToIndVarMap &loopNest) {
304+
if (loopNest.size() <= 1)
305+
return;
306+
307+
fir::DoLoopOp innermostLoop = loopNest.back().first;
308+
mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front();
309+
310+
llvm::SmallVector<mlir::Type> argTypes;
311+
llvm::SmallVector<mlir::Location> argLocs;
312+
313+
for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
314+
// Sink the IV update ops to the innermost loop. We need to do for all loops
315+
// except for the innermost one, hence the `drop_end` usage above.
316+
for (mlir::Operation *op : indVarInfo.indVarUpdateOps)
317+
op->moveBefore(&innermostFirstOp);
318+
319+
argTypes.push_back(doLoop.getInductionVar().getType());
320+
argLocs.push_back(doLoop.getInductionVar().getLoc());
321+
}
322+
323+
mlir::Region &innermmostRegion = innermostLoop.getRegion();
324+
// Extend the innermost entry block with arguments to represent the outer IVs.
325+
innermmostRegion.addArguments(argTypes, argLocs);
326+
327+
unsigned idx = 1;
328+
// In reverse, remap the IVs of the loop nest from the old values to the new
329+
// ones. We do that in reverse since the first argument before this loop is
330+
// the old IV for the innermost loop. Therefore, we want to replace it first
331+
// before the old value (1st argument in the block) is remapped to be the IV
332+
// of the outermost loop in the nest.
333+
for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
334+
doLoop.getInductionVar().replaceAllUsesWith(
335+
innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx));
336+
++idx;
337+
}
338+
}
203339
} // namespace looputils
204340

205341
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
@@ -222,6 +358,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
222358
"Some `do concurent` loops are not perfectly-nested. "
223359
"These will be serialzied.");
224360

361+
looputils::sinkLoopIVArgs(rewriter, loopNest);
225362
mlir::IRMapping mapper;
226363
genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
227364
mlir::omp::LoopNestOperands loopNestClauseOps;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
2+
3+
! RUN: split-file %s %t
4+
5+
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \
6+
! RUN: | FileCheck %s
7+
8+
!--- multi_range.f90
9+
! Test input: one `do concurrent` construct with three iteration ranges that
! writes into a rank-3 array.
program main
10+
integer, parameter :: n = 20
11+
integer, parameter :: m = 40
12+
integer, parameter :: l = 60
13+
integer :: a(n, m, l)
14+
15+
! The ranges start at 3, 5 and 7 and end at n, m and l respectively; i, j and
! k have no explicit declarations in this program.
do concurrent(i=3:n, j=5:m, k=7:l)
16+
a(i,j,k) = i * j + k
17+
end do
18+
end
19+
20+
! CHECK: func.func @_QQmain
21+
22+
! CHECK: %[[C3:.*]] = arith.constant 3 : i32
23+
! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index
24+
! CHECK: %[[C20:.*]] = arith.constant 20 : i32
25+
! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index
26+
! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index
27+
28+
! CHECK: %[[C5:.*]] = arith.constant 5 : i32
29+
! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index
30+
! CHECK: %[[C40:.*]] = arith.constant 40 : i32
31+
! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index
32+
! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index
33+
34+
! CHECK: %[[C7:.*]] = arith.constant 7 : i32
35+
! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index
36+
! CHECK: %[[C60:.*]] = arith.constant 60 : i32
37+
! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index
38+
! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index
39+
40+
! CHECK: omp.parallel {
41+
42+
! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
43+
! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"}
44+
45+
! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"}
46+
! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"}
47+
48+
! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
49+
! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
50+
51+
! CHECK: omp.wsloop {
52+
! CHECK-NEXT: omp.loop_nest
53+
! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
54+
! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
55+
! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
56+
! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
57+
58+
! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
59+
! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1
60+
61+
! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]]
62+
! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#1
63+
64+
! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]]
65+
! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#1
66+
67+
! CHECK: omp.yield
68+
! CHECK-NEXT: }
69+
! CHECK-NEXT: }
70+
71+
! CHECK-NEXT: omp.terminator
72+
! CHECK-NEXT: }

0 commit comments

Comments
 (0)