Skip to content

Commit b1c4248

Browse files
committed
[LoopInterchange] Improve profitability check for vectorization
The vectorization profitability has a process to check whether a given loop can be vectorized or not. Since the process is conservative, a loop that can be vectorized may be deemed not to be possible. This can trigger unnecessary exchanges. This patch improves the profitability decision by mitigating such misjudgments. Before this patch, we considered a loop to be vectorizable only when there are no loop carried dependencies with the IV of the loop. However, a loop carried dependency doesn't prevent vectorization if the distance is positive. This patch makes the vectorization check more accurate by allowing a loop with the positive dependency. Note that it is difficult to make a complete decision whether a loop can be vectorized or not. To achieve this, we must check the vector width and the distance of dependency.
1 parent 6333fa5 commit b1c4248

File tree

2 files changed

+106
-30
lines changed

2 files changed

+106
-30
lines changed

llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Lines changed: 103 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
#include "llvm/ADT/SmallSet.h"
1818
#include "llvm/ADT/SmallVector.h"
1919
#include "llvm/ADT/Statistic.h"
20+
#include "llvm/ADT/StringMap.h"
2021
#include "llvm/ADT/StringRef.h"
21-
#include "llvm/ADT/StringSet.h"
2222
#include "llvm/Analysis/DependenceAnalysis.h"
2323
#include "llvm/Analysis/LoopCacheAnalysis.h"
2424
#include "llvm/Analysis/LoopInfo.h"
@@ -80,6 +80,21 @@ enum class RuleTy {
8080
ForVectorization,
8181
};
8282

83+
/// Store the information about if corresponding direction vector was negated
84+
/// by normalization or not. This is necessary to restore the original one from
85+
/// a row of a dependency matrix, because we only manage normalized direction
86+
/// vectors and duplicate vectors are eliminated. So there may be both original
87+
/// and negated vectors for a single entry (a row of dependency matrix). E.g.,
88+
/// if there are two direction vectors `[< =]` and `[> =]`, the later one will
89+
/// be converted to the same as former one by normalization, so only `[< =]`
90+
/// would be retained in the final result.
91+
struct NegatedStatus {
92+
bool Original = false;
93+
bool Negated = false;
94+
95+
bool isNonNegativeDir(char Dir) const;
96+
};
97+
8398
} // end anonymous namespace
8499

85100
// Minimum loop depth supported.
@@ -126,9 +141,10 @@ static void printDepMatrix(CharMatrix &DepMatrix) {
126141
}
127142
#endif
128143

129-
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
130-
Loop *L, DependenceInfo *DI,
131-
ScalarEvolution *SE,
144+
static bool populateDependencyMatrix(CharMatrix &DepMatrix,
145+
std::vector<NegatedStatus> &NegStatusVec,
146+
unsigned Level, Loop *L,
147+
DependenceInfo *DI, ScalarEvolution *SE,
132148
OptimizationRemarkEmitter *ORE) {
133149
using ValueVector = SmallVector<Value *, 16>;
134150

@@ -167,7 +183,9 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
167183
return false;
168184
}
169185
ValueVector::iterator I, IE, J, JE;
170-
StringSet<> Seen;
186+
187+
// Manage all found direction vectors. and map it to the index of DepMatrix.
188+
StringMap<unsigned> Seen;
171189

172190
for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
173191
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -182,7 +200,8 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
182200
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
183201
// If the direction vector is negative, normalize it to
184202
// make it non-negative.
185-
if (D->normalize(SE))
203+
bool Normalized = D->normalize(SE);
204+
if (Normalized)
186205
LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
187206
LLVM_DEBUG(StringRef DepType =
188207
D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
@@ -214,8 +233,17 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
214233
}
215234

216235
// Make sure we only add unique entries to the dependency matrix.
217-
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
236+
unsigned Index = DepMatrix.size();
237+
auto [Ite, Inserted] =
238+
Seen.try_emplace(StringRef(Dep.data(), Dep.size()), Index);
239+
if (Inserted) {
218240
DepMatrix.push_back(Dep);
241+
NegStatusVec.push_back(NegatedStatus{});
242+
} else
243+
Index = Ite->second;
244+
245+
NegatedStatus &Status = NegStatusVec[Index];
246+
(Normalized ? Status.Negated : Status.Original) = true;
219247
}
220248
}
221249
}
@@ -400,6 +428,7 @@ class LoopInterchangeProfitability {
400428
bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
401429
unsigned InnerLoopId, unsigned OuterLoopId,
402430
CharMatrix &DepMatrix,
431+
const std::vector<NegatedStatus> &NegStatusVec,
403432
const DenseMap<const Loop *, unsigned> &CostMap,
404433
std::unique_ptr<CacheCost> &CC);
405434

@@ -409,9 +438,10 @@ class LoopInterchangeProfitability {
409438
const DenseMap<const Loop *, unsigned> &CostMap,
410439
std::unique_ptr<CacheCost> &CC);
411440
std::optional<bool> isProfitablePerInstrOrderCost();
412-
std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
413-
unsigned OuterLoopId,
414-
CharMatrix &DepMatrix);
441+
std::optional<bool>
442+
isProfitableForVectorization(unsigned InnerLoopId, unsigned OuterLoopId,
443+
CharMatrix &DepMatrix,
444+
const std::vector<NegatedStatus> &NegStatusVec);
415445
Loop *OuterLoop;
416446
Loop *InnerLoop;
417447

@@ -503,8 +533,9 @@ struct LoopInterchange {
503533
<< "\n");
504534

505535
CharMatrix DependencyMatrix;
536+
std::vector<NegatedStatus> NegStatusVec;
506537
Loop *OuterMostLoop = *(LoopList.begin());
507-
if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
538+
if (!populateDependencyMatrix(DependencyMatrix, NegStatusVec, LoopNestDepth,
508539
OuterMostLoop, DI, SE, ORE)) {
509540
LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
510541
return false;
@@ -543,8 +574,8 @@ struct LoopInterchange {
543574
for (unsigned j = SelecLoopId; j > 0; j--) {
544575
bool ChangedPerIter = false;
545576
for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
546-
bool Interchanged =
547-
processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
577+
bool Interchanged = processLoop(LoopList, i, i - 1, DependencyMatrix,
578+
NegStatusVec, CostMap);
548579
ChangedPerIter |= Interchanged;
549580
Changed |= Interchanged;
550581
}
@@ -559,6 +590,8 @@ struct LoopInterchange {
559590
bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
560591
unsigned OuterLoopId,
561592
std::vector<std::vector<char>> &DependencyMatrix,
593+
594+
const std::vector<NegatedStatus> &NegStatusVec,
562595
const DenseMap<const Loop *, unsigned> &CostMap) {
563596
Loop *OuterLoop = LoopList[OuterLoopId];
564597
Loop *InnerLoop = LoopList[InnerLoopId];
@@ -572,7 +605,7 @@ struct LoopInterchange {
572605
LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
573606
LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
574607
if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
575-
DependencyMatrix, CostMap, CC)) {
608+
DependencyMatrix, NegStatusVec, CostMap, CC)) {
576609
LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
577610
return false;
578611
}
@@ -1197,27 +1230,71 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
11971230
return std::nullopt;
11981231
}
11991232

1233+
static char flipDirection(char Dir) {
1234+
switch (Dir) {
1235+
case '<':
1236+
return '>';
1237+
case '>':
1238+
return '<';
1239+
case '=':
1240+
case 'I':
1241+
case '*':
1242+
return Dir;
1243+
default:
1244+
llvm_unreachable("Unknown direction");
1245+
}
1246+
}
1247+
1248+
/// Ensure that there are no negative direction dependencies corresponding to \p
1249+
/// Dir.
1250+
bool NegatedStatus::isNonNegativeDir(char Dir) const {
1251+
assert((Original || Negated) && "Cannot restore the original direction");
1252+
1253+
// If both flag is true, it means that there is both as-is and negated
1254+
// direction. In this case only `=` or `I` don't have negative direction
1255+
// dependency.
1256+
if (Original && Negated)
1257+
return Dir == '=' || Dir == 'I';
1258+
1259+
char Restored = Negated ? flipDirection(Dir) : Dir;
1260+
return Restored == '=' || Restored == 'I' || Restored == '<';
1261+
}
1262+
12001263
/// Return true if we can vectorize the loop specified by \p LoopId.
1201-
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1264+
static bool canVectorize(const CharMatrix &DepMatrix,
1265+
const std::vector<NegatedStatus> &NegStatusVec,
1266+
unsigned LoopId) {
1267+
// The loop can be vectorized if there are no negative dependencies. Consider
1268+
// the dependency of `j` in the following example.
1269+
//
1270+
// Positive: ... = A[i][j] Negative: ... = A[i][j-1]
1271+
// A[i][j-1] = ... A[i][j] = ...
1272+
//
1273+
// In the right case, vectorizing the loop can change the loaded value from
1274+
// `A[i][j-1]`. At the moment we don't take into account the distance of the
1275+
// dependency and vector width.
1276+
// TODO: Considering the dependency distance and the vector width can give a
1277+
// more accurate result. For example, the following loop can be vectorized if
1278+
// the vector width is less than or equal to 4 x sizeof(A[0][0]).
12021279
for (unsigned I = 0; I != DepMatrix.size(); I++) {
12031280
char Dir = DepMatrix[I][LoopId];
1204-
if (Dir != 'I' && Dir != '=')
1281+
if (!NegStatusVec[I].isNonNegativeDir(Dir))
12051282
return false;
12061283
}
12071284
return true;
12081285
}
12091286

12101287
std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1211-
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1212-
// If the outer loop is not loop independent it is not profitable to move
1213-
// this to inner position, since doing so would not enable inner loop
1214-
// parallelism.
1215-
if (!canVectorize(DepMatrix, OuterLoopId))
1288+
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix,
1289+
const std::vector<NegatedStatus> &NegStatusVec) {
1290+
// If the outer loop cannot be vectorized, it is not profitable to move this
1291+
// to inner position.
1292+
if (!canVectorize(DepMatrix, NegStatusVec, OuterLoopId))
12161293
return false;
12171294

1218-
// If inner loop has dependence and outer loop is loop independent then it is
1295+
// If inner loop cannot be vectorized and outer loop can be then it is
12191296
// profitable to interchange to enable inner loop parallelism.
1220-
if (!canVectorize(DepMatrix, InnerLoopId))
1297+
if (!canVectorize(DepMatrix, NegStatusVec, InnerLoopId))
12211298
return true;
12221299

12231300
// If both the inner and the outer loop can be vectorized, it is necessary to
@@ -1231,6 +1308,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
12311308
bool LoopInterchangeProfitability::isProfitable(
12321309
const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
12331310
unsigned OuterLoopId, CharMatrix &DepMatrix,
1311+
const std::vector<NegatedStatus> &NegStatusVec,
12341312
const DenseMap<const Loop *, unsigned> &CostMap,
12351313
std::unique_ptr<CacheCost> &CC) {
12361314
// isProfitable() is structured to avoid endless loop interchange. If the
@@ -1252,8 +1330,8 @@ bool LoopInterchangeProfitability::isProfitable(
12521330
shouldInterchange = isProfitablePerInstrOrderCost();
12531331
break;
12541332
case RuleTy::ForVectorization:
1255-
shouldInterchange =
1256-
isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1333+
shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId,
1334+
DepMatrix, NegStatusVec);
12571335
break;
12581336
}
12591337

llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,15 +64,13 @@ exit:
6464
; for (int j = 1; j < 256; j++)
6565
; A[i][j-1] = A[i][j] + B[i][j];
6666
;
67-
; FIXME: These loops are exchanged at this time due to the problem in
68-
; profitability heuristic calculation for vectorization.
6967

70-
; CHECK: --- !Passed
68+
; CHECK: --- !Missed
7169
; CHECK-NEXT: Pass: loop-interchange
72-
; CHECK-NEXT: Name: Interchanged
70+
; CHECK-NEXT: Name: InterchangeNotProfitable
7371
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
7472
; CHECK-NEXT: Args:
75-
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
73+
; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
7674
define void @interchange_unnecesasry_for_vectorization() {
7775
entry:
7876
br label %for.i.header

0 commit comments

Comments
 (0)