Skip to content

[LoopInterchange] Improve profitability check for vectorization #133672

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 71 additions & 14 deletions llvm/lib/Transforms/Scalar/LoopInterchange.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
Expand Down Expand Up @@ -119,7 +119,11 @@ static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {

static void printDepMatrix(CharMatrix &DepMatrix) {
for (auto &Row : DepMatrix) {
for (auto D : Row)
ArrayRef<char> RowRef(Row);

// Drop the last element because it is a flag indicating whether the row is
// "lexically forward", which doesn't affect the legality check.
for (auto D : RowRef.drop_back())
LLVM_DEBUG(dbgs() << D << " ");
LLVM_DEBUG(dbgs() << "\n");
}
Expand Down Expand Up @@ -167,7 +171,20 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
return false;
}
ValueVector::iterator I, IE, J, JE;
StringSet<> Seen;

// Manage direction vectors that are already seen. Map each direction vector
// to an index of DepMatrix at which it is stored.
StringMap<unsigned> Seen;

// The i-th element is set iff all dependencies corresponding to the i-th
// direction vector in DepMatrix are "lexically forward". The notion
// "lexically forward" aligns with what is defined in LAA
// (LoopAccessAnalysis).
//
// We deem a dependence lexically forward if we can prove that the
// destination instruction is always executed after the source instruction
// within each iteration.
BitVector IsForwardFlags;

for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
for (J = I, JE = MemInstr.end(); J != JE; ++J) {
Expand All @@ -180,10 +197,22 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
// Track Output, Flow, and Anti dependencies.
if (auto D = DI->depends(Src, Dst)) {
assert(D->isOrdered() && "Expected an output, flow or anti dep.");
bool IsForward = true;

// If Src and Dst are in the same BB, Src is always executed before Dst
// in the same loop iteration. If not, we must check whether one BB
// dominates the other to determine if Src and Dst are executed in this
// order. At the moment, we don't perform such check.
if (Src->getParent() != Dst->getParent())
IsForward = false;

// If the direction vector is negative, normalize it to
// make it non-negative.
if (D->normalize(SE))
bool Normalized = D->normalize(SE);
if (Normalized) {
LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
IsForward = false;
}
LLVM_DEBUG(StringRef DepType =
D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
dbgs() << "Found " << DepType
Expand Down Expand Up @@ -221,13 +250,28 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
Dep.push_back('I');
}

auto [Ite, Inserted] = Seen.try_emplace(
StringRef(Dep.data(), Dep.size()), DepMatrix.size());

// Make sure we only add unique entries to the dependency matrix.
if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
if (Inserted) {
DepMatrix.push_back(Dep);
IsForwardFlags.push_back(true);
}
if (!IsForward)
IsForwardFlags.reset(Ite->second);
}
}
}

assert(DepMatrix.size() == IsForwardFlags.size() &&
"Dependency matrix and IsForwardVec should have the same size.");

// If all dependencies corresponding to a direction vector are forward, encode
// it to '<', otherwise to '*'.
for (unsigned I = 0; I != DepMatrix.size(); I++)
DepMatrix[I].push_back(IsForwardFlags[I] ? '<' : '*');
Comment on lines +270 to +273
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could go as for as to encode it as < and * in another trailing element of the dependence vector.

Do you mean something like this?


return true;
}

Expand Down Expand Up @@ -276,11 +320,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
continue;

// Check if the direction vector is lexicographically positive (or zero)
// for both before/after exchanged.
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
// for both before/after exchanged. Ignore the last element because it
// doesn't affect the legality.
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
return false;
}
return true;
Expand Down Expand Up @@ -1222,21 +1267,33 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
for (unsigned I = 0; I != DepMatrix.size(); I++) {
char Dir = DepMatrix[I][LoopId];
if (Dir != 'I' && Dir != '=')
return false;
char DepType = DepMatrix[I].back();
assert((DepType == '<' || DepType == '*') &&
"Unexpected element in dependency vector");

// There are no loop-carried dependencies.
if (Dir == '=' || Dir == 'I')
continue;

// If both Dir and DepType are '<', it means that the all dependencies are
// lexically forward. Such dependencies don't prevent vectorization.
if (Dir == '<' && DepType == '<')
continue;
Comment on lines +1278 to +1281
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A similar fact holds when Dir is > and all dependencies are lexically backward? (even if this is true, I don't intend to address it in this PR).


// We cannot prove that the loop is vectorizable.
return false;
}
return true;
}

std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
// If the outer loop is not loop independent it is not profitable to move
// this to inner position, since doing so would not enable inner loop
// parallelism.
// If the outer loop cannot be vectorized, it is not profitable to move this
// to inner position.
if (!canVectorize(DepMatrix, OuterLoopId))
return false;

// If inner loop has dependence and outer loop is loop independent then it is
// If inner loop cannot be vectorized and outer loop can be then it is
// profitable to interchange to enable inner loop parallelism.
if (!canVectorize(DepMatrix, InnerLoopId))
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
@E = dso_local global [256 x [256 x [256 x float]]] zeroinitializer

; Check that the below loops are exchanged for vectorization.
;
Expand Down Expand Up @@ -64,15 +66,13 @@ exit:
; for (int j = 1; j < 256; j++)
; A[i][j-1] = A[i][j] + B[i][j];
;
; FIXME: These loops are exchanged at this time due to the problem in
; profitability heuristic calculation for vectorization.

; CHECK: --- !Passed
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: interchange_unnecesasry_for_vectorization
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
; CHECK-NEXT: - String: Insufficient information to calculate the cost of loop for interchange.
define void @interchange_unnecesasry_for_vectorization() {
entry:
br label %for.i.header
Expand Down Expand Up @@ -103,3 +103,135 @@ for.i.inc:
exit:
ret void
}

; Check that the below loops are exchanged to allow innermost loop
; vectorization. We cannot vectorize the j-loop because it has a lexically
; backward dependency, but the i-loop can be vectorized because all the
; loop-carried dependencies are lexically forward.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 1; j < 256; j++) {
; A[i][j] = A[i][j-1] + B[i][j];
; C[i][j] += C[i+1][j];
; }
; }
;

; CHECK: --- !Passed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: Interchanged
; CHECK-NEXT: Function: interchange_necessary_for_vectorization2
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Loop interchanged with enclosing loop.
define void @interchange_necessary_for_vectorization2() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
%i.inc = add nsw i64 %i, 1
br label %for.j.body

for.j.body:
%j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
%j.dec = add nsw i64 %j, -1
%a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
%b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
%c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
%c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
%a = load float, ptr %a.load.index, align 4
%b = load float, ptr %b.index, align 4
%c0 = load float, ptr %c.load.index, align 4
%c1 = load float, ptr %c.store.index, align 4
%add.0 = fadd float %a, %b
%a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
store float %add.0, ptr %a.store.index, align 4
%add.1 = fadd float %c0, %c1
store float %add.1, ptr %c.store.index, align 4
%j.next = add nuw nsw i64 %j, 1
%cmp.j = icmp eq i64 %j.next, 256
br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
%i.next = add nuw nsw i64 %i, 1
%cmp.i = icmp eq i64 %i.next, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}

; Check that no interchange is performed for the following loop. The j-loop is
; vectorizable because all the dependencies are lexically forward. However, at
; the moment, we don't analyze an execution order between instructions in
; different BBs, so fail to determine that the j-loop is vectorizable.
; Therefore, no exchange is performed.
;
; for (int i = 0; i < 255; i++) {
; for (int j = 0; j < 255; j++) {
; for (int k = 0; k < 128; k++) {
; E[i][j][k] = D[i+1][j+1][2*k];
; if (cond)
; D[i][j][k+1] += 1.0;
; }
; }

; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: multiple_BBs_in_loop
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; CHECK: --- !Missed
; CHECK-NEXT: Pass: loop-interchange
; CHECK-NEXT: Name: InterchangeNotProfitable
; CHECK-NEXT: Function: multiple_BBs_in_loop
; CHECK-NEXT: Args:
; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
define void @multiple_BBs_in_loop() {
entry:
br label %for.i.header

for.i.header:
%i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
%i.inc = add nsw i64 %i, 1
br label %for.j.header

for.j.header:
%j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
%j.inc = add nsw i64 %j, 1
br label %for.k.body

for.k.body:
%k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
%k.inc = add nsw i64 %k, 1
%k.2 = mul nsw i64 %k, 2
%d.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i.inc, i64 %j.inc, i64 %k.2
%e.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @E, i64 %i, i64 %j, i64 %k
%d.load = load float, ptr %d.index, align 4
store float %d.load, ptr %e.index, align 4
%cond = freeze i1 undef
br i1 %cond, label %if.then, label %for.k.inc

if.then:
%d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i, i64 %j, i64 %k.inc
%d.load2 = load float, ptr %d.index2, align 4
%add = fadd float %d.load2, 1.0
store float %add, ptr %d.index2, align 4
br label %for.k.inc

for.k.inc:
%cmp.k = icmp eq i64 %k.inc, 128
br i1 %cmp.k, label %for.j.inc, label %for.k.body

for.j.inc:
%cmp.j = icmp eq i64 %j.inc, 255
br i1 %cmp.j, label %for.i.inc, label %for.j.header

for.i.inc:
%cmp.i = icmp eq i64 %i.inc, 255
br i1 %cmp.i, label %exit, label %for.i.header

exit:
ret void
}
Loading