llvm · kasuga-fj · Mar 27, 2025 · Apr 3, 2025 · Apr 8, 2025 · Jun 6, 2025
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -17,8 +17,8 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -119,7 +119,11 @@ static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {
 
 static void printDepMatrix(CharMatrix &DepMatrix) {
   for (auto &Row : DepMatrix) {
-    for (auto D : Row)
+    ArrayRef<char> RowRef(Row);
+
+    // Drop the last element because it is a flag indicating whether the row is
+    // "lexically forward", which doesn't affect the legality check.
+    for (auto D : RowRef.drop_back())
       LLVM_DEBUG(dbgs() << D << " ");
     LLVM_DEBUG(dbgs() << "\n");
   }
@@ -167,7 +171,20 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
     return false;
   }
   ValueVector::iterator I, IE, J, JE;
-  StringSet<> Seen;
+
+  // Manage direction vectors that are already seen. Map each direction vector
+  // to an index of DepMatrix at which it is stored.
+  StringMap<unsigned> Seen;
+
+  // The i-th element is set iff all dependencies corresponding to the i-th
+  // direction vector in DepMatrix are "lexically forward". The notion
+  // "lexically forward" aligns with what is defined in LAA
+  // (LoopAccessAnalysis).
+  //
+  // We deem a dependence lexically forward if we can prove that the
+  // destination instruction is always executed after the source instruction
+  // within each iteration.
+  BitVector IsForwardFlags;
 
   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
@@ -180,10 +197,22 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
       // Track Output, Flow, and Anti dependencies.
       if (auto D = DI->depends(Src, Dst)) {
         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+        bool IsForward = true;
+
+        // If Src and Dst are in the same BB, Src is always executed before Dst
+        // in the same loop iteration. If not, we must check whether one BB
+        // dominates the other to determine if Src and Dst are executed in this
+        // order. At the moment, we don't perform such check.
+        if (Src->getParent() != Dst->getParent())
+          IsForward = false;
+
         // If the direction vector is negative, normalize it to
         // make it non-negative.
-        if (D->normalize(SE))
+        bool Normalized = D->normalize(SE);
+        if (Normalized) {
           LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
+          IsForward = false;
+        }
         LLVM_DEBUG(StringRef DepType =
                        D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
                    dbgs() << "Found " << DepType
@@ -221,13 +250,28 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
           Dep.push_back('I');
         }
 
+        auto [Ite, Inserted] = Seen.try_emplace(
+            StringRef(Dep.data(), Dep.size()), DepMatrix.size());
+
         // Make sure we only add unique entries to the dependency matrix.
-        if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
+        if (Inserted) {
           DepMatrix.push_back(Dep);
+          IsForwardFlags.push_back(true);
+        }
+        if (!IsForward)
+          IsForwardFlags.reset(Ite->second);
       }
     }
   }
 
+  assert(DepMatrix.size() == IsForwardFlags.size() &&
+         "Dependency matrix and IsForwardVec should have the same size.");
+
+  // If all dependencies corresponding to a direction vector are forward, encode
+  // it to '<', otherwise to '*'.
+  for (unsigned I = 0; I != DepMatrix.size(); I++)
+    DepMatrix[I].push_back(IsForwardFlags[I] ? '<' : '*');
+
   return true;
 }
 
@@ -276,11 +320,12 @@ static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
       continue;
 
     // Check if the direction vector is lexicographically positive (or zero)
-    // for both before/after exchanged.
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    // for both before/after exchanged. Ignore the last element because it
+    // doesn't affect the legality.
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
     std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
-    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size()) == false)
+    if (isLexicographicallyPositive(Cur, OuterLoopId, Cur.size() - 1) == false)
       return false;
   }
   return true;
@@ -1222,21 +1267,33 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
 static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
   for (unsigned I = 0; I != DepMatrix.size(); I++) {
     char Dir = DepMatrix[I][LoopId];
-    if (Dir != 'I' && Dir != '=')
-      return false;
+    char DepType = DepMatrix[I].back();
+    assert((DepType == '<' || DepType == '*') &&
+           "Unexpected element in dependency vector");
+
+    // There are no loop-carried dependencies.
+    if (Dir == '=' || Dir == 'I')
+      continue;
+
+    // If both Dir and DepType are '<', it means that the all dependencies are
+    // lexically forward. Such dependencies don't prevent vectorization.
+    if (Dir == '<' && DepType == '<')
+      continue;
+
+    // We cannot prove that the loop is vectorizable.
+    return false;
   }
   return true;
 }
 
 std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
     unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
-  // If the outer loop is not loop independent it is not profitable to move
-  // this to inner position, since doing so would not enable inner loop
-  // parallelism.
+  // If the outer loop cannot be vectorized, it is not profitable to move this
+  // to inner position.
   if (!canVectorize(DepMatrix, OuterLoopId))
     return false;
 
-  // If inner loop has dependence and outer loop is loop independent then it is
+  // If inner loop cannot be vectorized and outer loop can be then it is
   // profitable to interchange to enable inner loop parallelism.
   if (!canVectorize(DepMatrix, InnerLoopId))
     return true;

diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll
@@ -5,6 +5,8 @@
 @A = dso_local global [256 x [256 x float]] zeroinitializer
 @B = dso_local global [256 x [256 x float]] zeroinitializer
 @C = dso_local global [256 x [256 x float]] zeroinitializer
+@D = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
+@E = dso_local global [256 x [256 x [256 x float]]] zeroinitializer
 
 ; Check that the below loops are exchanged for vectorization.
 ;
@@ -64,15 +66,13 @@ exit:
 ;   for (int j = 1; j < 256; j++)
 ;     A[i][j-1] = A[i][j] + B[i][j];
 ;
-; FIXME: These loops are exchanged at this time due to the problem in
-; profitability heuristic calculation for vectorization.
 
-; CHECK:      --- !Passed
+; CHECK:      --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        interchange_unnecesasry_for_vectorization
 ; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+; CHECK-NEXT:   - String:          Insufficient information to calculate the cost of loop for interchange.
 define void @interchange_unnecesasry_for_vectorization() {
 entry:
   br label %for.i.header
@@ -103,3 +103,135 @@ for.i.inc:
 exit:
   ret void
 }
+
+; Check that the below loops are exchanged to allow innermost loop
+; vectorization. We cannot vectorize the j-loop because it has a lexically
+; backward dependency, but the i-loop can be vectorized because all the
+; loop-carried dependencies are lexically forward.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 1; j < 256; j++) {
+;     A[i][j] = A[i][j-1] + B[i][j];
+;     C[i][j] += C[i+1][j];
+;   }
+; }
+;
+
+; CHECK:      --- !Passed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            Interchanged
+; CHECK-NEXT: Function:        interchange_necessary_for_vectorization2
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Loop interchanged with enclosing loop.
+define void @interchange_necessary_for_vectorization2() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 1, %entry ], [ %i.next, %for.i.inc ]
+  %i.inc = add nsw i64 %i, 1
+  br label %for.j.body
+
+for.j.body:
+  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
+  %j.dec = add nsw i64 %j, -1
+  %a.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j.dec
+  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 %i, i64 %j
+  %c.load.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i.inc, i64 %j
+  %c.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 %i, i64 %j
+  %a = load float, ptr %a.load.index, align 4
+  %b = load float, ptr %b.index, align 4
+  %c0 = load float, ptr %c.load.index, align 4
+  %c1 = load float, ptr %c.store.index, align 4
+  %add.0 = fadd float %a, %b
+  %a.store.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 %i, i64 %j
+  store float %add.0, ptr %a.store.index, align 4
+  %add.1 = fadd float %c0, %c1
+  store float %add.1, ptr %c.store.index, align 4
+  %j.next = add nuw nsw i64 %j, 1
+  %cmp.j = icmp eq i64 %j.next, 256
+  br i1 %cmp.j, label %for.i.inc, label %for.j.body
+
+for.i.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %cmp.i = icmp eq i64 %i.next, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}
+
+; Check that no interchange is performed for the following loop. The j-loop is
+; vectorizable because all the dependencies are lexically forward. However, at
+; the moment, we don't analyze an execution order between instructions in
+; different BBs, so fail to determine that the j-loop is vectorizable.
+; Therefore, no exchange is performed.
+;
+; for (int i = 0; i < 255; i++) {
+;   for (int j = 0; j < 255; j++) {
+;     for (int k = 0; k < 128; k++) {
+;       E[i][j][k] = D[i+1][j+1][2*k];
+;       if (cond)
+;         D[i][j][k+1] += 1.0;
+;   }
+; }
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        multiple_BBs_in_loop
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          Interchanging loops is not considered to improve cache locality nor vectorization.
+define void @multiple_BBs_in_loop() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  %i.inc = add nsw i64 %i, 1
+  br label %for.j.header
+
+for.j.header:
+  %j = phi i64 [ 0, %for.i.header ], [ %j.inc, %for.j.inc ]
+  %j.inc = add nsw i64 %j, 1
+  br label %for.k.body
+
+for.k.body:
+  %k = phi i64 [ 0, %for.j.header ], [ %k.inc, %for.k.inc ]
+  %k.inc = add nsw i64 %k, 1
+  %k.2 = mul nsw i64 %k, 2
+  %d.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i.inc, i64 %j.inc, i64 %k.2
+  %e.index = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @E, i64 %i, i64 %j, i64 %k
+  %d.load = load float, ptr %d.index, align 4
+  store float %d.load, ptr %e.index, align 4
+  %cond = freeze i1 undef
+  br i1 %cond, label %if.then, label %for.k.inc
+
+if.then:
+  %d.index2 = getelementptr nuw inbounds [256 x [256 x [256 x float]]], ptr @D, i64 %i, i64 %j, i64 %k.inc
+  %d.load2 = load float, ptr %d.index2, align 4
+  %add = fadd float %d.load2, 1.0
+  store float %add, ptr %d.index2, align 4
+  br label %for.k.inc
+
+for.k.inc:
+  %cmp.k = icmp eq i64 %k.inc, 128
+  br i1 %cmp.k, label %for.j.inc, label %for.k.body
+
+for.j.inc:
+  %cmp.j = icmp eq i64 %j.inc, 255
+  br i1 %cmp.j, label %for.i.inc, label %for.j.header
+
+for.i.inc:
+  %cmp.i = icmp eq i64 %i.inc, 255
+  br i1 %cmp.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}