Skip to content

[VPlan][LoopVectorize] Potential issue in how replicate recipe is lowered to IR #115169

Open
@danilaml

Description

@danilaml

It's a bit of a tricky one since I can no longer reproduce it on main due to changes in 5a4c6f9 , but I believe they didn't actually fix the underlying problem, but just made it (almost?) impossible to trigger the conditions for it.
As a workaround, I've introduced a cl opt for testing that forces replicate recipe over widen recipe so this can be triggered for regular instructions (since it's not possible to trigger it for loads after above changes):

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -357,6 +357,11 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
;
 
+static cl::opt<bool> ForceReplicateOverWiden(
+    "force-replicate-over-widen", cl::init(false), cl::Hidden,
+    cl::desc("Force replicate recipies over widen recipes."
+             "This flag should only be used for testing."));
+
 namespace llvm {
 cl::opt<bool> EnableVPlanNativePath(
     "enable-vplan-native-path", cl::Hidden,
@@ -9148,8 +9153,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
         continue;
       }
 
-      VPRecipeBase *Recipe =
-          RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
+      VPRecipeBase *Recipe = nullptr;
+      if (!ForceReplicateOverWiden || isa<PHINode>(Instr))
+        Recipe =
+            RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
       if (!Recipe)
         Recipe = RecipeBuilder.handleReplication(Instr, Range);

With this patch and -passes=loop-vectorize -force-vector-width=4 -force-replicate-over-widen options passed to opt on a command line the following IR:

define void @src(i1 %arg, i64 %arg1, ptr %arg2) {
bb:
  br label %bb3

bb3:                                              ; preds = %bb6, %bb
  %phi = phi i64 [ %add, %bb6 ], [ 0, %bb ]
  br i1 %arg, label %bb6, label %bb4

bb4:                                              ; preds = %bb3
  %load = load i32, ptr  %arg2, align 8
  %add5 = add i32 %load, 1
  br label %bb6

bb6:                                              ; preds = %bb4, %bb3
  %phi7 = phi i32 [ %add5, %bb4 ], [ 0, %bb3 ]
  %add = add i64 %phi, 1
  %icmp = icmp samesign ult i64 %phi, %arg1
  br i1 %icmp, label %bb3, label %bb8

bb8:                                              ; preds = %bb6
  %phi9 = phi i32 [ %phi7, %bb6 ]
  ret void
}

Gets optimized to

define void @src(i1 %arg, i64 %arg1, ptr %arg2) {
bb:
  %0 = add i64 %arg1, 1
  %min.iters.check = icmp ult i64 %0, 4
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %bb
  %n.mod.vf = urem i64 %0, 4
  %n.vec = sub i64 %0, %n.mod.vf
  %broadcast.splatinsert = insertelement <4 x i1> poison, i1 %arg, i64 0
  %broadcast.splat = shufflevector <4 x i1> %broadcast.splatinsert, <4 x i1> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %pred.load.continue6, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.load.continue6 ]
  %1 = xor <4 x i1> %broadcast.splat, <i1 true, i1 true, i1 true, i1 true>
  %2 = extractelement <4 x i1> %1, i32 0
  br i1 %2, label %pred.load.if, label %pred.load.continue

pred.load.if:                                     ; preds = %vector.body
  %3 = load i32, ptr %arg2, align 8
  br label %pred.load.continue

pred.load.continue:                               ; preds = %pred.load.if, %vector.body
  %4 = phi i32 [ poison, %vector.body ], [ %3, %pred.load.if ]
  %5 = extractelement <4 x i1> %1, i32 1
  br i1 %5, label %pred.load.if1, label %pred.load.continue2

pred.load.if1:                                    ; preds = %pred.load.continue
  %6 = load i32, ptr %arg2, align 8
  br label %pred.load.continue2

pred.load.continue2:                              ; preds = %pred.load.if1, %pred.load.continue
  %7 = phi i32 [ poison, %pred.load.continue ], [ %6, %pred.load.if1 ]
  %8 = extractelement <4 x i1> %1, i32 2
  br i1 %8, label %pred.load.if3, label %pred.load.continue4

pred.load.if3:                                    ; preds = %pred.load.continue2
  %9 = load i32, ptr %arg2, align 8
  br label %pred.load.continue4

pred.load.continue4:                              ; preds = %pred.load.if3, %pred.load.continue2
  %10 = phi i32 [ poison, %pred.load.continue2 ], [ %9, %pred.load.if3 ]
  %11 = extractelement <4 x i1> %1, i32 3
  br i1 %11, label %pred.load.if5, label %pred.load.continue6

pred.load.if5:                                    ; preds = %pred.load.continue4
  %12 = load i32, ptr %arg2, align 8
  br label %pred.load.continue6

pred.load.continue6:                              ; preds = %pred.load.if5, %pred.load.continue4
  %13 = phi i32 [ poison, %pred.load.continue4 ], [ %12, %pred.load.if5 ]
  %14 = add i32 %4, 1
  %15 = add i32 %7, 1
  %16 = add i32 %10, 1
  %17 = add i32 %13, 1
  %18 = insertelement <4 x i32> poison, i32 %14, i32 0
  %19 = insertelement <4 x i32> %18, i32 %15, i32 1
  %20 = insertelement <4 x i32> %19, i32 %16, i32 2
  %21 = insertelement <4 x i32> %20, i32 %17, i32 3
  %predphi = select <4 x i1> %broadcast.splat, <4 x i32> zeroinitializer, <4 x i32> %21
  %index.next = add nuw i64 %index, 4
  %22 = icmp eq i64 %index.next, %n.vec
  br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0

middle.block:                                     ; preds = %pred.load.continue6
  %23 = extractelement <4 x i32> %predphi, i32 3
  %cmp.n = icmp eq i64 %0, %n.vec
  br i1 %cmp.n, label %bb8, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %bb
  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %bb ]
  br label %bb3

bb3:                                              ; preds = %scalar.ph, %bb6
  %phi = phi i64 [ %add, %bb6 ], [ %bc.resume.val, %scalar.ph ]
  br i1 %arg, label %bb6, label %bb4

bb4:                                              ; preds = %bb3
  %load = load i32, ptr %arg2, align 8
  %add5 = add i32 %load, 1
  br label %bb6

bb6:                                              ; preds = %bb4, %bb3
  %phi7 = phi i32 [ %add5, %bb4 ], [ 0, %bb3 ]
  %add = add i64 %phi, 1
  %icmp = icmp samesign ult i64 %phi, %arg1
  br i1 %icmp, label %bb3, label %bb8, !llvm.loop !3

bb8:                                              ; preds = %middle.block, %bb6
  %phi9 = phi i32 [ %phi7, %bb6 ], [ %23, %middle.block ]
  ret void
}

!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = !{!"llvm.loop.unroll.runtime.disable"}
!3 = distinct !{!3, !2, !1}

Note the pred.load.continue6 BB:

pred.load.continue6:                              ; preds = %pred.load.if5, %pred.load.continue4
  %13 = phi i32 [ poison, %pred.load.continue4 ], [ %12, %pred.load.if5 ]
  %14 = add i32 %4, 1
  %15 = add i32 %7, 1
  %16 = add i32 %10, 1
  %17 = add i32 %13, 1
  %18 = insertelement <4 x i32> poison, i32 %14, i32 0
  %19 = insertelement <4 x i32> %18, i32 %15, i32 1
  %20 = insertelement <4 x i32> %19, i32 %16, i32 2
  %21 = insertelement <4 x i32> %20, i32 %17, i32 3
  %predphi = select <4 x i1> %broadcast.splat, <4 x i32> zeroinitializer, <4 x i32> %21
  %index.next = add nuw i64 %index, 4
  %22 = icmp eq i64 %index.next, %n.vec
  br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0

All those adds are operating on a poison coming from phi while the predicate/mask is applied AFTER the operation is done, not before.
Before 5a4c6f9 this would happen for dereferenceable loads leading to load from poison ptr which is immediate ub. In a downstream project I have loads that are known to be dereferenceable unconditionally in the function body (so no ctx argument is needed) so for them this transformation still applies even after the above change.
To me it seems like the "correct" IR should've been to construct vector from %4, %7, %10, %13, then use it in a predphi and then do a vector (or scalar) add on that.

@fhahn , do you see any obvious problems with this? I've tried looking into why VPlan generates this IR but it's rather involved to my untrained eye.

Metadata

Metadata

Assignees

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions