[VPlan] Simplify BLEND %a, %b, NOT(%m) -> BLEND %b, %a, %m. #128375
Conversation
Avoid negations for normalized blends by reordering operands.
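In scalar terms, the rewrite relies on a simple select identity: selecting on a negated mask is the same as selecting with the two incoming values swapped. A minimal C++ sketch (illustrative only, not part of the patch), assuming BLEND x, y, m yields y where m is set and x otherwise:

#include <cassert>

// Scalar analogue of BLEND %a, %b, NOT(%m) -> BLEND %b, %a, %m.
// blend(x, y, m) yields y where the mask is set and x otherwise.
static int blend(int x, int y, bool m) { return m ? y : x; }

int main() {
  for (bool m : {false, true})
    for (int a : {0, 1})
      for (int b : {0, 1})
        // Negating the mask is equivalent to swapping the incoming
        // values, so the xor/not recipe can be dropped.
        assert(blend(a, b, !m) == blend(b, a, m));
  return 0;
}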
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-vectorizers

Author: Florian Hahn (fhahn)

Changes: Avoid negations for normalized blends by reordering operands.

Patch is 51.52 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/128375.diff

19 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ce81b2e147df8..58f7f1178aab5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -851,8 +851,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return;
}
- if (Blend->isNormalized())
+ if (Blend->isNormalized()) {
+ /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
+ VPValue *NewMask;
+ if (Blend->getNumOperands() == 3 &&
+ match(Blend->getMask(1), m_Not(m_VPValue(NewMask)))) {
+ VPValue *Inc0 = Blend->getIncomingValue(0);
+ VPValue *Inc1 = Blend->getIncomingValue(1);
+ Blend->setOperand(0, Inc1);
+ Blend->setOperand(1, Inc0);
+ Blend->setOperand(2, NewMask);
+ }
+
return;
+ }
// Normalize the blend so its first incoming value is used as the initial
// value with the others blended into it.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 5b0f0961a6297..07873fba86b6d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -158,8 +158,7 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; TFNONE-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 2 x i64> [[WIDE_LOAD]], splat (i64 50)
; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> [[TMP7]])
-; TFNONE-NEXT: [[TMP9:%.*]] = xor <vscale x 2 x i1> [[TMP7]], splat (i1 true)
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64> [[TMP8]]
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i64> zeroinitializer
; TFNONE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
; TFNONE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP10]], align 8
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -321,7 +320,7 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
; TFNONE-NEXT: [[TMP8:%.*]] = xor <vscale x 2 x i1> [[TMP7]], splat (i1 true)
; TFNONE-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> [[TMP8]])
; TFNONE-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> [[TMP7]])
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP8]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP10]]
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP7]], <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64> [[TMP9]]
; TFNONE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
; TFNONE-NEXT: store <vscale x 2 x i64> [[PREDPHI]], ptr [[TMP11]], align 8
; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -987,8 +986,7 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
; TFNONE-NEXT: [[TMP8:%.*]] = call <vscale x 2 x double> @exp_masked_scalable(<vscale x 2 x double> [[BROADCAST_SPLAT]], <vscale x 2 x i1> splat (i1 true))
; TFNONE-NEXT: [[TMP9:%.*]] = fcmp ogt <vscale x 2 x double> [[TMP8]], zeroinitializer
-; TFNONE-NEXT: [[TMP10:%.*]] = xor <vscale x 2 x i1> [[TMP9]], splat (i1 true)
-; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP10]], <vscale x 2 x double> splat (double 1.000000e+00), <vscale x 2 x double> zeroinitializer
+; TFNONE-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP9]], <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> splat (double 1.000000e+00)
; TFNONE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
; TFNONE-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 2
; TFNONE-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index d612e1f3dab5b..61729c5616faf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -1041,8 +1041,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]]
; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], splat (i1 true)
-; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> splat (float 3.000000e+00), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
+; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00)
; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1102,8 +1101,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A]], i64 [[TMP6]]
; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP9]], splat (i1 true)
-; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> splat (float 3.000000e+00), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
+; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> splat (float 3.000000e+00)
; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index 8333c3193d799..839961483d435 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -401,8 +401,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-ORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
; CHECK-ORDERED: pred.load.continue6
; CHECK-ORDERED: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
-; CHECK-ORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], splat (i1 true)
-; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> splat (float 3.000000e+00), <4 x float> %[[PHI1]]
+; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[FCMP1]], <4 x float> %[[PHI1]], <4 x float> splat (float 3.000000e+00)
; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
; CHECK-ORDERED: for.body
; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
@@ -427,8 +426,7 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
; CHECK-UNORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
; CHECK-UNORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
; CHECK-UNORDERED: pred.load.continue6
-; CHECK-UNORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], splat (i1 true)
-; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> splat (float 3.000000e+00), <4 x float> %[[PRED_PHI:.*]]
+; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[FCMP1]], <4 x float> %[[PRED_PHI:.*]], <4 x float> splat (float 3.000000e+00)
; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[PHI]], %[[PRED]]
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
index eaec02038fde1..e154883ca7eca 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
@@ -36,16 +36,16 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 %
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PB]], i32 [[OFFSET_IDX5]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[NEXT_GEP6]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD7]], zeroinitializer
-; CHECK-NEXT: [[DOTNOT9:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast une <4 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast une <4 x float> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i1> splat (i1 true), <4 x i1> [[TMP19]]
; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD7]])
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]]
; CHECK-NEXT: [[TMP8:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fdiv fast <4 x float> [[TMP8]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[DOTNOT9]], <4 x float> splat (float -0.000000e+00), <4 x float> [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP20]], <4 x float> [[TMP9]], <4 x float> splat (float -0.000000e+00)
; CHECK-NEXT: [[PREDPHI]] = fadd reassoc arcp contract afn <4 x float> [[VEC_PHI]], [[TMP10]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 50dfb689c2060..3f46f1725daf4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -747,8 +747,7 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 8 x i16> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <vscale x 8 x i1> [[TMP12]], splat (i1 true)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> splat (i16 99), <vscale x 8 x i16> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 8 x i1> [[TMP12]], <vscale x 8 x i16> [[WIDE_LOAD]], <vscale x 8 x i16> splat (i16 99)
; CHECK-NEXT: store <vscale x 8 x i16> [[PREDPHI]], ptr [[TMP11]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index c00868c3e0be2..258bb578cf927 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -39,8 +39,7 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
; VLENUNK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP12]]
; VLENUNK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> poison)
-; VLENUNK-NEXT: [[TMP16:%.*]] = xor <vscale x 4 x i1> [[TMP13]], splat (i1 true)
-; VLENUNK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; VLENUNK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
; VLENUNK-NEXT: [[TMP17:%.*]] = add <vscale x 4 x i32> [[PREDPHI]], [[BROADCAST_SPLAT]]
; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP12]]
; VLENUNK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index fefbdb473e2ab..082386e39f3f6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -434,8 +434,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
; NO-VP-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 3)
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
-; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[TMP18]], splat (i1 true)
-; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP16]]
+; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP18]], <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]]
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; NO-VP-OUTLOOP-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-VP-OUTLOOP-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -812,8 +811,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP27]], align 4
; NO-VP-OUTLOOP-NEXT: [[TMP28:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_IND]]
; NO-VP-OUTLOOP-NEXT: [[TMP22:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
-; NO-VP-OUTLOOP-NEXT: [[TMP23:%.*]] = xor <vscale x 4 x i1> [[TMP28]], splat (i1 true)
-; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP22]]
+; NO-VP-OUTLOOP-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP28]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[VEC_PHI]]
; NO-VP-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; NO-VP-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; NO-VP-OUTLOOP-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 68695a8b1282c..686d177ff89ff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -272,9 +272,8 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> zeroinitializer, <4 x i64> [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 4
@@ -364,8 +363,7 @@ define void @drop_zext_nneg(ptr noalias %p, ptr noalias %p1) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP3]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP4]], i32 8, <4 x i1> [[TMP0]], <4 x double> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x double> zeroinitializer, <4 x double> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[PREDPHI]], i32 3
; CHECK-NEXT: store double [[TMP6]], ptr [[P1]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -479,9 +477,8 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true)
; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], splat (i64 2)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> zeroinitializer, <4 x i64> [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
; CHECK-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 4
@@ -604,7 +601,7 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[WIDE_LOAD]], <4 x i8> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr %dst, i64 [[TMP0]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 0
; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[TMP20]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/re...
[truncated]
@@ -851,8 +851,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }

-  if (Blend->isNormalized())
+  if (Blend->isNormalized()) {
Better placed below - after normalizing Blend if needed?
Moved, thanks!
      VPValue *Inc1 = Blend->getIncomingValue(1);
      Blend->setOperand(0, Inc1);
      Blend->setOperand(1, Inc0);
      Blend->setOperand(2, NewMask);
Suggested change:
-      Blend->setOperand(2, NewMask);
+      Blend->setOperand(2, NewMask);
+      recursivelyDeleteDeadRecipes(OldMask);
or check if unused and delete, knowing its operand is live? Although it does seem to get collected eventually.
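A minimal sketch of the check-if-unused alternative (hypothetical; helper names assumed from the VPlan API rather than taken from the patch):

// If swapping in NewMask left the old NOT recipe without users,
// erase its defining recipe directly instead of waiting for a
// later dead-recipe sweep.
if (OldMask->getNumUsers() == 0)
  if (VPRecipeBase *DefR = OldMask->getDefiningRecipe())
    DefR->eraseFromParent();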
Updated, although there are no more opportunities from the manual cleanup.
@@ -851,8 +851,20 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
Unrelated, while we're here: the above optimization checks for a single unique (non-masked-out) value, so it could hold a plain pointer rather than a SmallPtrSet (see the sketch below). OTOH, it could filter away all masked-out values, even if more than one remains.
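A rough sketch of the pointer-based variant (hypothetical code; helper names assumed from the VPlan API, mask-based filtering elided):

// Track the single unique incoming value with a plain pointer instead
// of a SmallPtrSet; give up once two distinct values are seen.
VPValue *UniqueInc = nullptr;
bool Multiple = false;
for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
  VPValue *Inc = Blend->getIncomingValue(I);
  // (filtering of masked-out incoming values elided)
  if (!UniqueInc)
    UniqueInc = Inc;
  else if (UniqueInc != Inc) {
    Multiple = true;
    break;
  }
}
if (UniqueInc && !Multiple)
  Blend->replaceAllUsesWith(UniqueInc);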
; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast une <4 x float> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast une <4 x float> [[WIDE_LOAD7]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i1> splat (i1 true), <4 x i1> [[TMP19]]
This NOTing of DOTNOT9 involves NOTing its first select operand, flipping the order of its last two operands, and NOTing each of those two operands (including replacing zeroinitializer with splat (i1 true)). Is it intended, and profitable?
It may not be profitable, depending on the cost to materialize splat (i1 true), but it is instcombine that rewrites the phi in that way. Such details should probably be dealt with in the backend, where more HW-specific info is available?
After LV, we get
< %19 = xor <4 x i1> %11, splat (i1 true)
< %predphi = select <4 x i1> %19, <4 x float> %vec.phi, <4 x float> %18
---
> %predphi = select <4 x i1> %11, <4 x float> %18, <4 x float> %vec.phi
@@ -604,7 +601,7 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr [5 x i8], ptr @c, i64 0, i64 [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP17]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP18]], align 1
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i8> [[WIDE_LOAD]], <4 x i8> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
(Here the NOTed mask TMP2 has other uses, so it remains.)
; CHECK-NEXT: [[P:%.*]] = select i1 [[TMP14]], i64 [[TMP12]], i64 1
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[TMP15]], i64 [[TMP13]], i64 1
; CHECK-NEXT: [[P:%.*]] = select i1 [[C]], i64 1, i64 [[TMP12]]
(The old mask TMP14 turns out to be NOT C, and the same holds for TMP15.)
@@ -71,7 +71,7 @@ for.end:

;CHECK-LABEL: @reduction_func(
;CHECK: load <4 x i32>
-;CHECK: icmp slt <4 x i32>
+;CHECK: icmp sgt <4 x i32>
Is this reversing slt into sgt? What about equality?
This is because the limited CHECK lines don't check the compare's operands. Previously we had icmp slt ... 31; now we have icmp sgt i32 ... 30. The two are complements: not (slt %x, 31) is sge %x, 31, i.e. sgt %x, 30, so equality is accounted for.
@@ -319,10 +317,9 @@ define void @bug18724(i1 %cond, ptr %ptr, i1 %cond.2, i64 %v.1, i32 %v.2) {
; VEC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2
; VEC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
; VEC-NEXT: [[IND_END:%.*]] = add i64 [[V_1]], [[N_VEC]]
; VEC-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[V_2:%.*]], i32 0
nit: this can probably stay in its place(?)
The limited check lines make it difficult to reason about test changes in #128375.
LGTM, last minor nit.
    match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
      VPValue *Inc0 = NewBlend->getIncomingValue(0);
      VPValue *Inc1 = NewBlend->getIncomingValue(1);
      VPValue *OldMask = NewBlend->getOperand(2);
      NewBlend->setOperand(0, Inc1);
      NewBlend->setOperand(1, Inc0);
      NewBlend->setOperand(2, NewMask);
For consistency, this should all use getMask(1)/setMask(1), getIncomingValue(0,1)/setIncomingValue(0,1), with setters to be added, or all use getOperand(0,1,2)/setOperand(0,1,2) as in the following.
BTW, VPBlendRecipe could be a VPInstruction, or a derivative of it - which provides additional methods (but not fields).
Suggested change:
-    match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
-      VPValue *Inc0 = NewBlend->getIncomingValue(0);
-      VPValue *Inc1 = NewBlend->getIncomingValue(1);
-      VPValue *OldMask = NewBlend->getOperand(2);
-      NewBlend->setOperand(0, Inc1);
-      NewBlend->setOperand(1, Inc0);
-      NewBlend->setOperand(2, NewMask);
+    match(NewBlend->getOperand(2), m_Not(m_VPValue(NewMask)))) {
+      VPValue *IncomingValue0 = NewBlend->getOperand(0);
+      VPValue *IncomingValue1 = NewBlend->getOperand(1);
+      VPValue *OldMask = NewBlend->getOperand(2);
+      NewBlend->setOperand(0, IncomingValue1);
+      NewBlend->setOperand(1, IncomingValue0);
+      NewBlend->setOperand(2, NewMask);
Updated to use getOperand consistently for now, thanks!