-
Notifications
You must be signed in to change notification settings - Fork 13.7k
[VectorCombine] foldPermuteOfBinops - fold "shuffle (binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)". #122118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VectorCombine] foldPermuteOfBinops - fold "shuffle (binop (shuffle, other)), undef" --> "binop (shuffle), (shuffle)". #122118
Conversation
…other)), undef" --> "binop (shuffle), (shuffle)". foldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle.
@llvm/pr-subscribers-llvm-transforms Author: Simon Pilgrim (RKSimon) ChangesfoldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle. Full diff: https://github.com/llvm/llvm-project/pull/122118.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 120eafae8c5ac5..e9478103f36288 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1592,17 +1592,21 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
return false;
- Value *Op00, *Op01;
- ArrayRef<int> Mask0;
- if (!match(BinOp->getOperand(0),
- m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))))
+ Value *Op00, *Op01, *Op10, *Op11;
+ ArrayRef<int> Mask0, Mask1;
+ bool Match0 =
+ match(BinOp->getOperand(0),
+ m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
+ bool Match1 =
+ match(BinOp->getOperand(1),
+ m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+ if (!Match0 && !Match1)
return false;
- Value *Op10, *Op11;
- ArrayRef<int> Mask1;
- if (!match(BinOp->getOperand(1),
- m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))))
- return false;
+ Op00 = Match0 ? Op00 : BinOp->getOperand(0);
+ Op01 = Match0 ? Op01 : BinOp->getOperand(0);
+ Op10 = Match1 ? Op10 : BinOp->getOperand(1);
+ Op11 = Match1 ? Op11 : BinOp->getOperand(1);
Instruction::BinaryOps Opcode = BinOp->getOpcode();
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -1620,15 +1624,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
return false;
- // Merge outer / inner shuffles.
+ // Merge outer / inner (or identity if no match) shuffles.
SmallVector<int> NewMask0, NewMask1;
for (int M : OuterMask) {
if (M < 0 || M >= (int)NumSrcElts) {
NewMask0.push_back(PoisonMaskElem);
NewMask1.push_back(PoisonMaskElem);
} else {
- NewMask0.push_back(Mask0[M]);
- NewMask1.push_back(Mask1[M]);
+ NewMask0.push_back(Match0 ? Mask0[M] : M);
+ NewMask1.push_back(Match1 ? Mask1[M] : M);
}
}
@@ -1636,13 +1640,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
- OuterMask, CostKind, 0, nullptr, {BinOp}, &I) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0,
- CostKind, 0, nullptr, {Op00, Op01},
- cast<Instruction>(BinOp->getOperand(0))) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1,
- CostKind, 0, nullptr, {Op10, Op11},
- cast<Instruction>(BinOp->getOperand(1)));
+ OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+ if (Match0)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
+ Mask0, CostKind, 0, nullptr, {Op00, Op01},
+ cast<Instruction>(BinOp->getOperand(0)));
+ if (Match1)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
+ Mask1, CostKind, 0, nullptr, {Op10, Op11},
+ cast<Instruction>(BinOp->getOperand(1)));
InstructionCost NewCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index a4aea02a335117..4863ea91803ad5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -801,10 +801,10 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fadd <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index bcb316a4a73ea6..4f67ee0cb18c04 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -801,10 +801,10 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fsub <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 8db1990dcbb5d8..1dc324bbd63ff9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -70,16 +70,25 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
ret <4 x double> %post
}
-; Negative test - multiple use of inner shuffle
+; Negative test - multiple use of inner shuffle (only fold if the moved shuffle is cheaper).
define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT: ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; SSE-NEXT: ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; AVX-NEXT: ret <4 x double> [[POST]]
;
%a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -137,6 +146,3 @@ define <4 x i32> @sdiv_v4i32_poison_idx(<4 x i32> %a, <4 x i32> %b) {
%post = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 4>
ret <4 x i32> %post
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
|
@llvm/pr-subscribers-vectorizers Author: Simon Pilgrim (RKSimon) ChangesfoldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle. Full diff: https://github.com/llvm/llvm-project/pull/122118.diff 4 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 120eafae8c5ac5..e9478103f36288 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1592,17 +1592,21 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
return false;
- Value *Op00, *Op01;
- ArrayRef<int> Mask0;
- if (!match(BinOp->getOperand(0),
- m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)))))
+ Value *Op00, *Op01, *Op10, *Op11;
+ ArrayRef<int> Mask0, Mask1;
+ bool Match0 =
+ match(BinOp->getOperand(0),
+ m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
+ bool Match1 =
+ match(BinOp->getOperand(1),
+ m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
+ if (!Match0 && !Match1)
return false;
- Value *Op10, *Op11;
- ArrayRef<int> Mask1;
- if (!match(BinOp->getOperand(1),
- m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)))))
- return false;
+ Op00 = Match0 ? Op00 : BinOp->getOperand(0);
+ Op01 = Match0 ? Op01 : BinOp->getOperand(0);
+ Op10 = Match1 ? Op10 : BinOp->getOperand(1);
+ Op11 = Match1 ? Op11 : BinOp->getOperand(1);
Instruction::BinaryOps Opcode = BinOp->getOpcode();
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -1620,15 +1624,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
return false;
- // Merge outer / inner shuffles.
+ // Merge outer / inner (or identity if no match) shuffles.
SmallVector<int> NewMask0, NewMask1;
for (int M : OuterMask) {
if (M < 0 || M >= (int)NumSrcElts) {
NewMask0.push_back(PoisonMaskElem);
NewMask1.push_back(PoisonMaskElem);
} else {
- NewMask0.push_back(Mask0[M]);
- NewMask1.push_back(Mask1[M]);
+ NewMask0.push_back(Match0 ? Mask0[M] : M);
+ NewMask1.push_back(Match1 ? Mask1[M] : M);
}
}
@@ -1636,13 +1640,15 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, BinOpTy,
- OuterMask, CostKind, 0, nullptr, {BinOp}, &I) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, Mask0,
- CostKind, 0, nullptr, {Op00, Op01},
- cast<Instruction>(BinOp->getOperand(0))) +
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, Mask1,
- CostKind, 0, nullptr, {Op10, Op11},
- cast<Instruction>(BinOp->getOperand(1)));
+ OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
+ if (Match0)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty,
+ Mask0, CostKind, 0, nullptr, {Op00, Op01},
+ cast<Instruction>(BinOp->getOperand(0)));
+ if (Match1)
+ OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty,
+ Mask1, CostKind, 0, nullptr, {Op10, Op11},
+ cast<Instruction>(BinOp->getOperand(1)));
InstructionCost NewCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, NewMask0,
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index a4aea02a335117..4863ea91803ad5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -801,10 +801,10 @@ define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @add_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fadd <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index bcb316a4a73ea6..4f67ee0cb18c04 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -801,10 +801,10 @@ define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_u1(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <2 x double> [[RESULT]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[RESULT1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@@ -820,9 +820,9 @@ define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) {
define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @sub_v2f64_0u(
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[RESULT:%.*]] = fsub <2 x double> [[TMP1]], [[SHIFT]]
; CHECK-NEXT: ret <2 x double> [[RESULT]]
;
%a0 = extractelement <2 x double> %a, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 8db1990dcbb5d8..1dc324bbd63ff9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -70,16 +70,25 @@ define <4 x double> @fadd_v4f64_multiuse_op(<4 x double> %a, <4 x double> %b) {
ret <4 x double> %post
}
-; Negative test - multiple use of inner shuffle
+; Negative test - multiple use of inner shuffle (only fold if the moved shuffle is cheaper).
define <4 x double> @fadd_v4f64_multiuse_shuffle(<4 x double> %a, <4 x double> %b) {
-; CHECK-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
-; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
-; CHECK-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: call void @use_v4f64(<4 x double> [[A1]])
-; CHECK-NEXT: ret <4 x double> [[POST]]
+; SSE-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; SSE-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT: [[POST:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; SSE-NEXT: ret <4 x double> [[POST]]
+;
+; AVX-LABEL: define <4 x double> @fadd_v4f64_multiuse_shuffle(
+; AVX-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[A1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT: [[B1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; AVX-NEXT: [[OP:%.*]] = fadd <4 x double> [[A1]], [[B1]]
+; AVX-NEXT: [[POST:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; AVX-NEXT: call void @use_v4f64(<4 x double> [[A1]])
+; AVX-NEXT: ret <4 x double> [[POST]]
;
%a1 = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%b1 = shufflevector <4 x double> %b, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -137,6 +146,3 @@ define <4 x i32> @sdiv_v4i32_poison_idx(<4 x i32> %a, <4 x i32> %b) {
%post = shufflevector <4 x i32> %op, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 4>
ret <4 x i32> %post
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
|
ping? |
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison> | ||
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]] | ||
; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison> | ||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be just A
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it could be - we're not very consistent with folding away identity like shuffles, as sometimes the poison mask elements can lead to further demanded elts simplification.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The problem with these shuffles is that TTI may incorrectly calculate them as broadcasts instead of identity
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK - I've updated the patch to skip known identity shuffles.
; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0> | ||
; CHECK-NEXT: ret <2 x double> [[RESULT]] | ||
; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0> | ||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 1> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just TMP1
?
OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op0Ty, | ||
Mask0, CostKind, 0, nullptr, {Op00, Op01}, | ||
cast<Instruction>(BinOp->getOperand(0))); | ||
if (Match1) | ||
OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, Op1Ty, | ||
Mask1, CostKind, 0, nullptr, {Op10, Op11}, | ||
cast<Instruction>(BinOp->getOperand(1))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use getInstructionCost
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
getInstructionCost is still causing some regressions for some shuffle costs - I haven't been able to get to the bottom of them all yet - but the plan is to eventually move as much of the VectorCombine OldCost calculations over to getInstructionCost as possible.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/123/builds/11581 Here is the relevant piece of the build log for the reference
|
It appears that the bot stayed red after this patch landed. |
cheers - fix incoming |
Hi @RKSimon, thank you. |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/11956 Here is the relevant piece of the build log for the reference
|
Looks like this is causing another crash:
|
… they match the destination type Fixes regression identified after #122118
foldPermuteOfBinops currently requires both binop operands to be oneuse shuffles to fold the shuffles across the binop, but there will be cases where its still profitable to fold across the binop with only one foldable shuffle.