[DAG] Fold nested add(add(reduce(a), b), add(reduce(c), d)) #115150
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This patch reassociates add(add(vecreduce(a), b), add(vecreduce(c), d)) into add(vecreduce(add(a, c)), add(b, d)), to combine the reductions into a single node. This comes up after unrolling vectorized loops.

There is another small change to move the reassociateReduction call inside fadd outside of an AllowNewConst block, as new constants will not be created and it should be OK to perform the combine later, after legalization.

Patch is 26.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115150.diff

3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index dcd5ca3b936e72..bf2faa34cc4192 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1329,6 +1329,38 @@ SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc,
DAG.getNode(Opc, DL, N0.getOperand(0).getValueType(),
N0.getOperand(0), N1.getOperand(0)));
}
+
+ // Reassociate add(add(vecreduce(a), b), add(vecreduce(c), d)) into
+ // add(vecreduce(add(a, c)), add(b, d)), to combine the reductions into a
+ // single node.
+ if (N0.getOpcode() == Opc && N1.getOpcode() == Opc && N0->hasOneUse() &&
+ N1->hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ if (N00.getOpcode() != RedOpc && N01.getOpcode() == RedOpc)
+ std::swap(N00, N01);
+ if (N00.getOpcode() == RedOpc && N01.getOpcode() != RedOpc &&
+ N00->hasOneUse()) {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ if (N10.getOpcode() != RedOpc && N11.getOpcode() == RedOpc)
+ std::swap(N10, N11);
+
+ if (N10.getOpcode() == RedOpc &&
+ N00.getOperand(0).getValueType() ==
+ N10.getOperand(0).getValueType() &&
+ N10->hasOneUse() &&
+ hasOperation(Opc, N00.getOperand(0).getValueType()) &&
+ TLI.shouldReassociateReduction(RedOpc, VT)) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+ SDValue Add = DAG.getNode(Opc, DL, N00.getOperand(0).getValueType(),
+ N00.getOperand(0), N10.getOperand(0));
+ SDValue Red = DAG.getNode(RedOpc, DL, VT, Add);
+ SDValue Add2 = DAG.getNode(Opc, DL, VT, N01, N11);
+ return DAG.getNode(Opc, DL, VT, Red, Add2);
+ }
+ }
+ }
return SDValue();
}
@@ -17098,12 +17130,15 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
DAG.getConstantFP(4.0, DL, VT));
}
}
+ } // enable-unsafe-fp-math && AllowNewConst
+ if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
+ (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()))) {
// Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y))
if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL,
VT, N0, N1, Flags))
return SD;
- } // enable-unsafe-fp-math
+ }
// FADD -> FMA combines:
if (SDValue Fused = visitFADDForFMACombine<EmptyMatchContext>(N)) {
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index 0c356b1d98287f..b04e34a54af475 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -288,13 +288,11 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fadd_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd s2, s2, s3
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: faddp s1, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: fadd s1, s1, s3
; CHECK-NEXT: fadd s0, s0, s2
-; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
%a1 = fadd fast float %r1, %c
@@ -332,15 +330,12 @@ define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, flo
define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_mul_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fmul v1.2s, v1.2s, v4.2s
-; CHECK-NEXT: fmul v0.2s, v0.2s, v5.2s
-; CHECK-NEXT: fmul s1, s1, v1.s[1]
+; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmul s2, s2, s3
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-NEXT: fmul s1, s1, s3
; CHECK-NEXT: fmul s0, s0, s2
-; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
%a1 = fmul fast float %r1, %c
@@ -353,12 +348,10 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add w8, w0, w1
; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: add w9, w9, w0
-; CHECK-NEXT: add w8, w8, w1
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
@@ -372,12 +365,10 @@ define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_c1_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add w8, w0, w1
; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: add w9, w0, w9
-; CHECK-NEXT: add w8, w8, w1
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
@@ -391,12 +382,10 @@ define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_add_c2_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: addv s1, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: add w8, w0, w1
; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: add w9, w9, w0
-; CHECK-NEXT: add w8, w1, w8
; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
@@ -429,19 +418,14 @@ define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c,
define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_mul_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: mul v0.2s, v0.2s, v3.2s
-; CHECK-NEXT: mul v1.2s, v1.2s, v2.2s
-; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul w8, w0, w1
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: mov w9, v0.s[1]
; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mul w8, w10, w8
-; CHECK-NEXT: fmov w10, s1
; CHECK-NEXT: mul w9, w10, w9
-; CHECK-NEXT: mul w8, w8, w0
-; CHECK-NEXT: mul w9, w9, w1
-; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
%a1 = mul i32 %r1, %c
@@ -454,19 +438,14 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_and_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v3.8b
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and w8, w0, w1
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: lsr x10, x9, #32
-; CHECK-NEXT: lsr x11, x8, #32
-; CHECK-NEXT: and w9, w9, w0
-; CHECK-NEXT: and w8, w8, w1
-; CHECK-NEXT: and w9, w9, w10
-; CHECK-NEXT: and w8, w8, w11
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: and w8, w9, w8
+; CHECK-NEXT: and w0, w8, w10
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
%a1 = and i32 %r1, %c
@@ -479,19 +458,14 @@ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_or_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: orr v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: orr v0.8b, v0.8b, v3.8b
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: orr w8, w0, w1
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: lsr x10, x9, #32
-; CHECK-NEXT: lsr x11, x8, #32
-; CHECK-NEXT: orr w9, w9, w0
-; CHECK-NEXT: orr w8, w8, w1
-; CHECK-NEXT: orr w9, w9, w10
-; CHECK-NEXT: orr w8, w8, w11
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: orr w0, w8, w10
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
%a1 = or i32 %r1, %c
@@ -504,19 +478,14 @@ define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_xor_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b
-; CHECK-NEXT: fmov x8, d1
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: eor w8, w0, w1
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: lsr x10, x9, #32
-; CHECK-NEXT: lsr x11, x8, #32
-; CHECK-NEXT: eor w9, w9, w0
-; CHECK-NEXT: eor w8, w8, w1
-; CHECK-NEXT: eor w9, w9, w10
-; CHECK-NEXT: eor w8, w8, w11
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: eor w8, w9, w8
+; CHECK-NEXT: eor w0, w8, w10
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
%a1 = xor i32 %r1, %c
@@ -529,14 +498,11 @@ define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smin_i32:
; CHECK: // %bb.0:
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w8, w0, w1, lt
; CHECK-NEXT: sminv s0, v0.4s
-; CHECK-NEXT: sminv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: cmp w9, w0
-; CHECK-NEXT: csel w9, w9, w0, lt
-; CHECK-NEXT: cmp w8, w1
-; CHECK-NEXT: csel w8, w8, w1, lt
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, lt
; CHECK-NEXT: ret
@@ -551,14 +517,11 @@ define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_smax_i32:
; CHECK: // %bb.0:
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w8, w0, w1, gt
; CHECK-NEXT: smaxv s0, v0.4s
-; CHECK-NEXT: smaxv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: cmp w9, w0
-; CHECK-NEXT: csel w9, w9, w0, gt
-; CHECK-NEXT: cmp w8, w1
-; CHECK-NEXT: csel w8, w8, w1, gt
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, gt
; CHECK-NEXT: ret
@@ -573,14 +536,11 @@ define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umin_i32:
; CHECK: // %bb.0:
+; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w8, w0, w1, lo
; CHECK-NEXT: uminv s0, v0.4s
-; CHECK-NEXT: uminv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: cmp w9, w0
-; CHECK-NEXT: csel w9, w9, w0, lo
-; CHECK-NEXT: cmp w8, w1
-; CHECK-NEXT: csel w8, w8, w1, lo
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, lo
; CHECK-NEXT: ret
@@ -595,14 +555,11 @@ define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_umax_i32:
; CHECK: // %bb.0:
+; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w8, w0, w1, hi
; CHECK-NEXT: umaxv s0, v0.4s
-; CHECK-NEXT: umaxv s1, v1.4s
; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: cmp w9, w0
-; CHECK-NEXT: csel w9, w9, w0, hi
-; CHECK-NEXT: cmp w8, w1
-; CHECK-NEXT: csel w8, w8, w1, hi
; CHECK-NEXT: cmp w9, w8
; CHECK-NEXT: csel w0, w9, w8, hi
; CHECK-NEXT: ret
@@ -617,11 +574,10 @@ define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmin_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: fminnmv s1, v1.4s
+; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fminnm s2, s2, s3
; CHECK-NEXT: fminnmv s0, v0.4s
-; CHECK-NEXT: fminnm s1, s1, s3
; CHECK-NEXT: fminnm s0, s0, s2
-; CHECK-NEXT: fminnm s0, s0, s1
; CHECK-NEXT: ret
%r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
%a1 = call float @llvm.minnum.f32(float %r1, float %c)
@@ -634,11 +590,10 @@ define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float
define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_fmax_float:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmaxnmv s1, v1.4s
+; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fmaxnm s2, s2, s3
; CHECK-NEXT: fmaxnmv s0, v0.4s
-; CHECK-NEXT: fmaxnm s1, s1, s3
; CHECK-NEXT: fmaxnm s0, s0, s2
-; CHECK-NEXT: fmaxnm s0, s0, s1
; CHECK-NEXT: ret
%r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
%a1 = call float @llvm.maxnum.f32(float %r1, float %c)
diff --git a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
index 67723e8aa41ad7..1136246f6b14dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
@@ -244,15 +244,12 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
define float @nested_add_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_add_f32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: vadd.f32 s4, s4, s5
+; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vadd.f32 s4, s8, s9
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: vadd.f32 s2, s4, s9
-; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: bx lr
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
%a1 = fadd fast float %r1, %c
@@ -265,15 +262,12 @@ define float @nested_add_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-LABEL: nested_mul_f32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmul.f32 s6, s6, s7
-; CHECK-NEXT: vmul.f32 s4, s4, s5
+; CHECK-NEXT: vmul.f32 q0, q0, q1
+; CHECK-NEXT: vmul.f32 s4, s8, s9
; CHECK-NEXT: vmul.f32 s2, s2, s3
; CHECK-NEXT: vmul.f32 s0, s0, s1
-; CHECK-NEXT: vmul.f32 s4, s4, s6
-; CHECK-NEXT: vmul.f32 s0, s0, s2
-; CHECK-NEXT: vmul.f32 s2, s4, s9
-; CHECK-NEXT: vmul.f32 s0, s0, s8
; CHECK-NEXT: vmul.f32 s0, s0, s2
+; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: bx lr
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
%a1 = fmul fast float %r1, %c
@@ -301,22 +295,17 @@ define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_mul_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vmov r8, r3, d2
-; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vmov r6, r7, d0
-; CHECK-NEXT: vmov r12, lr, d3
-; CHECK-NEXT: mul r3, r8, r3
-; CHECK-NEXT: muls r5, r4, r5
-; CHECK-NEXT: mul r2, r12, lr
-; CHECK-NEXT: muls r7, r6, r7
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vmul.i32 q0, q0, q1
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: mul r12, r12, lr
; CHECK-NEXT: muls r2, r3, r2
-; CHECK-NEXT: mul r3, r7, r5
-; CHECK-NEXT: muls r1, r2, r1
-; CHECK-NEXT: muls r0, r3, r0
+; CHECK-NEXT: mul r1, r2, r12
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: pop {r7, pc}
%r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
%a1 = mul i32 %r1, %c
%r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
@@ -328,22 +317,17 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_and_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vmov r2, r3, d2
-; CHECK-NEXT: vmov r12, lr, d3
-; CHECK-NEXT: vmov r8, r5, d1
-; CHECK-NEXT: vmov r6, r7, d0
-; CHECK-NEXT: ands r2, r3
-; CHECK-NEXT: and.w r4, r12, lr
-; CHECK-NEXT: ands r2, r4
-; CHECK-NEXT: ands r1, r2
-; CHECK-NEXT: and.w r2, r8, r5
-; CHECK-NEXT: and.w r3, r6, r7
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: and.w r12, r12, lr
; CHECK-NEXT: ands r2, r3
+; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: ands r0, r2
-; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: pop {r7, pc}
%r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
%a1 = and i32 %r1, %c
%r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
@@ -355,22 +339,17 @@ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-LABEL: nested_or_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vmov r2, r3, d2
-; CHECK-NEXT: vmov r12, lr, d3
-; CHECK-NEXT: vmov r8, r5, d1
-; CHECK-NEXT: vmov r6, r7, d0
-; CHECK-NEXT: orrs r2, r3
-; CHECK-NEXT: orr.w r4, r12, lr
-; CHECK-NEXT: orrs r2, r4
-; CHECK-NEXT: orrs r1, r2
-; CHECK-NEXT: orr.w r2, r8, r5
-; CHECK-NEXT: orr.w r3, r6, r7
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: vorr q0, q0, q1
+; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: vmov r12, lr, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: orr.w r12, r12, lr
; CHECK-NEXT: orrs r2, r3
+; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: orrs r0, r2
-; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: pop {r7, pc}
%r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
%a1 = or i32 %r1, %c
%r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
@@ -382,22 +361,17 @@ define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK...
[truncated]
Force-pushed d441d4c to 6229a8a
Force-pushed 6229a8a to 966f6b0
Force-pushed 966f6b0 to c18fc6f
Rebase and ping. Thanks.
The codegen changes look good! Do you have any examples where this type of pattern comes up? I had a few other minor comments.
Force-pushed c18fc6f to 27a6fee
This patch reassociates add(add(vecreduce(a), b), add(vecreduce(c), d)) into add(vecreduce(add(a, c)), add(b, d)), to combine the reductions into a single node. This comes up after unrolling vectorized loops. There is another small change to move the reassociateReduction call inside fadd outside of an AllowNewConst block, as new constants will not be created and it should be OK to perform the combine later, after legalization.
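For illustration, a hypothetical C++ sketch of where this shape can come from (the names reduce4 and nested_reduce are made up here, mirroring the nested_add_i32 test in the diff); with vectorization enabled, each inner horizontal sum may be recognized as a vector reduction, so the return expression takes the add(add(reduce(a), c), add(reduce(b), d)) form that the new combine folds:

#include <cstdint>

// Hypothetical helper: a 4-lane horizontal add. SLP vectorization commonly
// turns this kind of sum tree into a single vector-reduce add node.
static int32_t reduce4(const int32_t v[4]) {
  return v[0] + v[1] + v[2] + v[3];
}

// The return expression has the nested shape
//   add(add(reduce(a), c), add(reduce(b), d))
// which the combine in this patch can rewrite to
//   add(reduce(add(a, b)), add(c, d)),
// leaving only one horizontal reduction.
int32_t nested_reduce(const int32_t a[4], const int32_t b[4],
                      int32_t c, int32_t d) {
  return (reduce4(a) + c) + (reduce4(b) + d);
}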
Force-pushed 27a6fee to 04bfbfd