Commit 1821117
[X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))
An initial backend patch towards fixing the various poor HADD combines (PR34724, PR41813, PR45747 etc.).

This extends isHorizontalBinOp to check if we have per-element horizontal ops (odd+even element pairs), but not in the expected serial order - in which case we build a "post shuffle mask" that we can apply to the HOP result, assuming we have fast-hops/optsize etc.

The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as suggested on PR41813 - accepting more post-shuffle masks even on slow-hop targets if we can fold it into another shuffle.

Differential Revision: https://reviews.llvm.org/D83789
1 parent 3218c06 commit 1821117
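To illustrate what the new post-shuffle mask buys, here is a minimal standalone sketch (plain C++, not LLVM code) of the combine on the pr26491 pattern from the haddsub-3.ll diff below, assuming demanded-elements simplification has already narrowed the two shuffle masks to lanes 0 and 2. The odd/even pairs are all present, but swapped rather than in serial order, so instead of rejecting the match we record where each HADD lane has to move:

#include <cassert>
#include <cstdio>

int main() {
  const int U = -1; // undef lane (SM_SentinelUndef in LLVM)
  float A[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  // Assumed, simplified form of the pr26491 fadd operands:
  // result[i] = A[LMask[i]] + A[RMask[i]] on the demanded lanes 0 and 2.
  int LMask[4] = {1, U, 3, U};
  int RMask[4] = {0, U, 2, U};

  // haddps A,A: serial pairs (0,1) and (2,3), both halves read from A.
  float Hop[4] = {A[0] + A[1], A[2] + A[3], A[0] + A[1], A[2] + A[3]};

  int PostShuffleMask[4] = {U, U, U, U};
  for (int i = 0; i < 4; ++i) {
    int LIdx = LMask[i], RIdx = RMask[i];
    if (LIdx < 0 || RIdx < 0)
      continue; // undef lane: nothing to match
    // The pair must be (even, even+1), in either order since fadd commutes.
    assert((((RIdx & 1) == 1 && LIdx + 1 == RIdx) ||
            ((LIdx & 1) == 1 && RIdx + 1 == LIdx)) &&
           "not a horizontal op");
    int Base = LIdx & ~1;       // even element of the pair
    int Index = (Base % 4) / 2; // where HADD stores that pair (one 128-bit
                                // chunk here, so the second term is zero)
    if (i >= 2)                 // single source: high half repeats A's pairs
      Index += 2;
    PostShuffleMask[i] = Index; // ends up as {0, u, 3, u}
  }

  // The combine is sound: both sides agree on every demanded lane.
  for (int i = 0; i < 4; ++i)
    if (LMask[i] >= 0)
      assert(A[LMask[i]] + A[RMask[i]] == Hop[PostShuffleMask[i]]);
  printf("PostShuffleMask = {%d, u, %d, u}\n", PostShuffleMask[0],
         PostShuffleMask[2]);
}

The old check required each pair to sit at its serial position, which fails at lane 2 here; with the mask the match succeeds and lowers to haddps plus one cheap shuffle, as the SSSE3-FAST output in the test diff below shows.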

File tree: 7 files changed, +321 -409 lines
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 55 additions & 19 deletions
@@ -44371,8 +44371,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
 /// A horizontal-op B, for some already available A and B, and if so then LHS is
 /// set to A, RHS to B, and the routine returns 'true'.
 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              bool IsCommutative) {
+                              const X86Subtarget &Subtarget, bool IsCommutative,
+                              SmallVectorImpl<int> &PostShuffleMask) {
   // If either operand is undef, bail out. The binop should be simplified.
   if (LHS.isUndef() || RHS.isUndef())
     return false;
@@ -44465,6 +44465,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
       RMask.push_back(i);
   }
 
+  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+  if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+      (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+    return false;
+
   // If A and B occur in reverse order in RHS, then canonicalize by commuting
   // RHS operands and shuffle mask.
   if (A != C) {
@@ -44475,6 +44481,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   if (!(A == C && B == D))
     return false;
 
+  PostShuffleMask.clear();
+  PostShuffleMask.append(NumElts, SM_SentinelUndef);
+
   // LHS and RHS are now:
   //   LHS = shuffle A, B, LMask
   //   RHS = shuffle A, B, RMask
@@ -44483,6 +44492,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   // so we just repeat the inner loop if this is a 256-bit op.
   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
   assert((NumEltsPer128BitChunk % 2 == 0) &&
          "Vector type should have an even number of elements in each lane");
   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44494,25 +44504,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
         continue;
 
+      // Check that successive odd/even elements are being operated on. If not,
+      // this is not a horizontal operation.
+      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+        return false;
+
+      // Compute the post-shuffle mask index based on where the element
+      // is stored in the HOP result, and where it needs to be moved to.
+      int Base = LIdx & ~1u;
+      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
+
       // The low half of the 128-bit result must choose from A.
       // The high half of the 128-bit result must choose from B,
       // unless B is undef. In that case, we are always choosing from A.
-      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
-
-      // Check that successive elements are being operated on. If not, this is
-      // not a horizontal operation.
-      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
-      if (!(LIdx == Index && RIdx == Index + 1) &&
-          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
-        return false;
+      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+        Index += NumEltsPer64BitChunk;
+      PostShuffleMask[i + j] = Index;
     }
   }
 
   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
 
-  if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+  bool IsIdentityPostShuffle =
+      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
+  if (IsIdentityPostShuffle)
+    PostShuffleMask.clear();
+
+  // Assume a SingleSource HOP if we only shuffle one input and don't need to
+  // shuffle the result.
+  if (!shouldUseHorizontalOp(LHS == RHS &&
+                                 (NumShuffles < 2 || !IsIdentityPostShuffle),
+                             DAG, Subtarget))
     return false;
 
   LHS = DAG.getBitcast(VT, LHS);
@@ -44531,10 +44556,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
 
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+  SmallVector<int, 8> PostShuffleMask;
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
-    return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+    SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+    if (!PostShuffleMask.empty())
+      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+                                        DAG.getUNDEF(VT), PostShuffleMask);
+    return HorizBinOp;
+  }
 
   // NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
 
@@ -47636,17 +47667,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
   bool IsAdd = N->getOpcode() == ISD::ADD;
   assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
 
+  SmallVector<int, 8> PostShuffleMask;
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
       VT == MVT::v8i32) &&
      Subtarget.hasSSSE3() &&
-      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
+      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
     auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
                               ArrayRef<SDValue> Ops) {
-      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
-                         DL, Ops[0].getValueType(), Ops);
+      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
+                         Ops[0].getValueType(), Ops);
     };
-    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
-                            HOpBuilder);
+    SDValue HorizBinOp =
+        SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
+    if (!PostShuffleMask.empty())
+      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+                                        DAG.getUNDEF(VT), PostShuffleMask);
+    return HorizBinOp;
+  }
 
   return SDValue();
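For the 256-bit types the HOPs work per 128-bit lane, which is what the two-term Index expression above accounts for. A small sketch (plain C++ again; the v8f32 constants are hard-coded here as an assumption) evaluates the new formula for every even Base and reproduces vhaddps's per-lane pair layout:

#include <cstdio>

int main() {
  // v8f32: eight elements, two 128-bit lanes, two elements per 64-bit half.
  const int NumElts = 8, NumEltsPer128BitChunk = 4, NumEltsPer64BitChunk = 2;
  // Base < NumElts: pair taken from A; Base >= NumElts: pair taken from B.
  for (int Base = 0; Base < 2 * NumElts; Base += 2) {
    int Index = ((Base % NumEltsPer128BitChunk) / 2) +
                ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
    if (Base >= NumElts)
      Index += NumEltsPer64BitChunk; // B's pairs fill the high half per lane
    printf("%c pair (%d,%d) -> hop index %d\n", Base < NumElts ? 'A' : 'B',
           Base % NumElts, Base % NumElts + 1, Index);
  }
  // A's pairs land at indices 0,1,4,5 and B's at 2,3,6,7, matching the
  // vhaddps layout: dst = {A01,A23,B01,B23 | A45,A67,B45,B67}.
}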

llvm/test/CodeGen/X86/haddsub-3.ll

Lines changed: 39 additions & 15 deletions
@@ -17,22 +17,46 @@ define float @pr26491(<4 x float> %a0) {
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
-; SSSE3-LABEL: pr26491:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: addps %xmm0, %xmm1
-; SSSE3-NEXT: movaps %xmm1, %xmm0
-; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSSE3-NEXT: addss %xmm1, %xmm0
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: pr26491:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
 ;
-; AVX-LABEL: pr26491:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSSE3-FAST-LABEL: pr26491:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSSE3-FAST-NEXT: addss %xmm0, %xmm1
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: pr26491:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: pr26491:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: pr26491:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
   %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
   %2 = fadd <4 x float> %1, %a0
   %3 = extractelement <4 x float> %2, i32 2
