
Commit d687594

RKSimon authored and zmodem committed
[X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))
An initial backend patch towards fixing the various poor HADD combines (PR34724, PR41813, PR45747 etc.).

This extends isHorizontalBinOp to check if we have per-element horizontal ops (odd+even element pairs), but not in the expected serial order - in which case we build a "post shuffle mask" that we can apply to the HOP result, assuming we have fast-hops/optsize etc.

The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as suggested on PR41813 - accepting more post-shuffle masks even on slow-hop targets if we can fold it into another shuffle.

Differential Revision: https://reviews.llvm.org/D83789

(cherry picked from commit 1821117)
1 parent 592454c
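To make the new match concrete, here is a minimal standalone sketch of the equivalence being exploited, written in plain C++ rather than LLVM APIs (the vectors, masks, and helper names are invented for illustration):

    // Sketch: an element-wise ADD of two shuffles that pair up odd/even
    // elements of X and Y is a horizontal add whose lanes merely need a
    // "post shuffle mask" to land in the requested order.
    #include <array>
    #include <cassert>

    using V4 = std::array<float, 4>;

    // HADDPS semantics: adjacent pairs, low half from X, high half from Y.
    static V4 hadd(const V4 &X, const V4 &Y) {
      return {X[0] + X[1], X[2] + X[3], Y[0] + Y[1], Y[2] + Y[3]};
    }

    static V4 shuffle(const V4 &Src, const std::array<int, 4> &Mask) {
      V4 R{};
      for (int i = 0; i != 4; ++i)
        R[i] = Src[Mask[i]];
      return R;
    }

    int main() {
      V4 X{1, 2, 3, 4}, Y{5, 6, 7, 8};
      // LHS/RHS pair up odd/even elements, but not in the serial order
      // the old matcher insisted on:
      V4 LHS = {X[2], X[0], Y[2], Y[0]}; // shuffle X,Y, <2,0,6,4>
      V4 RHS = {X[3], X[1], Y[3], Y[1]}; // shuffle X,Y, <3,1,7,5>
      V4 Add;
      for (int i = 0; i != 4; ++i)
        Add[i] = LHS[i] + RHS[i];
      // Identical to HADD(X,Y) followed by post-shuffle mask <1,0,3,2>.
      assert(Add == shuffle(hadd(X, Y), {1, 0, 3, 2}));
      return 0;
    }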

7 files changed: 321 additions, 409 deletions


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 55 additions & 19 deletions
@@ -44364,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
 /// A horizontal-op B, for some already available A and B, and if so then LHS is
 /// set to A, RHS to B, and the routine returns 'true'.
 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              bool IsCommutative) {
+                              const X86Subtarget &Subtarget, bool IsCommutative,
+                              SmallVectorImpl<int> &PostShuffleMask) {
   // If either operand is undef, bail out. The binop should be simplified.
   if (LHS.isUndef() || RHS.isUndef())
     return false;
@@ -44458,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
       RMask.push_back(i);
   }
 
+  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+  if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+      (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+    return false;
+
   // If A and B occur in reverse order in RHS, then canonicalize by commuting
   // RHS operands and shuffle mask.
   if (A != C) {
@@ -44468,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   if (!(A == C && B == D))
     return false;
 
+  PostShuffleMask.clear();
+  PostShuffleMask.append(NumElts, SM_SentinelUndef);
+
   // LHS and RHS are now:
   //   LHS = shuffle A, B, LMask
   //   RHS = shuffle A, B, RMask
@@ -44476,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   // so we just repeat the inner loop if this is a 256-bit op.
   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
   assert((NumEltsPer128BitChunk % 2 == 0) &&
          "Vector type should have an even number of elements in each lane");
   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44487,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
         continue;
 
+      // Check that successive odd/even elements are being operated on. If not,
+      // this is not a horizontal operation.
+      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+        return false;
+
+      // Compute the post-shuffle mask index based on where the element
+      // is stored in the HOP result, and where it needs to be moved to.
+      int Base = LIdx & ~1u;
+      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
+
       // The low half of the 128-bit result must choose from A.
       // The high half of the 128-bit result must choose from B,
       // unless B is undef. In that case, we are always choosing from A.
-      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
-
-      // Check that successive elements are being operated on. If not, this is
-      // not a horizontal operation.
-      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
-      if (!(LIdx == Index && RIdx == Index + 1) &&
-          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
-        return false;
+      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+        Index += NumEltsPer64BitChunk;
+      PostShuffleMask[i + j] = Index;
     }
   }
 
   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
 
-  if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+  bool IsIdentityPostShuffle =
+      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
+  if (IsIdentityPostShuffle)
+    PostShuffleMask.clear();
+
+  // Assume a SingleSource HOP if we only shuffle one input and don't need to
+  // shuffle the result.
+  if (!shouldUseHorizontalOp(LHS == RHS &&
+                                 (NumShuffles < 2 || !IsIdentityPostShuffle),
+                             DAG, Subtarget))
     return false;
 
   LHS = DAG.getBitcast(VT, LHS);
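As a worked check of the Base/Index arithmetic above (a standalone sketch, not patch code; it assumes a two-source v8f32 HOP, whose lanes are {A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7}):

    #include <cstdio>

    int main() {
      const int NumElts = 8;               // v8f32
      const int NumEltsPer128BitChunk = 4;
      const int NumEltsPer64BitChunk = 2;
      // Each pair (LIdx, LIdx+1) is drawn from A (indices 0..7) or B (8..15).
      for (int LIdx : {0, 2, 6, 10}) {
        int Base = LIdx & ~1u;
        int Index = ((Base % NumEltsPer128BitChunk) / 2) +
                    ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
        if (Base >= NumElts) // pair comes from B: high 64 bits of its lane
          Index += NumEltsPer64BitChunk;
        std::printf("pair at %2d -> HOP lane %d\n", LIdx, Index);
      }
      // Prints lanes 0, 1, 5 and 3, matching the HADD lane layout above.
      return 0;
    }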
@@ -44524,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
 
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+  SmallVector<int, 8> PostShuffleMask;
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
-    return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+    SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+    if (!PostShuffleMask.empty())
+      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+                                        DAG.getUNDEF(VT), PostShuffleMask);
+    return HorizBinOp;
+  }
 
   // NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
 
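The same two-step shape (build the HOP, then apply PostShuffleMask as a single shuffle against UNDEF) is repeated in the integer combine below. Keeping the reordering as a separate shuffle node, rather than baking it into the HOP, leaves room for the follow-up the commit message mentions: later shuffle combines can fold that post-shuffle into a neighbouring shuffle.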
@@ -47620,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
   bool IsAdd = N->getOpcode() == ISD::ADD;
   assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
 
+  SmallVector<int, 8> PostShuffleMask;
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
       Subtarget.hasSSSE3() &&
-      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
+      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
     auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
                               ArrayRef<SDValue> Ops) {
-      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
-                         DL, Ops[0].getValueType(), Ops);
+      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
+                         Ops[0].getValueType(), Ops);
     };
-    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
-                            HOpBuilder);
+    SDValue HorizBinOp =
+        SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
+    if (!PostShuffleMask.empty())
+      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+                                        DAG.getUNDEF(VT), PostShuffleMask);
+    return HorizBinOp;
   }
 
   return SDValue();
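Note that the integer path builds the HOP through SplitOpsAndApply, so on targets where the 256-bit op has to be split into 128-bit halves the post-shuffle mask is still applied just once, to the recombined result. That is also why the new lane-crossing bail-out above only applies to floating-point types ("integer will split").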

llvm/test/CodeGen/X86/haddsub-3.ll

Lines changed: 39 additions & 15 deletions
@@ -17,22 +17,46 @@ define float @pr26491(<4 x float> %a0) {
 ; SSE2-NEXT:    addss %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; SSSE3-LABEL: pr26491:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    addps %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSSE3-NEXT:    addss %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: pr26491:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: pr26491:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: pr26491:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSSE3-FAST-NEXT:    addss %xmm0, %xmm1
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: pr26491:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: pr26491:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-LABEL: pr26491:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
   %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
   %2 = fadd <4 x float> %1, %a0
   %3 = extractelement <4 x float> %2, i32 2
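The SSSE3-FAST and AVX1-FAST checks show the new codegen: a single haddps/vhaddps plus a post-shuffle. The key identity is that lane 2 of %2 above equals lane 1 of haddps(%a0, %a0); a tiny standalone check of that in plain C++ (illustrative values, not LLVM code):

    #include <array>
    #include <cassert>

    int main() {
      std::array<float, 4> a{1.0f, 2.0f, 4.0f, 8.0f};
      // %1 = shufflevector %a0, undef, <1,1,3,3>;  %2 = fadd %1, %a0
      std::array<float, 4> add{a[1] + a[0], a[1] + a[1],
                               a[3] + a[2], a[3] + a[3]};
      // haddps %xmm0, %xmm0 -> {a0+a1, a2+a3, a0+a1, a2+a3}
      std::array<float, 4> hadd{a[0] + a[1], a[2] + a[3],
                                a[0] + a[1], a[2] + a[3]};
      // %3 = extractelement %2, i32 2 is exactly HADD lane 1, which is
      // why the fast-hop combine can emit haddps plus a post-shuffle here.
      assert(add[2] == hadd[1]);
      return 0;
    }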
