@@ -44371,8 +44371,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
4437144371/// A horizontal-op B, for some already available A and B, and if so then LHS is
4437244372/// set to A, RHS to B, and the routine returns 'true'.
4437344373static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44374- const X86Subtarget &Subtarget,
44375- bool IsCommutative ) {
44374+ const X86Subtarget &Subtarget, bool IsCommutative,
44375+ SmallVectorImpl<int> &PostShuffleMask ) {
4437644376 // If either operand is undef, bail out. The binop should be simplified.
4437744377 if (LHS.isUndef() || RHS.isUndef())
4437844378 return false;
@@ -44465,6 +44465,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4446544465 RMask.push_back(i);
4446644466 }
4446744467
44468+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
44469+ if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
44470+ (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
44471+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
44472+ return false;
44473+
4446844474 // If A and B occur in reverse order in RHS, then canonicalize by commuting
4446944475 // RHS operands and shuffle mask.
4447044476 if (A != C) {
@@ -44475,6 +44481,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4447544481 if (!(A == C && B == D))
4447644482 return false;
4447744483
44484+ PostShuffleMask.clear();
44485+ PostShuffleMask.append(NumElts, SM_SentinelUndef);
44486+
4447844487 // LHS and RHS are now:
4447944488 // LHS = shuffle A, B, LMask
4448044489 // RHS = shuffle A, B, RMask
@@ -44483,6 +44492,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4448344492 // so we just repeat the inner loop if this is a 256-bit op.
4448444493 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
4448544494 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
44495+ unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
4448644496 assert((NumEltsPer128BitChunk % 2 == 0) &&
4448744497 "Vector type should have an even number of elements in each lane");
4448844498 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44494,25 +44504,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4449444504 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
4449544505 continue;
4449644506
44507+ // Check that successive odd/even elements are being operated on. If not,
44508+ // this is not a horizontal operation.
44509+ if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
44510+ !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
44511+ return false;
44512+
44513+ // Compute the post-shuffle mask index based on where the element
44514+ // is stored in the HOP result, and where it needs to be moved to.
44515+ int Base = LIdx & ~1u;
44516+ int Index = ((Base % NumEltsPer128BitChunk) / 2) +
44517+ ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
44518+
4449744519 // The low half of the 128-bit result must choose from A.
4449844520 // The high half of the 128-bit result must choose from B,
4449944521 // unless B is undef. In that case, we are always choosing from A.
44500- unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
44501- unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
44502-
44503- // Check that successive elements are being operated on. If not, this is
44504- // not a horizontal operation.
44505- int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
44506- if (!(LIdx == Index && RIdx == Index + 1) &&
44507- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
44508- return false;
44522+ if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
44523+ Index += NumEltsPer64BitChunk;
44524+ PostShuffleMask[i + j] = Index;
4450944525 }
4451044526 }
4451144527
4451244528 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
4451344529 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
4451444530
44515- if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
44531+ bool IsIdentityPostShuffle =
44532+ isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
44533+ if (IsIdentityPostShuffle)
44534+ PostShuffleMask.clear();
44535+
44536+ // Assume a SingleSource HOP if we only shuffle one input and don't need to
44537+ // shuffle the result.
44538+ if (!shouldUseHorizontalOp(LHS == RHS &&
44539+ (NumShuffles < 2 || !IsIdentityPostShuffle),
44540+ DAG, Subtarget))
4451644541 return false;
4451744542
4451844543 LHS = DAG.getBitcast(VT, LHS);
@@ -44531,10 +44556,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
4453144556 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
4453244557
4453344558 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
44559+ SmallVector<int, 8> PostShuffleMask;
4453444560 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
4453544561 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
44536- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
44537- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
44562+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
44563+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
44564+ if (!PostShuffleMask.empty())
44565+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
44566+ DAG.getUNDEF(VT), PostShuffleMask);
44567+ return HorizBinOp;
44568+ }
4453844569
4453944570 // NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
4454044571
@@ -47636,17 +47667,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
4763647667 bool IsAdd = N->getOpcode() == ISD::ADD;
4763747668 assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
4763847669
47670+ SmallVector<int, 8> PostShuffleMask;
4763947671 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
4764047672 VT == MVT::v8i32) &&
4764147673 Subtarget.hasSSSE3() &&
47642- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
47674+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask )) {
4764347675 auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
4764447676 ArrayRef<SDValue> Ops) {
47645- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
47646- DL, Ops[0].getValueType(), Ops);
47677+ return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
47678+ Ops[0].getValueType(), Ops);
4764747679 };
47648- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
47649- HOpBuilder);
47680+ SDValue HorizBinOp =
47681+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
47682+ if (!PostShuffleMask.empty())
47683+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47684+ DAG.getUNDEF(VT), PostShuffleMask);
47685+ return HorizBinOp;
4765047686 }
4765147687
4765247688 return SDValue();
0 commit comments