@@ -44364,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
44364
44364
/// A horizontal-op B, for some already available A and B, and if so then LHS is
44365
44365
/// set to A, RHS to B, and the routine returns 'true'.
44366
44366
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44367
- const X86Subtarget &Subtarget,
44368
- bool IsCommutative ) {
44367
+ const X86Subtarget &Subtarget, bool IsCommutative,
44368
+ SmallVectorImpl<int> &PostShuffleMask ) {
44369
44369
// If either operand is undef, bail out. The binop should be simplified.
44370
44370
if (LHS.isUndef() || RHS.isUndef())
44371
44371
return false;
@@ -44458,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44458
44458
RMask.push_back(i);
44459
44459
}
44460
44460
44461
+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
44462
+ if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
44463
+ (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
44464
+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
44465
+ return false;
44466
+
44461
44467
// If A and B occur in reverse order in RHS, then canonicalize by commuting
44462
44468
// RHS operands and shuffle mask.
44463
44469
if (A != C) {
@@ -44468,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44468
44474
if (!(A == C && B == D))
44469
44475
return false;
44470
44476
44477
+ PostShuffleMask.clear();
44478
+ PostShuffleMask.append(NumElts, SM_SentinelUndef);
44479
+
44471
44480
// LHS and RHS are now:
44472
44481
// LHS = shuffle A, B, LMask
44473
44482
// RHS = shuffle A, B, RMask
@@ -44476,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44476
44485
// so we just repeat the inner loop if this is a 256-bit op.
44477
44486
unsigned Num128BitChunks = VT.getSizeInBits() / 128;
44478
44487
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
44488
+ unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
44479
44489
assert((NumEltsPer128BitChunk % 2 == 0) &&
44480
44490
"Vector type should have an even number of elements in each lane");
44481
44491
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44487,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44487
44497
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
44488
44498
continue;
44489
44499
44500
+ // Check that successive odd/even elements are being operated on. If not,
44501
+ // this is not a horizontal operation.
44502
+ if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
44503
+ !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
44504
+ return false;
44505
+
44506
+ // Compute the post-shuffle mask index based on where the element
44507
+ // is stored in the HOP result, and where it needs to be moved to.
44508
+ int Base = LIdx & ~1u;
44509
+ int Index = ((Base % NumEltsPer128BitChunk) / 2) +
44510
+ ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
44511
+
44490
44512
// The low half of the 128-bit result must choose from A.
44491
44513
// The high half of the 128-bit result must choose from B,
44492
44514
// unless B is undef. In that case, we are always choosing from A.
44493
- unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
44494
- unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
44495
-
44496
- // Check that successive elements are being operated on. If not, this is
44497
- // not a horizontal operation.
44498
- int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
44499
- if (!(LIdx == Index && RIdx == Index + 1) &&
44500
- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
44501
- return false;
44515
+ if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
44516
+ Index += NumEltsPer64BitChunk;
44517
+ PostShuffleMask[i + j] = Index;
44502
44518
}
44503
44519
}
44504
44520
44505
44521
LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
44506
44522
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
44507
44523
44508
- if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
44524
+ bool IsIdentityPostShuffle =
44525
+ isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
44526
+ if (IsIdentityPostShuffle)
44527
+ PostShuffleMask.clear();
44528
+
44529
+ // Treat this as a SingleSource HOP if both operands are the same and we
44530
+ // either shuffle fewer than two inputs or must shuffle the result.
44531
+ if (!shouldUseHorizontalOp(LHS == RHS &&
44532
+ (NumShuffles < 2 || !IsIdentityPostShuffle),
44533
+ DAG, Subtarget))
44509
44534
return false;
44510
44535
44511
44536
LHS = DAG.getBitcast(VT, LHS);
@@ -44524,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
44524
44549
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
44525
44550
44526
44551
// Try to synthesize horizontal add/sub from adds/subs of shuffles.
44552
+ SmallVector<int, 8> PostShuffleMask;
44527
44553
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
44528
44554
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
44529
- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
44530
- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
44555
+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
44556
+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
44557
+ if (!PostShuffleMask.empty())
44558
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
44559
+ DAG.getUNDEF(VT), PostShuffleMask);
44560
+ return HorizBinOp;
44561
+ }
44531
44562
44532
44563
// NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
44533
44564
@@ -47620,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
47620
47651
bool IsAdd = N->getOpcode() == ISD::ADD;
47621
47652
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
47622
47653
47654
+ SmallVector<int, 8> PostShuffleMask;
47623
47655
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
47624
47656
VT == MVT::v8i32) &&
47625
47657
Subtarget.hasSSSE3() &&
47626
- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
47658
+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask )) {
47627
47659
auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
47628
47660
ArrayRef<SDValue> Ops) {
47629
- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
47630
- DL, Ops[0].getValueType(), Ops);
47661
+ return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
47662
+ Ops[0].getValueType(), Ops);
47631
47663
};
47632
- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
47633
- HOpBuilder);
47664
+ SDValue HorizBinOp =
47665
+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
47666
+ if (!PostShuffleMask.empty())
47667
+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47668
+ DAG.getUNDEF(VT), PostShuffleMask);
47669
+ return HorizBinOp;
47634
47670
}
47635
47671
47636
47672
return SDValue();
0 commit comments