Skip to content

Commit 2bc8e07

Browse files
committed
[X86][SSE] Blend any v8i16/v4i32 shift with 2 unique shift values
We were only doing this for basic blends, despite shuffle lowering now being good enough to handle more complex blends. This means that the two v8i16 splat shifts are performed in parallel instead of serially as in the general shift case. llvm-svn: 336113
1 parent a6be243 commit 2bc8e07

File tree

2 files changed

+34
-69
lines changed

2 files changed

+34
-69
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 22 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -23441,72 +23441,47 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
2344123441
return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
2344223442

2344323443
// If possible, lower this shift as a sequence of two shifts by
23444-
// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
23444+
// constant plus a BLENDing shuffle instead of scalarizing it.
2344523445
// Example:
2344623446
// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
2344723447
//
2344823448
// Could be rewritten as:
2344923449
// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
2345023450
//
2345123451
// The advantage is that the two shifts from the example would be
23452-
// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
23453-
// the vector shift into four scalar shifts plus four pairs of vector
23454-
// insert/extract.
23452+
// lowered as X86ISD::VSRLI nodes in parallel before blending.
2345523453
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
23456-
bool UseMOVSD = false;
23457-
bool CanBeSimplified;
23458-
// The splat value for the first packed shift (the 'X' from the example).
23459-
SDValue Amt1 = Amt->getOperand(0);
23460-
// The splat value for the second packed shift (the 'Y' from the example).
23461-
SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
23462-
23463-
// See if it is possible to replace this node with a sequence of
23464-
// two shifts followed by a MOVSS/MOVSD/PBLEND.
23465-
if (VT == MVT::v4i32) {
23466-
// Check if it is legal to use a MOVSS.
23467-
CanBeSimplified = Amt2 == Amt->getOperand(2) &&
23468-
Amt2 == Amt->getOperand(3);
23469-
if (!CanBeSimplified) {
23470-
// Otherwise, check if we can still simplify this node using a MOVSD.
23471-
CanBeSimplified = Amt1 == Amt->getOperand(1) &&
23472-
Amt->getOperand(2) == Amt->getOperand(3);
23473-
UseMOVSD = true;
23474-
Amt2 = Amt->getOperand(2);
23454+
SDValue Amt1, Amt2;
23455+
unsigned NumElts = VT.getVectorNumElements();
23456+
SmallVector<int, 8> ShuffleMask;
23457+
for (unsigned i = 0; i != NumElts; ++i) {
23458+
SDValue A = Amt->getOperand(i);
23459+
if (A.isUndef()) {
23460+
ShuffleMask.push_back(SM_SentinelUndef);
23461+
continue;
2347523462
}
23476-
} else {
23477-
// Do similar checks for the case where the machine value type
23478-
// is MVT::v8i16.
23479-
CanBeSimplified = Amt1 == Amt->getOperand(1);
23480-
for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
23481-
CanBeSimplified = Amt2 == Amt->getOperand(i);
23482-
23483-
if (!CanBeSimplified) {
23484-
UseMOVSD = true;
23485-
CanBeSimplified = true;
23486-
Amt2 = Amt->getOperand(4);
23487-
for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
23488-
CanBeSimplified = Amt1 == Amt->getOperand(i);
23489-
for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
23490-
CanBeSimplified = Amt2 == Amt->getOperand(j);
23463+
if (!Amt1 || Amt1 == A) {
23464+
ShuffleMask.push_back(i);
23465+
Amt1 = A;
23466+
continue;
23467+
}
23468+
if (!Amt2 || Amt2 == A) {
23469+
ShuffleMask.push_back(i + NumElts);
23470+
Amt2 = A;
23471+
continue;
2349123472
}
23473+
break;
2349223474
}
2349323475

23494-
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
23476+
if (ShuffleMask.size() == NumElts && isa<ConstantSDNode>(Amt1) &&
2349523477
isa<ConstantSDNode>(Amt2)) {
23496-
// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
2349723478
SDValue Splat1 =
2349823479
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
2349923480
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
2350023481
SDValue Splat2 =
2350123482
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
2350223483
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23503-
SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
23504-
SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
23505-
if (UseMOVSD)
23506-
return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23507-
BitCast2, {0, 1, 6, 7}));
23508-
return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23509-
BitCast2, {0, 5, 6, 7}));
23484+
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
2351023485
}
2351123486
}
2351223487

llvm/test/CodeGen/X86/lower-vec-shift.ll

Lines changed: 12 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -211,31 +211,21 @@ define <4 x i32> @test8(<4 x i32> %a) {
211211
define <8 x i16> @test9(<8 x i16> %a) {
212212
; SSE-LABEL: test9:
213213
; SSE: # %bb.0:
214-
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
215214
; SSE-NEXT: movdqa %xmm0, %xmm1
216-
; SSE-NEXT: pand %xmm2, %xmm1
217-
; SSE-NEXT: psraw $2, %xmm0
218-
; SSE-NEXT: pandn %xmm0, %xmm2
219-
; SSE-NEXT: por %xmm2, %xmm1
220-
; SSE-NEXT: psraw $1, %xmm1
221-
; SSE-NEXT: movdqa %xmm1, %xmm0
215+
; SSE-NEXT: psraw $3, %xmm1
216+
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
217+
; SSE-NEXT: psraw $1, %xmm0
218+
; SSE-NEXT: pand %xmm2, %xmm0
219+
; SSE-NEXT: pandn %xmm1, %xmm2
220+
; SSE-NEXT: por %xmm2, %xmm0
222221
; SSE-NEXT: retq
223222
;
224-
; AVX1-LABEL: test9:
225-
; AVX1: # %bb.0:
226-
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
227-
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
228-
; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0
229-
; AVX1-NEXT: retq
230-
;
231-
; AVX2-LABEL: test9:
232-
; AVX2: # %bb.0:
233-
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
234-
; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
235-
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
236-
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
237-
; AVX2-NEXT: vzeroupper
238-
; AVX2-NEXT: retq
223+
; AVX-LABEL: test9:
224+
; AVX: # %bb.0:
225+
; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
226+
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
227+
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
228+
; AVX-NEXT: retq
239229
%lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
240230
ret <8 x i16> %lshr
241231
}

0 commit comments

Comments
 (0)