Skip to content

Commit ef4330f

Browse files
committed
[X86] truncateVectorWithPACK - handle vector truncations to sub-64-bit vector widths
Extend the existing 128-bit -> 64-bit truncation handling by widening/narrowing the src/dst vectors and using the lower half of the operand/result for the PACKSS/PACKUS instructions.
1 parent 0efdf3b commit ef4330f

File tree

7 files changed

+791
-1210
lines changed

7 files changed

+791
-1210
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3787,7 +3787,7 @@ static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
37873787
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
37883788
const X86Subtarget &Subtarget, SelectionDAG &DAG,
37893789
const SDLoc &dl) {
3790-
assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
3790+
assert(Vec.getValueSizeInBits().getFixedValue() <= VT.getFixedSizeInBits() &&
37913791
Vec.getValueType().getScalarType() == VT.getScalarType() &&
37923792
"Unsupported vector widening type");
37933793
SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
@@ -3801,7 +3801,7 @@ static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
38013801
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
38023802
const X86Subtarget &Subtarget, SelectionDAG &DAG,
38033803
const SDLoc &dl, unsigned WideSizeInBits) {
3804-
assert(Vec.getValueSizeInBits() < WideSizeInBits &&
3804+
assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
38053805
(WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
38063806
"Unsupported vector widening type");
38073807
unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
@@ -19982,22 +19982,18 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
1998219982
if (SrcVT == DstVT)
1998319983
return In;
1998419984

19985-
// We only support vector truncation to 64bits or greater from a
19986-
// 128bits or greater source.
19987-
unsigned DstSizeInBits = DstVT.getSizeInBits();
19988-
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
19989-
if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
19990-
return SDValue();
19991-
1999219985
unsigned NumElems = SrcVT.getVectorNumElements();
1999319986
if (!isPowerOf2_32(NumElems))
1999419987
return SDValue();
1999519988

19996-
LLVMContext &Ctx = *DAG.getContext();
19989+
unsigned DstSizeInBits = DstVT.getSizeInBits();
19990+
unsigned SrcSizeInBits = SrcVT.getSizeInBits();
1999719991
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
1999819992
assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
1999919993

19994+
LLVMContext &Ctx = *DAG.getContext();
2000019995
EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
19996+
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
2000119997

2000219998
// Pack to the largest type possible:
2000319999
// vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
@@ -20008,14 +20004,16 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
2000820004
OutVT = MVT::i16;
2000920005
}
2001020006

20011-
// 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20012-
if (SrcVT.is128BitVector()) {
20007+
// Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20008+
if (SrcSizeInBits <= 128) {
2001320009
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
2001420010
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20011+
In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
2001520012
In = DAG.getBitcast(InVT, In);
2001620013
SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
20017-
Res = extractSubVector(Res, 0, DAG, DL, 64);
20018-
return DAG.getBitcast(DstVT, Res);
20014+
Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20015+
Res = DAG.getBitcast(PackedVT, Res);
20016+
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
2001920017
}
2002020018

2002120019
// Split lower/upper subvectors.
@@ -20061,15 +20059,13 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
2006120059
return DAG.getBitcast(DstVT, Res);
2006220060

2006320061
// If 512bit -> 128bit truncate another stage.
20064-
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
2006520062
Res = DAG.getBitcast(PackedVT, Res);
2006620063
return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
2006720064
}
2006820065

2006920066
// Recursively pack lower/upper subvectors, concat result and pack again.
2007020067
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
2007120068

20072-
EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
2007320069
if (PackedVT.is128BitVector()) {
2007420070
// Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
2007520071
// type legalization.
@@ -50833,6 +50829,10 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
5083350829
if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
5083450830
return SDValue();
5083550831

50832+
// Truncation from sub-128bit to vXi8 can be better handled with PSHUFB.
50833+
if (SVT == MVT::i8 && InVT.getSizeInBits() <= 128 && Subtarget.hasSSSE3())
50834+
return SDValue();
50835+
5083650836
// AVX512 has fast truncate, but if the input is already going to be split,
5083750837
// there's no harm in trying pack.
5083850838
if (Subtarget.hasAVX512() &&

llvm/test/CodeGen/X86/fpclamptosat_vec.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -701,13 +701,13 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) {
701701
; CHECK-NEXT: pand %xmm2, %xmm0
702702
; CHECK-NEXT: pandn %xmm1, %xmm2
703703
; CHECK-NEXT: por %xmm0, %xmm2
704-
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4294934528,4294934528,u,u>
705-
; CHECK-NEXT: movdqa %xmm2, %xmm0
706-
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
707-
; CHECK-NEXT: pand %xmm0, %xmm2
708-
; CHECK-NEXT: pandn %xmm1, %xmm0
709-
; CHECK-NEXT: por %xmm2, %xmm0
710-
; CHECK-NEXT: packssdw %xmm0, %xmm0
704+
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
705+
; CHECK-NEXT: movdqa %xmm2, %xmm1
706+
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
707+
; CHECK-NEXT: pand %xmm1, %xmm2
708+
; CHECK-NEXT: pandn %xmm0, %xmm1
709+
; CHECK-NEXT: por %xmm2, %xmm1
710+
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
711711
; CHECK-NEXT: retq
712712
entry:
713713
%conv = fptosi <2 x double> %x to <2 x i32>
@@ -2265,13 +2265,13 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
22652265
; CHECK-NEXT: pand %xmm2, %xmm0
22662266
; CHECK-NEXT: pandn %xmm1, %xmm2
22672267
; CHECK-NEXT: por %xmm0, %xmm2
2268-
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4294934528,4294934528,u,u>
2269-
; CHECK-NEXT: movdqa %xmm2, %xmm0
2270-
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
2271-
; CHECK-NEXT: pand %xmm0, %xmm2
2272-
; CHECK-NEXT: pandn %xmm1, %xmm0
2273-
; CHECK-NEXT: por %xmm2, %xmm0
2274-
; CHECK-NEXT: packssdw %xmm0, %xmm0
2268+
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
2269+
; CHECK-NEXT: movdqa %xmm2, %xmm1
2270+
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
2271+
; CHECK-NEXT: pand %xmm1, %xmm2
2272+
; CHECK-NEXT: pandn %xmm0, %xmm1
2273+
; CHECK-NEXT: por %xmm2, %xmm1
2274+
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
22752275
; CHECK-NEXT: retq
22762276
entry:
22772277
%conv = fptosi <2 x double> %x to <2 x i32>

0 commit comments

Comments
 (0)