Skip to content

Commit 5b9bade

Browse files
committed
[X86][SSSE3] Added PSHUFB LUT implementation of BITREVERSE
PSHUFB can speed up BITREVERSE of byte vectors by performing a LUT lookup on the low and high nibbles separately and ORing the results. Wider integer vector types are already BSWAP'd beforehand, so they also make use of this approach. llvm-svn: 272477
1 parent b13961d commit 5b9bade

File tree

2 files changed

+440
-948
lines changed

2 files changed

+440
-948
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
869869
}
870870

871871
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
872+
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
872873
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
873874
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
874875
// ISD::CTLZ v4i32 - scalarization is faster.
@@ -1005,6 +1006,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
10051006
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
10061007
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
10071008
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1009+
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
10081010

10091011
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
10101012
setOperationAction(ISD::CTPOP, VT, Custom);
@@ -20910,7 +20912,7 @@ static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
2091020912
return LowerVectorCTPOP(Op, Subtarget, DAG);
2091120913
}
2091220914

20913-
static SDValue LowerBITREVERSE(SDValue Op, SelectionDAG &DAG) {
20915+
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
2091420916
MVT VT = Op.getSimpleValueType();
2091520917
SDValue In = Op.getOperand(0);
2091620918
SDLoc DL(Op);
@@ -20963,6 +20965,62 @@ static SDValue LowerBITREVERSE(SDValue Op, SelectionDAG &DAG) {
2096320965
return DAG.getBitcast(VT, Res);
2096420966
}
2096520967

20968+
/// Custom-lower an ISD::BITREVERSE node.
///
/// Subtargets with XOP are forwarded to LowerBITREVERSE_XOP. Every other
/// subtarget is asserted to have SSSE3 and the node is asserted to be a byte
/// vector; it is then lowered as two PSHUFB table lookups, one indexed by the
/// low nibble and one by the high nibble of each byte, ORed together.
///
/// \param Op         The BITREVERSE node being lowered.
/// \param Subtarget  Queried for XOP, SSSE3 and 256-bit integer support.
/// \param DAG        The DAG in which the replacement nodes are created.
/// \returns The value that replaces \p Op.
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasXOP())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDLoc DL(Op);
  SDValue Src = Op.getOperand(0);
  MVT VT = Op.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Without 256-bit integer support, split a 256-bit op into two 128-bit
  // BITREVERSE nodes and glue the halves back together.
  const bool SplitInto128 = VT.is256BitVector() && !Subtarget.hasInt256();
  if (SplitInto128) {
    unsigned HalfElts = NumElts / 2;
    MVT HalfVT = MVT::getVectorVT(MVT::i8, HalfElts);
    SDValue LoHalf = extract128BitVector(Src, 0, DAG, DL);
    SDValue HiHalf = extract128BitVector(Src, HalfElts, DAG, DL);
    SDValue LoHalfRev = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, LoHalf);
    SDValue HiHalfRev = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, HiHalf);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoHalfRev, HiHalfRev);
  }

  // Split every byte into its two nibbles; each nibble (0-15) is used as a
  // PSHUFB index into a table holding its bit-reversed value already placed
  // in the opposite nibble position.
  SDValue LowNibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue LoIdx = DAG.getNode(ISD::AND, DL, VT, Src, LowNibbleMask);
  SDValue ShiftAmt = DAG.getConstant(4, DL, VT);
  SDValue HiIdx = DAG.getNode(ISD::SRL, DL, VT, Src, ShiftAmt);

  // Indexed by the low nibble: reversed bits land in the high nibble.
  const int LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                         0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  // Indexed by the high nibble: reversed bits land in the low nibble.
  const int HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                         0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};

  // Materialise both tables as constant vectors of NumElts bytes; the 16
  // entries are repeated when the vector is wider than 16 elements.
  SmallVector<SDValue, 16> LoTableElts, HiTableElts;
  for (unsigned Elt = 0; Elt != NumElts; ++Elt) {
    unsigned TableIdx = Elt % 16;
    LoTableElts.push_back(DAG.getConstant(LoLUT[TableIdx], DL, MVT::i8));
    HiTableElts.push_back(DAG.getConstant(HiLUT[TableIdx], DL, MVT::i8));
  }

  SDValue LoTable = DAG.getBuildVector(VT, DL, LoTableElts);
  SDValue HiTable = DAG.getBuildVector(VT, DL, HiTableElts);
  SDValue LoRev = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoTable, LoIdx);
  SDValue HiRev = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiTable, HiIdx);
  return DAG.getNode(ISD::OR, DL, VT, LoRev, HiRev);
}
21023+
2096621024
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
2096721025
unsigned NewOpc = 0;
2096821026
switch (N->getOpcode()) {
@@ -21462,7 +21520,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2146221520
case ISD::ATOMIC_LOAD_XOR:
2146321521
case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
2146421522
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
21465-
case ISD::BITREVERSE: return LowerBITREVERSE(Op, DAG);
21523+
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
2146621524
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
2146721525
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
2146821526
case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);

0 commit comments

Comments
 (0)