Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 2e1ecd0

Browse files
committed
Implement AVX2 Gather intrinsic in JIT
1 parent 8c00bc5 commit 2e1ecd0

13 files changed

+429
-51
lines changed

src/jit/emitfmtsxarch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [
188188

189189
IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr]
190190
IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR, AMD_CNS) // write reg , read [adr], const
191+
IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD) // write reg , read [adr], read reg2
191192
IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write reg , read reg2, read [adr], const
192193
IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write reg , read reg2, read [adr], read reg3
193194

src/jit/emitxarch.cpp

Lines changed: 95 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,10 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
589589
case INS_vfnmsub213sd:
590590
case INS_vfnmsub231sd:
591591
case INS_vpmaskmovq:
592+
case INS_vpgatherdq:
593+
case INS_vpgatherqq:
594+
case INS_vgatherdpd:
595+
case INS_vgatherqpd:
592596
return true;
593597
default:
594598
break;
@@ -3159,8 +3163,8 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
31593163
if (dst->isContained() || (dst->isLclField() && (dst->gtRegNum == REG_NA)) || dst->isUsedFromSpillTemp())
31603164
{
31613165
// dst can only be a modrm
3162-
assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) ||
3163-
instrIs3opImul(ins)); // dst on 3opImul isn't really the dst
3166+
// dst on 3opImul isn't really the dst
3167+
assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || instrIs3opImul(ins));
31643168
assert(!src->isUsedFromMemory());
31653169

31663170
memOp = dst;
@@ -4365,6 +4369,74 @@ void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, reg
43654369
emitCurIGsize += sz;
43664370
}
43674371

4372+
//------------------------------------------------------------------------
4373+
// IsAVX2GatherInstruction: return true if the instruction is AVX2 Gather
4374+
//
4375+
// Arguments:
4376+
// ins - the instruction to check
4377+
// Return Value:
4378+
// true if the instruction is AVX2 Gather
4379+
//
4380+
bool IsAVX2GatherInstruction(instruction ins)
4381+
{
4382+
switch (ins)
4383+
{
4384+
case INS_vpgatherdd:
4385+
case INS_vpgatherdq:
4386+
case INS_vpgatherqd:
4387+
case INS_vpgatherqq:
4388+
case INS_vgatherdps:
4389+
case INS_vgatherdpd:
4390+
case INS_vgatherqps:
4391+
case INS_vgatherqpd:
4392+
return true;
4393+
default:
4394+
return false;
4395+
}
4396+
}
4397+
4398+
//------------------------------------------------------------------------
4399+
// emitIns_R_AR_R: Emits an AVX2 Gather instructions
4400+
//
4401+
// Arguments:
4402+
// ins - the instruction to emit
4403+
// attr - the instruction operand size
4404+
// reg1 - the destination and first source operand
4405+
// reg2 - the mask operand (encoded in VEX.vvvv)
4406+
// base - the base register of address to load
4407+
// index - the index register of VSIB
4408+
// scale - the scale number of VSIB
4409+
// offs - the offset added to the memory address from base
4410+
//
4411+
void emitter::emitIns_R_AR_R(instruction ins,
4412+
emitAttr attr,
4413+
regNumber reg1,
4414+
regNumber reg2,
4415+
regNumber base,
4416+
regNumber index,
4417+
int scale,
4418+
int offs)
4419+
{
4420+
assert(IsAVX2GatherInstruction(ins));
4421+
4422+
instrDesc* id = emitNewInstrAmd(attr, offs);
4423+
4424+
id->idIns(ins);
4425+
id->idReg1(reg1);
4426+
id->idReg2(reg2);
4427+
4428+
id->idInsFmt(IF_RWR_ARD_RRD);
4429+
id->idAddr()->iiaAddrMode.amBaseReg = base;
4430+
id->idAddr()->iiaAddrMode.amIndxReg = index;
4431+
id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale);
4432+
4433+
UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
4434+
id->idCodeSize(sz);
4435+
4436+
dispIns(id);
4437+
emitCurIGsize += sz;
4438+
}
4439+
43684440
void emitter::emitIns_R_R_C(
43694441
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs)
43704442
{
@@ -8583,6 +8655,17 @@ void emitter::emitDispIns(
85838655
emitDispAddrMode(id);
85848656
break;
85858657

8658+
case IF_RWR_ARD_RRD:
8659+
if (ins == INS_vpgatherqd || ins == INS_vgatherqps)
8660+
{
8661+
attr = EA_16BYTE;
8662+
}
8663+
sstr = codeGen->genSizeStr(EA_ATTR(4));
8664+
printf("%s, %s", emitRegName(id->idReg1(), attr), sstr);
8665+
emitDispAddrMode(id);
8666+
printf(", %s", emitRegName(id->idReg2(), attr));
8667+
break;
8668+
85868669
case IF_RWR_RRD_ARD_CNS:
85878670
{
85888671
printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr);
@@ -9482,7 +9565,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
94829565
// encode source operand reg in 'vvvv' bits in 1's complement form
94839566
code = insEncodeReg3456(ins, src1, size, code);
94849567
}
9485-
else if (IsDstSrcSrcAVXInstruction(ins))
9568+
else if (IsDstSrcSrcAVXInstruction(ins) || id->idInsFmt() == IF_RWR_ARD_RRD)
94869569
{
94879570
code = insEncodeReg3456(ins, id->idReg2(), size, code);
94889571
}
@@ -13126,6 +13209,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
1312613209
break;
1312713210
}
1312813211

13212+
case IF_RWR_ARD_RRD:
13213+
{
13214+
assert(IsAVX2GatherInstruction(ins));
13215+
code = insCodeRM(ins);
13216+
dst = emitOutputAM(dst, id, code);
13217+
sz = emitSizeOfInsDsc(id);
13218+
break;
13219+
}
13220+
1312913221
case IF_RWR_RRD_ARD_CNS:
1313013222
case IF_RWR_RRD_ARD_RRD:
1313113223
{

src/jit/emitxarch.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,15 @@ void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg
335335

336336
void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs);
337337

338+
void emitIns_R_AR_R(instruction ins,
339+
emitAttr attr,
340+
regNumber reg1,
341+
regNumber reg2,
342+
regNumber base,
343+
regNumber index,
344+
int scale,
345+
int offs);
346+
338347
void emitIns_R_R_C(
339348
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs);
340349

src/jit/gentree.cpp

Lines changed: 9 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -17522,13 +17522,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad()
1752217522
{
1752317523
// Some AVX instructions here also have MemoryLoad semantics
1752417524

17525-
// Do we have 3 operands?
17526-
if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
17525+
// Do we have less than 3 operands?
17526+
if (HWIntrinsicInfo::lookupNumArgs(this) < 3)
1752717527
{
1752817528
return false;
1752917529
}
17530-
else // We have 3 operands/args
17530+
else // We have 3 or more operands/args
1753117531
{
17532+
if (HWIntrinsicInfo::isAVX2GatherIntrinsic(gtHWIntrinsicId))
17533+
{
17534+
return true;
17535+
}
17536+
1753217537
GenTreeArgList* argList = gtOp.gtOp1->AsArgList();
1753317538

1753417539
if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
@@ -17579,38 +17584,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStore()
1757917584
bool GenTreeHWIntrinsic::OperIsMemoryLoadOrStore()
1758017585
{
1758117586
#ifdef _TARGET_XARCH_
17582-
// Some xarch instructions have MemoryLoad sematics
17583-
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(gtHWIntrinsicId);
17584-
if ((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore))
17585-
{
17586-
return true;
17587-
}
17588-
else if (category == HW_Category_IMM)
17589-
{
17590-
// Some AVX instructions here also have MemoryLoad or MemoryStore sematics
17591-
17592-
// Do we have 3 operands?
17593-
if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
17594-
{
17595-
return false;
17596-
}
17597-
else // We have 3 operands/args
17598-
{
17599-
GenTreeArgList* argList = gtOp.gtOp1->AsArgList();
17600-
17601-
if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
17602-
(argList->Rest()->Current()->TypeGet() == TYP_I_IMPL)) // Is the type of the second arg TYP_I_IMPL?
17603-
{
17604-
// This is Avx/Avx2.InsertVector128
17605-
return true;
17606-
}
17607-
else if ((gtHWIntrinsicId == NI_AVX_ExtractVector128 || gtHWIntrinsicId == NI_AVX2_ExtractVector128))
17608-
{
17609-
// This is Avx/Avx2.ExtractVector128
17610-
return true;
17611-
}
17612-
}
17613-
}
17587+
return OperIsMemoryLoad() || OperIsMemoryStore();
1761417588
#endif // _TARGET_XARCH_
1761517589
return false;
1761617590
}

src/jit/gentree.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -479,8 +479,8 @@ struct GenTree
479479
// happening.
480480
void CopyCosts(const GenTree* const tree)
481481
{
482-
INDEBUG(gtCostsInitialized =
483-
tree->gtCostsInitialized;) // If the 'tree' costs aren't initialized, we'll hit an assert below.
482+
// If the 'tree' costs aren't initialized, we'll hit an assert below.
483+
INDEBUG(gtCostsInitialized = tree->gtCostsInitialized;)
484484
_gtCostEx = tree->gtCostEx;
485485
_gtCostSz = tree->gtCostSz;
486486
}
@@ -4112,6 +4112,7 @@ struct GenTreeSIMD : public GenTreeJitIntrinsic
41124112
struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
41134113
{
41144114
NamedIntrinsic gtHWIntrinsicId;
4115+
var_types gtIndexBaseType; // for AVX2 Gather* intrinsics
41154116

41164117
GenTreeHWIntrinsic(var_types type, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
41174118
: GenTreeJitIntrinsic(GT_HWIntrinsic, type, nullptr, nullptr, baseType, size), gtHWIntrinsicId(hwIntrinsicID)

src/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,6 +1184,9 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi
11841184
HWIntrinsicSwitchCaseBody emitSwCase)
11851185
{
11861186
assert(nonConstImmReg != REG_NA);
1187+
// AVX2 Gather intrinsics use the managed non-const fallback since they have a discrete imm8 value range
1188+
// that does not work with the current compiler generated jump-table fallback
1189+
assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
11871190
emitter* emit = getEmitter();
11881191

11891192
const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
@@ -2008,6 +2011,117 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
20082011
break;
20092012
}
20102013

2014+
case NI_AVX2_GatherVector128:
2015+
case NI_AVX2_GatherVector256:
2016+
case NI_AVX2_GatherMaskVector128:
2017+
case NI_AVX2_GatherMaskVector256:
2018+
{
2019+
GenTreeArgList* list = op1->AsArgList();
2020+
op1 = list->Current();
2021+
op1Reg = op1->gtRegNum;
2022+
genConsumeRegs(op1);
2023+
2024+
list = list->Rest();
2025+
op2 = list->Current();
2026+
op2Reg = op2->gtRegNum;
2027+
genConsumeRegs(op2);
2028+
2029+
list = list->Rest();
2030+
GenTree* op3 = list->Current();
2031+
genConsumeRegs(op3);
2032+
2033+
list = list->Rest();
2034+
GenTree* op4 = nullptr;
2035+
GenTree* lastOp = nullptr;
2036+
GenTree* indexOp = nullptr;
2037+
2038+
regNumber op3Reg = REG_NA;
2039+
regNumber op4Reg = REG_NA;
2040+
regNumber addrBaseReg = REG_NA;
2041+
regNumber addrIndexReg = REG_NA;
2042+
regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT);
2043+
2044+
if (numArgs == 5)
2045+
{
2046+
assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
2047+
op4 = list->Current();
2048+
list = list->Rest();
2049+
lastOp = list->Current();
2050+
op3Reg = op3->gtRegNum;
2051+
op4Reg = op4->gtRegNum;
2052+
genConsumeRegs(op4);
2053+
addrBaseReg = op2Reg;
2054+
addrIndexReg = op3Reg;
2055+
indexOp = op3;
2056+
2057+
// copy op4Reg into the tmp mask register,
2058+
// the mask register will be cleared by gather instructions
2059+
emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);
2060+
2061+
if (targetReg != op1Reg)
2062+
{
2063+
// copy source vector to the target register for masking merge
2064+
emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
2065+
}
2066+
}
2067+
else
2068+
{
2069+
assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
2070+
addrBaseReg = op1Reg;
2071+
addrIndexReg = op2Reg;
2072+
indexOp = op2;
2073+
lastOp = op3;
2074+
2075+
// generate all-one mask vector
2076+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
2077+
}
2078+
2079+
bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);
2080+
2081+
// hwintrinsiclistxarch.h uses Dword index instructions by default
2082+
if (varTypeIsLong(node->gtIndexBaseType))
2083+
{
2084+
switch (ins)
2085+
{
2086+
case INS_vpgatherdd:
2087+
ins = INS_vpgatherqd;
2088+
if (isVector128GatherWithVector256Index)
2089+
{
2090+
// YMM index in address mode
2091+
attr = emitTypeSize(TYP_SIMD32);
2092+
}
2093+
break;
2094+
case INS_vpgatherdq:
2095+
ins = INS_vpgatherqq;
2096+
break;
2097+
case INS_vgatherdps:
2098+
ins = INS_vgatherqps;
2099+
if (isVector128GatherWithVector256Index)
2100+
{
2101+
// YMM index in address mode
2102+
attr = emitTypeSize(TYP_SIMD32);
2103+
}
2104+
break;
2105+
case INS_vgatherdpd:
2106+
ins = INS_vgatherqpd;
2107+
break;
2108+
default:
2109+
unreached();
2110+
}
2111+
}
2112+
2113+
assert(lastOp->IsCnsIntOrI());
2114+
ssize_t ival = lastOp->AsIntCon()->IconValue();
2115+
assert((ival >= 0) && (ival <= 255));
2116+
2117+
assert(targetReg != maskReg);
2118+
assert(targetReg != addrIndexReg);
2119+
assert(maskReg != addrIndexReg);
2120+
emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);
2121+
2122+
break;
2123+
}
2124+
20112125
case NI_AVX_GetLowerHalf:
20122126
{
20132127
assert(op2 == nullptr);

src/jit/hwintrinsiclistxarch.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,10 @@ HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32, "ConvertToVe
430430
HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt32, "ConvertToVector256UInt32", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbd, INS_invalid, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
431431
HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int64, "ConvertToVector256Int64", AVX2, -1, 32, 1, {INS_pmovsxbq, INS_invalid, INS_pmovsxwq, INS_invalid, INS_pmovsxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
432432
HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt64, "ConvertToVector256UInt64", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbq, INS_invalid, INS_pmovzxwq, INS_invalid, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
433+
HARDWARE_INTRINSIC(AVX2_GatherVector128, "GatherVector128", AVX2, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
434+
HARDWARE_INTRINSIC(AVX2_GatherVector256, "GatherVector256", AVX2, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
435+
HARDWARE_INTRINSIC(AVX2_GatherMaskVector128, "GatherMaskVector128", AVX2, -1, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
436+
HARDWARE_INTRINSIC(AVX2_GatherMaskVector256, "GatherMaskVector256", AVX2, -1, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
433437
HARDWARE_INTRINSIC(AVX2_HorizontalAdd, "HorizontalAdd", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_invalid, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
434438
HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate, "HorizontalAddSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
435439
HARDWARE_INTRINSIC(AVX2_HorizontalSubtract, "HorizontalSubtract", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)

0 commit comments

Comments
 (0)