-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Implement AVX2 Gather intrinsic #19392
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -333,6 +333,10 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr) | |
case INS_vfnmsub213sd: | ||
case INS_vfnmsub231sd: | ||
case INS_vpmaskmovq: | ||
case INS_vpgatherdq: | ||
case INS_vpgatherqq: | ||
case INS_vgatherdpd: | ||
case INS_vgatherqpd: | ||
return true; | ||
default: | ||
break; | ||
|
@@ -2900,8 +2904,8 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G | |
if (dst->isContained() || (dst->isLclField() && (dst->gtRegNum == REG_NA)) || dst->isUsedFromSpillTemp()) | ||
{ | ||
// dst can only be a modrm | ||
assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || | ||
instrIs3opImul(ins)); // dst on 3opImul isn't really the dst | ||
// dst on 3opImul isn't really the dst | ||
assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || instrIs3opImul(ins)); | ||
assert(!src->isUsedFromMemory()); | ||
|
||
memOp = dst; | ||
|
@@ -4122,6 +4126,74 @@ void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, reg | |
emitCurIGsize += sz; | ||
} | ||
|
||
//------------------------------------------------------------------------ | ||
// IsAVX2GatherInstruction: return true if the instruction is AVX2 Gather | ||
// | ||
// Arguments: | ||
// ins - the instruction to check | ||
// Return Value: | ||
// true if the instruction is AVX2 Gather | ||
// | ||
bool IsAVX2GatherInstruction(instruction ins) | ||
{ | ||
switch (ins) | ||
{ | ||
case INS_vpgatherdd: | ||
case INS_vpgatherdq: | ||
case INS_vpgatherqd: | ||
case INS_vpgatherqq: | ||
case INS_vgatherdps: | ||
case INS_vgatherdpd: | ||
case INS_vgatherqps: | ||
case INS_vgatherqpd: | ||
return true; | ||
default: | ||
return false; | ||
} | ||
} | ||
|
||
//------------------------------------------------------------------------ | ||
// emitIns_R_AR_R: Emits an AVX2 Gather instructions | ||
// | ||
// Arguments: | ||
// ins - the instruction to emit | ||
// attr - the instruction operand size | ||
// reg1 - the destination and first source operand | ||
// reg2 - the mask operand (encoded in VEX.vvvv) | ||
// base - the base register of address to load | ||
// index - the index register of VSIB | ||
// scale - the scale number of VSIB | ||
// offs - the offset added to the memory address from base | ||
// | ||
void emitter::emitIns_R_AR_R(instruction ins, | ||
emitAttr attr, | ||
regNumber reg1, | ||
regNumber reg2, | ||
regNumber base, | ||
regNumber index, | ||
int scale, | ||
int offs) | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Coding convention: method needs header There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
assert(IsAVX2GatherInstruction(ins)); | ||
|
||
instrDesc* id = emitNewInstrAmd(attr, offs); | ||
|
||
id->idIns(ins); | ||
id->idReg1(reg1); | ||
id->idReg2(reg2); | ||
|
||
id->idInsFmt(IF_RWR_ARD_RRD); | ||
id->idAddr()->iiaAddrMode.amBaseReg = base; | ||
id->idAddr()->iiaAddrMode.amIndxReg = index; | ||
id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale); | ||
|
||
UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins)); | ||
id->idCodeSize(sz); | ||
|
||
dispIns(id); | ||
emitCurIGsize += sz; | ||
} | ||
|
||
void emitter::emitIns_R_R_C( | ||
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs) | ||
{ | ||
|
@@ -8340,6 +8412,17 @@ void emitter::emitDispIns( | |
emitDispAddrMode(id); | ||
break; | ||
|
||
case IF_RWR_ARD_RRD: | ||
if (ins == INS_vpgatherqd || ins == INS_vgatherqps) | ||
{ | ||
attr = EA_16BYTE; | ||
} | ||
sstr = codeGen->genSizeStr(EA_ATTR(4)); | ||
printf("%s, %s", emitRegName(id->idReg1(), attr), sstr); | ||
emitDispAddrMode(id); | ||
printf(", %s", emitRegName(id->idReg2(), attr)); | ||
break; | ||
|
||
case IF_RWR_RRD_ARD_CNS: | ||
{ | ||
printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr); | ||
|
@@ -9222,6 +9305,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) | |
switch (id->idInsFmt()) | ||
{ | ||
case IF_RWR_RRD_ARD: | ||
case IF_RWR_ARD_RRD: | ||
case IF_RWR_RRD_ARD_CNS: | ||
case IF_RWR_RRD_ARD_RRD: | ||
{ | ||
|
@@ -12883,6 +12967,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) | |
break; | ||
} | ||
|
||
case IF_RWR_ARD_RRD: | ||
{ | ||
assert(IsAVX2GatherInstruction(ins)); | ||
code = insCodeRM(ins); | ||
dst = emitOutputAM(dst, id, code); | ||
sz = emitSizeOfInsDsc(id); | ||
break; | ||
} | ||
|
||
case IF_RWR_RRD_ARD_CNS: | ||
case IF_RWR_RRD_ARD_RRD: | ||
{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -335,6 +335,15 @@ void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg | |
|
||
void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); | ||
|
||
void emitIns_R_AR_R(instruction ins, | ||
emitAttr attr, | ||
regNumber reg1, | ||
regNumber reg2, | ||
regNumber base, | ||
regNumber index, | ||
int scale, | ||
int offs); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know if this was formatted this way by jit-format, but I'd prefer to see it declared in a format more similar to those around it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this was given by jit-format, let me try to unify the format. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This format is given by clang-format and cannot manually change... |
||
|
||
void emitIns_R_R_C( | ||
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -479,8 +479,8 @@ struct GenTree | |
// happening. | ||
void CopyCosts(const GenTree* const tree) | ||
{ | ||
INDEBUG(gtCostsInitialized = | ||
tree->gtCostsInitialized;) // If the 'tree' costs aren't initialized, we'll hit an assert below. | ||
// If the 'tree' costs aren't initialized, we'll hit an assert below. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are all these random comment re-alignments popping up? They seem unrelated to the PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Clang-format is confused by some comments, so I fixed here to avoid more unrelated changes from clang-format. |
||
INDEBUG(gtCostsInitialized = tree->gtCostsInitialized;) | ||
_gtCostEx = tree->gtCostEx; | ||
_gtCostSz = tree->gtCostSz; | ||
} | ||
|
@@ -4115,6 +4115,7 @@ struct GenTreeSIMD : public GenTreeJitIntrinsic | |
struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic | ||
{ | ||
NamedIntrinsic gtHWIntrinsicId; | ||
var_types gtIndexBaseType; // for AVX2 Gather* intrinsics | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gather intrinsics have complex overloads that need additional information (the base-type of index vector) for codegen, so adding a field in IR. But that let GenTreeHWIntrinsic become a large node. @CarolEidt do you think it ok? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is probably not a big issue for intrinsics, though for methods with heavy intrinsic usage it could be an impact. Did you consider deriving from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might also be worth considering whether to define something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps make There are also spare bytes in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@mikedn Good point! Yes, 64k is definitely enough for the foreseeable future (AVX-512 based ISAs). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Made this change, now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the |
||
|
||
GenTreeHWIntrinsic(var_types type, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size) | ||
: GenTreeJitIntrinsic(GT_HWIntrinsic, type, nullptr, nullptr, baseType, size), gtHWIntrinsicId(hwIntrinsicID) | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -1184,6 +1184,9 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi | |||
HWIntrinsicSwitchCaseBody emitSwCase) | ||||
{ | ||||
assert(nonConstImmReg != REG_NA); | ||||
// AVX2 Gather intrinsics use managed non-const fallback since they have discrete imm8 value range | ||||
// that does work with the current compiler generated jump-table fallback | ||||
assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic)); | ||||
emitter* emit = getEmitter(); | ||||
|
||||
const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1; | ||||
|
@@ -2008,6 +2011,117 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) | |||
break; | ||||
} | ||||
|
||||
case NI_AVX2_GatherVector128: | ||||
case NI_AVX2_GatherVector256: | ||||
case NI_AVX2_GatherMaskVector128: | ||||
case NI_AVX2_GatherMaskVector256: | ||||
{ | ||||
GenTreeArgList* list = op1->AsArgList(); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: It is helpful to break up the argList blocks, for readability: Ex: coreclr/src/jit/hwintrinsiccodegenxarch.cpp Line 217 in 7f36c93
|
||||
op1 = list->Current(); | ||||
op1Reg = op1->gtRegNum; | ||||
genConsumeRegs(op1); | ||||
|
||||
list = list->Rest(); | ||||
op2 = list->Current(); | ||||
op2Reg = op2->gtRegNum; | ||||
genConsumeRegs(op2); | ||||
|
||||
list = list->Rest(); | ||||
GenTree* op3 = list->Current(); | ||||
genConsumeRegs(op3); | ||||
|
||||
list = list->Rest(); | ||||
GenTree* op4 = nullptr; | ||||
GenTree* lastOp = nullptr; | ||||
GenTree* indexOp = nullptr; | ||||
|
||||
regNumber op3Reg = REG_NA; | ||||
regNumber op4Reg = REG_NA; | ||||
regNumber addrBaseReg = REG_NA; | ||||
regNumber addrIndexReg = REG_NA; | ||||
regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT); | ||||
|
||||
if (numArgs == 5) | ||||
{ | ||||
assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256); | ||||
op4 = list->Current(); | ||||
list = list->Rest(); | ||||
lastOp = list->Current(); | ||||
op3Reg = op3->gtRegNum; | ||||
op4Reg = op4->gtRegNum; | ||||
genConsumeRegs(op4); | ||||
addrBaseReg = op2Reg; | ||||
addrIndexReg = op3Reg; | ||||
indexOp = op3; | ||||
|
||||
// copy op4Reg into the tmp mask register, | ||||
// the mask register will be cleared by gather instructions | ||||
emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg); | ||||
|
||||
if (targetReg != op1Reg) | ||||
{ | ||||
// copy source vector to the target register for masking merge | ||||
emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Op2Reg is a GPR (base register of the address) and cannot be same as the vector target register. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right - thanks for clarifying. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A comment clarifying this would be nice |
||||
} | ||||
} | ||||
else | ||||
{ | ||||
assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256); | ||||
addrBaseReg = op1Reg; | ||||
addrIndexReg = op2Reg; | ||||
indexOp = op2; | ||||
lastOp = op3; | ||||
|
||||
// generate all-one mask vector | ||||
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg); | ||||
} | ||||
|
||||
bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32); | ||||
|
||||
// hwintrinsiclistxarch.h uses Dword index instructions in default | ||||
if (varTypeIsLong(node->gtIndexBaseType)) | ||||
{ | ||||
switch (ins) | ||||
{ | ||||
case INS_vpgatherdd: | ||||
ins = INS_vpgatherqd; | ||||
if (isVector128GatherWithVector256Index) | ||||
{ | ||||
// YMM index in address mode | ||||
attr = emitTypeSize(TYP_SIMD32); | ||||
} | ||||
break; | ||||
case INS_vpgatherdq: | ||||
ins = INS_vpgatherqq; | ||||
break; | ||||
case INS_vgatherdps: | ||||
ins = INS_vgatherqps; | ||||
if (isVector128GatherWithVector256Index) | ||||
{ | ||||
// YMM index in address mode | ||||
attr = emitTypeSize(TYP_SIMD32); | ||||
} | ||||
break; | ||||
case INS_vgatherdpd: | ||||
ins = INS_vgatherqpd; | ||||
break; | ||||
default: | ||||
unreached(); | ||||
} | ||||
} | ||||
|
||||
assert(lastOp->IsCnsIntOrI()); | ||||
ssize_t ival = lastOp->AsIntCon()->IconValue(); | ||||
assert((ival >= 0) && (ival <= 255)); | ||||
|
||||
assert(targetReg != maskReg); | ||||
assert(targetReg != addrIndexReg); | ||||
assert(maskReg != addrIndexReg); | ||||
emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0); | ||||
|
||||
break; | ||||
} | ||||
|
||||
case NI_AVX_GetLowerHalf: | ||||
{ | ||||
assert(op2 == nullptr); | ||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we shouldn't be encoding the scale/offs in an
Indir
node and lettingemitHandleMemOp
handle it@CarolEidt ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The scale/offs would be on a
GenTreeAddrMode
- presumably aGT_LEA
. It would make sense for that to be a child of the gather node, which would not only consolidate the scale/offs handling, but also would reduce the operand count of the gather.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@CarolEidt Thanks for the comments. Do you suggest to make the change in this PR? Or I can improve it in a new PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be fine (and probably a bit cleaner) to do it as a separate PR.