Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Implement AVX2 Gather intrinsic #19392

Merged
merged 3 commits into from
Sep 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

882 changes: 825 additions & 57 deletions src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/X86/Avx2.cs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/jit/emitfmtsxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ IF_DEF(RRW_ARD_CNS, IS_AM_RD|IS_R1_RW, AMD_CNS) // r/w reg , read [

IF_DEF(RWR_RRD_ARD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD ) // write reg , read reg2, read [adr]
IF_DEF(RWR_ARD_CNS, IS_AM_RD|IS_R1_WR, AMD_CNS) // write reg , read [adr], const
IF_DEF(RWR_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD) // write reg , read [adr], read reg2
IF_DEF(RWR_RRD_ARD_CNS, IS_AM_RD|IS_R1_WR|IS_R2_RD, AMD_CNS) // write reg , read reg2, read [adr], const
IF_DEF(RWR_RRD_ARD_RRD, IS_AM_RD|IS_R1_WR|IS_R2_RD|IS_R3_RD, AMD_CNS) // write reg , read reg2, read [adr], read reg3

Expand Down
97 changes: 95 additions & 2 deletions src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,10 @@ bool TakesRexWPrefix(instruction ins, emitAttr attr)
case INS_vfnmsub213sd:
case INS_vfnmsub231sd:
case INS_vpmaskmovq:
case INS_vpgatherdq:
case INS_vpgatherqq:
case INS_vgatherdpd:
case INS_vgatherqpd:
return true;
default:
break;
Expand Down Expand Up @@ -2900,8 +2904,8 @@ regNumber emitter::emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, G
if (dst->isContained() || (dst->isLclField() && (dst->gtRegNum == REG_NA)) || dst->isUsedFromSpillTemp())
{
// dst can only be a modrm
assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) ||
instrIs3opImul(ins)); // dst on 3opImul isn't really the dst
// dst on 3opImul isn't really the dst
assert(dst->isUsedFromMemory() || (dst->gtRegNum == REG_NA) || instrIs3opImul(ins));
assert(!src->isUsedFromMemory());

memOp = dst;
Expand Down Expand Up @@ -4122,6 +4126,74 @@ void emitter::emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, reg
emitCurIGsize += sz;
}

//------------------------------------------------------------------------
// IsAVX2GatherInstruction: Checks whether an instruction is one of the
//                          AVX2 gather instructions.
//
// Arguments:
//    ins - the instruction to classify
//
// Return Value:
//    true when ins is one of vpgatherdd, vpgatherdq, vpgatherqd, vpgatherqq,
//    vgatherdps, vgatherdpd, vgatherqps or vgatherqpd; false otherwise.
//
bool IsAVX2GatherInstruction(instruction ins)
{
    // The vpgather* group.
    const bool isVpGather = (ins == INS_vpgatherdd) || (ins == INS_vpgatherdq) || (ins == INS_vpgatherqd) ||
                            (ins == INS_vpgatherqq);

    // The vgather*ps / vgather*pd group.
    const bool isVGather = (ins == INS_vgatherdps) || (ins == INS_vgatherdpd) || (ins == INS_vgatherqps) ||
                           (ins == INS_vgatherqpd);

    return isVpGather || isVGather;
}

//------------------------------------------------------------------------
// emitIns_R_AR_R: Emits an AVX2 gather instruction using the
//                 IF_RWR_ARD_RRD instruction format.
//
// Arguments:
//    ins   - the instruction to emit; asserted to satisfy IsAVX2GatherInstruction
//    attr  - the instruction operand size
//    reg1  - the destination and first source operand
//    reg2  - the mask operand (encoded in VEX.vvvv)
//    base  - the base register of the address to load from
//    index - the index register of VSIB
//    scale - the scale number of VSIB
//    offs  - the offset added to the memory address from base
//
void emitter::emitIns_R_AR_R(instruction ins,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we shouldn't be encoding the scale/offs in an Indir node and letting emitHandleMemOp handle it

@CarolEidt ?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The scale/offs would be on a GenTreeAddrMode - presumably a GT_LEA. It would make sense for that to be a child of the gather node, which would not only consolidate the scale/offs handling, but also would reduce the operand count of the gather.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@CarolEidt Thanks for the comments. Do you suggest to make the change in this PR? Or I can improve it in a new PR?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be fine (and probably a bit cleaner) to do it as a separate PR.

emitAttr attr,
regNumber reg1,
regNumber reg2,
regNumber base,
regNumber index,
int scale,
int offs)
{

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coding convention: method needs header

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

// This emitter entry point is only meant for the gather family.
assert(IsAVX2GatherInstruction(ins));

// The descriptor is created from attr and offs.
instrDesc* id = emitNewInstrAmd(attr, offs);

// reg1 is the destination/first source, reg2 is the mask operand.
id->idIns(ins);
id->idReg1(reg1);
id->idReg2(reg2);

// Record the format and the base/index/scale parts of the address mode.
// NOTE(review): scale is cast to emitAttr and passed through emitEncodeSize
// before being stored; this presumably requires scale to be a value that
// emitEncodeSize accepts - confirm against its definition.
id->idInsFmt(IF_RWR_ARD_RRD);
id->idAddr()->iiaAddrMode.amBaseReg = base;
id->idAddr()->iiaAddrMode.amIndxReg = index;
id->idAddr()->iiaAddrMode.amScale = emitEncodeSize((emitAttr)scale);

// The size is computed from insCodeRM(ins) and stored on the descriptor.
UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
id->idCodeSize(sz);

// Display the instruction and add its size to emitCurIGsize.
dispIns(id);
emitCurIGsize += sz;
}

void emitter::emitIns_R_R_C(
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs)
{
Expand Down Expand Up @@ -8340,6 +8412,17 @@ void emitter::emitDispIns(
emitDispAddrMode(id);
break;

case IF_RWR_ARD_RRD:
if (ins == INS_vpgatherqd || ins == INS_vgatherqps)
{
attr = EA_16BYTE;
}
sstr = codeGen->genSizeStr(EA_ATTR(4));
printf("%s, %s", emitRegName(id->idReg1(), attr), sstr);
emitDispAddrMode(id);
printf(", %s", emitRegName(id->idReg2(), attr));
break;

case IF_RWR_RRD_ARD_CNS:
{
printf("%s, %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), attr), sstr);
Expand Down Expand Up @@ -9222,6 +9305,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
switch (id->idInsFmt())
{
case IF_RWR_RRD_ARD:
case IF_RWR_ARD_RRD:
case IF_RWR_RRD_ARD_CNS:
case IF_RWR_RRD_ARD_RRD:
{
Expand Down Expand Up @@ -12883,6 +12967,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
break;
}

case IF_RWR_ARD_RRD:
{
assert(IsAVX2GatherInstruction(ins));
code = insCodeRM(ins);
dst = emitOutputAM(dst, id, code);
sz = emitSizeOfInsDsc(id);
break;
}

case IF_RWR_RRD_ARD_CNS:
case IF_RWR_RRD_ARD_RRD:
{
Expand Down
9 changes: 9 additions & 0 deletions src/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,15 @@ void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg

void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs);

void emitIns_R_AR_R(instruction ins,
emitAttr attr,
regNumber reg1,
regNumber reg2,
regNumber base,
regNumber index,
int scale,
int offs);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if this was formatted this way by jit-format, but I'd prefer to see it declared in a format more similar to those around it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this was given by jit-format, let me try to unify the format.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This format is produced by clang-format and cannot be changed manually...


void emitIns_R_R_C(
instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs);

Expand Down
44 changes: 9 additions & 35 deletions src/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17501,13 +17501,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad()
{
// Some AVX instructions here also have MemoryLoad semantics

// Do we have 3 operands?
if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
// Do we have less than 3 operands?
if (HWIntrinsicInfo::lookupNumArgs(this) < 3)
{
return false;
}
else // We have 3 operands/args
else // We have 3 or more operands/args
{
if (HWIntrinsicInfo::isAVX2GatherIntrinsic(gtHWIntrinsicId))
{
return true;
}

GenTreeArgList* argList = gtOp.gtOp1->AsArgList();

if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
Expand Down Expand Up @@ -17558,38 +17563,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStore()
bool GenTreeHWIntrinsic::OperIsMemoryLoadOrStore()
{
#ifdef _TARGET_XARCH_
// Some xarch instructions have MemoryLoad sematics
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(gtHWIntrinsicId);
if ((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore))
{
return true;
}
else if (category == HW_Category_IMM)
{
// Some AVX instructions here also have MemoryLoad or MemoryStore sematics

// Do we have 3 operands?
if (HWIntrinsicInfo::lookupNumArgs(this) != 3)
{
return false;
}
else // We have 3 operands/args
{
GenTreeArgList* argList = gtOp.gtOp1->AsArgList();

if ((gtHWIntrinsicId == NI_AVX_InsertVector128 || gtHWIntrinsicId == NI_AVX2_InsertVector128) &&
(argList->Rest()->Current()->TypeGet() == TYP_I_IMPL)) // Is the type of the second arg TYP_I_IMPL?
{
// This is Avx/Avx2.InsertVector128
return true;
}
else if ((gtHWIntrinsicId == NI_AVX_ExtractVector128 || gtHWIntrinsicId == NI_AVX2_ExtractVector128))
{
// This is Avx/Avx2.ExtractVector128
return true;
}
}
}
return OperIsMemoryLoad() || OperIsMemoryStore();
#endif // _TARGET_XARCH_
return false;
}
Expand Down
5 changes: 3 additions & 2 deletions src/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -479,8 +479,8 @@ struct GenTree
// happening.
void CopyCosts(const GenTree* const tree)
{
INDEBUG(gtCostsInitialized =
tree->gtCostsInitialized;) // If the 'tree' costs aren't initialized, we'll hit an assert below.
// If the 'tree' costs aren't initialized, we'll hit an assert below.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are all these random comment re-alignments popping up? They seem unrelated to the PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clang-format is confused by some comments, so I fixed here to avoid more unrelated changes from clang-format.

INDEBUG(gtCostsInitialized = tree->gtCostsInitialized;)
_gtCostEx = tree->gtCostEx;
_gtCostSz = tree->gtCostSz;
}
Expand Down Expand Up @@ -4115,6 +4115,7 @@ struct GenTreeSIMD : public GenTreeJitIntrinsic
struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic
{
NamedIntrinsic gtHWIntrinsicId;
var_types gtIndexBaseType; // for AVX2 Gather* intrinsics
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gather intrinsics have complex overloads that need additional information (the base type of the index vector) for codegen, so I am adding a field in the IR. But that makes GenTreeHWIntrinsic a large node. @CarolEidt do you think that is OK?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is probably not a big issue for intrinsics, though for methods with heavy intrinsic usage it could be an impact. Did you consider deriving from GenTreeHWIntrinsic to make a specialized node?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might also be worth considering whether to define something like GenTreeHWIntrinsicBig, to isolate both the additional fields as well as the extra operands.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps make NamedIntrinsic unsigned short? 64k intrinsics ought to be enough for anybody.

There are also spare bytes in the GenTree class but I don't know an elegant way to use those in derived classes.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps make NamedIntrinsic unsigned short? 64k intrinsics ought to be enough for anybody.

@mikedn Good point! Yes, 64k is definitely enough for the foreseeable future (AVX-512 based ISAs).

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make NamedIntrinsic unsigned short

Made this change, now GenTreeHWIntrinsic is still a small node.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the gtIndexBaseType actually needed for? Can it not be inferred from the rest of the signature?


GenTreeHWIntrinsic(var_types type, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size)
: GenTreeJitIntrinsic(GT_HWIntrinsic, type, nullptr, nullptr, baseType, size), gtHWIntrinsicId(hwIntrinsicID)
Expand Down
114 changes: 114 additions & 0 deletions src/jit/hwintrinsiccodegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,9 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic intrinsi
HWIntrinsicSwitchCaseBody emitSwCase)
{
assert(nonConstImmReg != REG_NA);
// AVX2 Gather intrinsics use the managed non-const fallback since they have a discrete imm8 value range
// that does not work with the current compiler-generated jump-table fallback
assert(!HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsic));
emitter* emit = getEmitter();

const unsigned maxByte = (unsigned)HWIntrinsicInfo::lookupImmUpperBound(intrinsic) + 1;
Expand Down Expand Up @@ -2008,6 +2011,117 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_AVX2_GatherVector128:
case NI_AVX2_GatherVector256:
case NI_AVX2_GatherMaskVector128:
case NI_AVX2_GatherMaskVector256:
{
GenTreeArgList* list = op1->AsArgList();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It is helpful to break up the argList blocks, for readability: Ex:

GenTreeArgList* argList = op1->AsArgList();

op1 = list->Current();
op1Reg = op1->gtRegNum;
genConsumeRegs(op1);

list = list->Rest();
op2 = list->Current();
op2Reg = op2->gtRegNum;
genConsumeRegs(op2);

list = list->Rest();
GenTree* op3 = list->Current();
genConsumeRegs(op3);

list = list->Rest();
GenTree* op4 = nullptr;
GenTree* lastOp = nullptr;
GenTree* indexOp = nullptr;

regNumber op3Reg = REG_NA;
regNumber op4Reg = REG_NA;
regNumber addrBaseReg = REG_NA;
regNumber addrIndexReg = REG_NA;
regNumber maskReg = node->ExtractTempReg(RBM_ALLFLOAT);

if (numArgs == 5)
{
assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
op4 = list->Current();
list = list->Rest();
lastOp = list->Current();
op3Reg = op3->gtRegNum;
op4Reg = op4->gtRegNum;
genConsumeRegs(op4);
addrBaseReg = op2Reg;
addrIndexReg = op3Reg;
indexOp = op3;

// copy op4Reg into the tmp mask register,
// the mask register will be cleared by gather instructions
emit->emitIns_R_R(INS_movaps, attr, maskReg, op4Reg);

if (targetReg != op1Reg)
{
// copy source vector to the target register for masking merge
emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if op2Reg == targetReg? I don't think it's been set as delayFree.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Op2Reg is a GPR (base register of the address) and cannot be same as the vector target register.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right - thanks for clarifying.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment clarifying this would be nice

}
}
else
{
assert(intrinsicId == NI_AVX2_GatherVector128 || intrinsicId == NI_AVX2_GatherVector256);
addrBaseReg = op1Reg;
addrIndexReg = op2Reg;
indexOp = op2;
lastOp = op3;

// generate all-one mask vector
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, maskReg, maskReg, maskReg);
}

bool isVector128GatherWithVector256Index = (targetType == TYP_SIMD16) && (indexOp->TypeGet() == TYP_SIMD32);

// hwintrinsiclistxarch.h uses Dword index instructions by default
if (varTypeIsLong(node->gtIndexBaseType))
{
switch (ins)
{
case INS_vpgatherdd:
ins = INS_vpgatherqd;
if (isVector128GatherWithVector256Index)
{
// YMM index in address mode
attr = emitTypeSize(TYP_SIMD32);
}
break;
case INS_vpgatherdq:
ins = INS_vpgatherqq;
break;
case INS_vgatherdps:
ins = INS_vgatherqps;
if (isVector128GatherWithVector256Index)
{
// YMM index in address mode
attr = emitTypeSize(TYP_SIMD32);
}
break;
case INS_vgatherdpd:
ins = INS_vgatherqpd;
break;
default:
unreached();
}
}

assert(lastOp->IsCnsIntOrI());
ssize_t ival = lastOp->AsIntCon()->IconValue();
assert((ival >= 0) && (ival <= 255));

assert(targetReg != maskReg);
assert(targetReg != addrIndexReg);
assert(maskReg != addrIndexReg);
emit->emitIns_R_AR_R(ins, attr, targetReg, maskReg, addrBaseReg, addrIndexReg, (int8_t)ival, 0);

break;
}

case NI_AVX_GetLowerHalf:
{
assert(op2 == nullptr);
Expand Down
4 changes: 4 additions & 0 deletions src/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,10 @@ HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int32, "ConvertToVe
HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt32, "ConvertToVector256UInt32", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbd, INS_invalid, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX2_ConvertToVector256Int64, "ConvertToVector256Int64", AVX2, -1, 32, 1, {INS_pmovsxbq, INS_invalid, INS_pmovsxwq, INS_invalid, INS_pmovsxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX2_ConvertToVector256UInt64, "ConvertToVector256UInt64", AVX2, -1, 32, 1, {INS_invalid, INS_pmovzxbq, INS_invalid, INS_pmovzxwq, INS_invalid, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AVX2_GatherVector128, "GatherVector128", AVX2, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(AVX2_GatherVector256, "GatherVector256", AVX2, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(AVX2_GatherMaskVector128, "GatherMaskVector128", AVX2, -1, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(AVX2_GatherMaskVector256, "GatherMaskVector256", AVX2, -1, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
HARDWARE_INTRINSIC(AVX2_HorizontalAdd, "HorizontalAdd", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_invalid, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate, "HorizontalAddSaturate", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AVX2_HorizontalSubtract, "HorizontalSubtract", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
Expand Down
Loading