Skip to content

Additional cleanup and simplification of hwintrinsic instruction sets for xarch #116406

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
598 changes: 172 additions & 426 deletions src/coreclr/inc/corinfoinstructionset.h

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@

#include <minipal/guid.h>

constexpr GUID JITEEVersionIdentifier = { /* f22d9c39-8d24-4e4d-86aa-7b883aecf97f */
0xf22d9c39,
0x8d24,
0x4e4d,
{0x86, 0xaa, 0x7b, 0x88, 0x3a, 0xec, 0xf9, 0x7f}
constexpr GUID JITEEVersionIdentifier = { /* 7a77e6d9-7280-439d-bb9d-9887b4516a86 */
0x7a77e6d9,
0x7280,
0x439d,
{0xbb, 0x9d, 0x98, 0x87, 0xb4, 0x51, 0x6a, 0x86}
};

#endif // JIT_EE_VERSIONING_GUID_H
22 changes: 11 additions & 11 deletions src/coreclr/jit/assertionprop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,17 +251,17 @@ bool IntegralRange::Contains(int64_t value) const
case NI_X86Base_CompareScalarUnorderedLessThan:
case NI_X86Base_CompareScalarUnorderedGreaterThanOrEqual:
case NI_X86Base_CompareScalarUnorderedGreaterThan:
case NI_SSE41_TestC:
case NI_SSE41_TestZ:
case NI_SSE41_TestNotZAndNotC:
case NI_SSE42_TestC:
case NI_SSE42_TestZ:
case NI_SSE42_TestNotZAndNotC:
case NI_AVX_TestC:
case NI_AVX_TestZ:
case NI_AVX_TestNotZAndNotC:
return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::One};

case NI_X86Base_Extract:
case NI_SSE41_Extract:
case NI_SSE41_X64_Extract:
case NI_SSE42_Extract:
case NI_SSE42_X64_Extract:
case NI_Vector128_ToScalar:
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
Expand All @@ -274,12 +274,12 @@ bool IntegralRange::Contains(int64_t value) const
}
break;

case NI_BMI1_TrailingZeroCount:
case NI_BMI1_X64_TrailingZeroCount:
case NI_LZCNT_LeadingZeroCount:
case NI_LZCNT_X64_LeadingZeroCount:
case NI_POPCNT_PopCount:
case NI_POPCNT_X64_PopCount:
case NI_AVX2_LeadingZeroCount:
case NI_AVX2_TrailingZeroCount:
case NI_AVX2_X64_LeadingZeroCount:
case NI_AVX2_X64_TrailingZeroCount:
case NI_SSE42_PopCount:
case NI_SSE42_X64_PopCount:
// Note: No advantage in using a precise range for IntegralRange.
// Example: IntCns = 42 gives [0..127] with a non-precise range, [42,42] with a precise range.
return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::ByteMax};
Expand Down
12 changes: 4 additions & 8 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ class CodeGen final : public CodeGenInterface
// Generates SSE2 code for the given tree as "Operand BitWiseOp BitMask"
void genSSE2BitwiseOp(GenTree* treeNode);

// Generates SSE41 code for the given tree as a round operation
void genSSE41RoundOp(GenTreeOp* treeNode);
// Generates SSE42 code for the given tree as a round operation
void genSSE42RoundOp(GenTreeOp* treeNode);

instruction simdAlignedMovIns()
{
Expand Down Expand Up @@ -937,14 +937,10 @@ class CodeGen final : public CodeGenInterface

void genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genSse42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genFmaIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions);
void genPermuteVar2x(GenTreeHWIntrinsic* node, insOpts instOptions);
void genLZCNTIntrinsic(GenTreeHWIntrinsic* node);
void genPOPCNTIntrinsic(GenTreeHWIntrinsic* node);
void genXCNTIntrinsic(GenTreeHWIntrinsic* node, instruction ins);
void genX86SerializeIntrinsic(GenTreeHWIntrinsic* node);

Expand Down
35 changes: 17 additions & 18 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1410,7 +1410,7 @@ void CodeGen::genSIMDSplitReturn(GenTree* src, const ReturnTypeDesc* retTypeDesc
inst_Mov(TYP_INT, reg0, opReg, /* canSkip */ false);

// reg1 = opRef[61:32]
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
inst_RV_TT_IV(INS_pextrd, EA_4BYTE, reg1, src, 1, INS_OPTS_NONE);
}
Expand Down Expand Up @@ -2427,7 +2427,7 @@ void CodeGen::genMultiRegStoreToSIMDLocal(GenTreeLclVar* lclNode)

inst_Mov(TYP_FLOAT, targetReg, reg0, /* canSkip */ false);
const emitAttr size = emitTypeSize(TYP_SIMD8);
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
GetEmitter()->emitIns_SIMD_R_R_R_I(INS_pinsrd, size, targetReg, targetReg, reg1, 1, INS_OPTS_NONE);
}
Expand Down Expand Up @@ -4855,7 +4855,7 @@ void CodeGen::genCodeForShift(GenTree* tree)
// Only the non-RMW case here.
assert(tree->OperIsShiftOrRotate());
assert(tree->GetRegNum() != REG_NA);
assert(tree->AsOp()->gtOp1->isUsedFromReg() || compiler->compIsaSupportedDebugOnly(InstructionSet_BMI2));
assert(tree->AsOp()->gtOp1->isUsedFromReg() || compiler->compIsaSupportedDebugOnly(InstructionSet_AVX2));

genConsumeOperands(tree->AsOp());

Expand Down Expand Up @@ -4902,7 +4902,7 @@ void CodeGen::genCodeForShift(GenTree* tree)
{
int shiftByValue = (int)shiftBy->AsIntConCommon()->IconValue();

if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) &&
if (tree->OperIsRotate() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2) &&
!tree->gtSetFlags())
{
// If we have a contained source operand, we must emit rorx.
Expand Down Expand Up @@ -4930,7 +4930,7 @@ void CodeGen::genCodeForShift(GenTree* tree)
return;
}
}
else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_BMI2) &&
else if (tree->OperIsShift() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2) &&
!tree->gtSetFlags())
{
// Emit shlx, sarx, shrx if BMI2 is available instead of mov+shl, mov+sar, mov+shr.
Expand Down Expand Up @@ -5758,8 +5758,8 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
}

case NI_X86Base_Extract:
case NI_SSE41_Extract:
case NI_SSE41_X64_Extract:
case NI_SSE42_Extract:
case NI_SSE42_X64_Extract:
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
case NI_AVX512_ExtractVector128:
Expand All @@ -5771,8 +5771,7 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)

if (intrinsicId == NI_X86Base_Extract)
{
// The encoding that supports containment is SSE4.1 only
ins = INS_pextrw_sse41;
ins = INS_pextrw_sse42;
}

// The hardware intrinsics take unsigned bytes between [0, 255].
Expand Down Expand Up @@ -7742,7 +7741,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
}

//-----------------------------------------------------------------------------------------
// genSSE41RoundOp - generate SSE41 code for the given tree as a round operation
// genSSE42RoundOp - generate SSE42 code for the given tree as a round operation
//
// Arguments:
// treeNode - tree node
Expand All @@ -7751,16 +7750,16 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
// None
//
// Assumptions:
// i) SSE4.1 is supported by the underlying hardware
// i) SSE4.2 is supported by the underlying hardware
// ii) treeNode oper is a GT_INTRINSIC
// iii) treeNode type is a floating point type
// iv) treeNode is not used from memory
// v) tree oper is NI_System_Math{F}_Round, _Ceiling, _Floor, or _Truncate
// vi) caller of this routine needs to call genProduceReg()
void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode)
void CodeGen::genSSE42RoundOp(GenTreeOp* treeNode)
{
// i) SSE4.1 is supported by the underlying hardware
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE41));
// i) SSE4.2 is supported by the underlying hardware
assert(compiler->compIsaSupportedDebugOnly(InstructionSet_SSE42));

// ii) treeNode oper is a GT_INTRINSIC
assert(treeNode->OperIs(GT_INTRINSIC));
Expand Down Expand Up @@ -7804,7 +7803,7 @@ void CodeGen::genSSE41RoundOp(GenTreeOp* treeNode)

default:
ins = INS_invalid;
assert(!"genSSE41RoundOp: unsupported intrinsic");
assert(!"genSSE42RoundOp: unsupported intrinsic");
unreached();
}

Expand Down Expand Up @@ -7834,7 +7833,7 @@ void CodeGen::genIntrinsic(GenTreeIntrinsic* treeNode)
case NI_System_Math_Floor:
case NI_System_Math_Truncate:
case NI_System_Math_Round:
genSSE41RoundOp(treeNode->AsOp());
genSSE42RoundOp(treeNode->AsOp());
break;

case NI_System_Math_Sqrt:
Expand Down Expand Up @@ -9576,7 +9575,7 @@ void CodeGen::genAmd64EmitterUnitTestsCCMP()
theEmitter->emitIns_R_R(INS_ccmpe, EA_1BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);

// Test all CC codes
for (uint32_t ins = INS_FIRST_CCMP_INSTRUCTION + 1; ins < INS_LAST_CCMP_INSTRUCTION; ins++)
for (uint32_t ins = FIRST_CCMP_INSTRUCTION; ins <= LAST_CCMP_INSTRUCTION; ins++)
{
theEmitter->emitIns_R_R((instruction)ins, EA_4BYTE, REG_RAX, REG_RCX, INS_OPTS_EVEX_dfv_cf);
}
Expand All @@ -9598,7 +9597,7 @@ void CodeGen::genAmd64EmitterUnitTestsCCMP()
theEmitter->emitIns_R_S(INS_ccmpe, EA_1BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);

// Test all CC codes
for (uint32_t ins = INS_FIRST_CCMP_INSTRUCTION + 1; ins < INS_LAST_CCMP_INSTRUCTION; ins++)
for (uint32_t ins = FIRST_CCMP_INSTRUCTION; ins <= LAST_CCMP_INSTRUCTION; ins++)
{
theEmitter->emitIns_R_S((instruction)ins, EA_4BYTE, REG_RAX, 0, 0, INS_OPTS_EVEX_dfv_cf);
}
Expand Down
26 changes: 7 additions & 19 deletions src/coreclr/jit/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6042,11 +6042,7 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr,

if (JitConfig.EnableSSE42() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_SSE3);
instructionSetFlags.AddInstructionSet(InstructionSet_SSSE3);
instructionSetFlags.AddInstructionSet(InstructionSet_SSE41);
instructionSetFlags.AddInstructionSet(InstructionSet_SSE42);
instructionSetFlags.AddInstructionSet(InstructionSet_POPCNT);
}

if (JitConfig.EnableAVX() != 0)
Expand All @@ -6057,11 +6053,6 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr,
if (JitConfig.EnableAVX2() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_AVX2);
instructionSetFlags.AddInstructionSet(InstructionSet_BMI1);
instructionSetFlags.AddInstructionSet(InstructionSet_BMI2);
instructionSetFlags.AddInstructionSet(InstructionSet_FMA);
instructionSetFlags.AddInstructionSet(InstructionSet_LZCNT);
instructionSetFlags.AddInstructionSet(InstructionSet_MOVBE);
}

if (JitConfig.EnableAVX512() != 0)
Expand All @@ -6071,7 +6062,7 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr,

if (JitConfig.EnableAVX512v2() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_AVX512VBMI);
instructionSetFlags.AddInstructionSet(InstructionSet_AVX512v2);
}

if (JitConfig.EnableAVX512v3() != 0)
Expand All @@ -6097,7 +6088,12 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr,
if (JitConfig.EnableAES() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_AES);
instructionSetFlags.AddInstructionSet(InstructionSet_PCLMULQDQ);

if (JitConfig.EnableVAES() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_AES_V256);
instructionSetFlags.AddInstructionSet(InstructionSet_AES_V512);
}
}

if (JitConfig.EnableAVX512VP2INTERSECT() != 0)
Expand Down Expand Up @@ -6127,14 +6123,6 @@ int Compiler::compCompile(CORINFO_MODULE_HANDLE classPtr,
instructionSetFlags.AddInstructionSet(InstructionSet_SHA);
}

if (JitConfig.EnableVAES() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_AES_V256);
instructionSetFlags.AddInstructionSet(InstructionSet_AES_V512);
instructionSetFlags.AddInstructionSet(InstructionSet_PCLMULQDQ_V256);
instructionSetFlags.AddInstructionSet(InstructionSet_PCLMULQDQ_V512);
}

if (JitConfig.EnableWAITPKG() != 0)
{
instructionSetFlags.AddInstructionSet(InstructionSet_WAITPKG);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/decomposelongs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1961,7 +1961,7 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicToScalar(LIR::Use& use, GenTreeHWIn
Range().InsertAfter(loResult, simdTmpVar);

GenTree* hiResult;
if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE41))
if (m_compiler->compOpportunisticallyDependsOn(InstructionSet_SSE42))
{
GenTree* one = m_compiler->gtNewIconNode(1);
hiResult = m_compiler->gtNewSimdGetElementNode(TYP_INT, simdTmpVar, one, CORINFO_TYPE_INT, simdSize);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8265,7 +8265,7 @@ void emitter::emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, reg

if ((dataSize == 16) && (constValue->u64[1] == constValue->u64[0]))
{
if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) ||
if (((cnsSize == 16) && emitComp->compOpportunisticallyDependsOn(InstructionSet_SSE42)) ||
emitComp->compOpportunisticallyDependsOn(InstructionSet_AVX))
{
dataSize = 8;
Expand Down
Loading
Loading