Skip to content

Commit be7db81

Browse files
alexcovingtonAlex Covington (Advanced Micro Devices Inc)EgorBo
authored
Vector int divide short byte (#117996)
* Extend vector divide to allow for short * Support byte/sbyte * Move logic to gentree * Fix issue with unsigned types * Jit formatting * Allow for vectorization with AVX * Fix issue with narrowing, jit formatting * SSE 4.2 support * Add shuffle for SSE4.2 path * Support uint for SSE4.2 * Fix bug with register allocation, make sure to allocate XMM0 when AVX is not supported * Cleanup, jit formatting * Feedback, simplify some checks, remove redundant checks * Remove assert that is no longer needed --------- Co-authored-by: Alex Covington (Advanced Micro Devices Inc) <b-alexco@microsoft.com> Co-authored-by: Egor Bogatov <egorbo@gmail.com>
1 parent c1b153a commit be7db81

File tree

4 files changed

+222
-36
lines changed

4 files changed

+222
-36
lines changed

src/coreclr/jit/gentree.cpp

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21254,17 +21254,103 @@ GenTree* Compiler::gtNewSimdBinOpNode(
2125421254
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
2125521255
case GT_DIV:
2125621256
{
21257-
if (simdBaseType == TYP_INT)
21257+
if (varTypeIsIntegral(simdBaseType))
2125821258
{
21259-
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
21260-
assert(simdSize == 16 || simdSize == 32);
21259+
assert(!varTypeIsLong(simdBaseType));
21260+
if ((varTypeIsSmall(simdBaseType) && simdSize > 16) ||
21261+
(varTypeIsInt(simdBaseType) && simdSize == 32 &&
21262+
!compOpportunisticallyDependsOn(InstructionSet_AVX512)) ||
21263+
simdSize == 64)
21264+
{
21265+
var_types divType = simdSize == 64 ? TYP_SIMD32 : TYP_SIMD16;
21266+
GenTree* op1Dup = fgMakeMultiUse(&op1);
21267+
GenTree* op2Dup = fgMakeMultiUse(&op2);
21268+
GenTree* op1Lower = gtNewSimdGetLowerNode(divType, op1, simdBaseJitType, simdSize);
21269+
GenTree* op2Lower = gtNewSimdGetLowerNode(divType, op2, simdBaseJitType, simdSize);
21270+
GenTree* divLower =
21271+
gtNewSimdBinOpNode(GT_DIV, divType, op1Lower, op2Lower, simdBaseJitType, simdSize / 2);
21272+
GenTree* op1Upper = gtNewSimdGetUpperNode(divType, op1Dup, simdBaseJitType, simdSize);
21273+
GenTree* op2Upper = gtNewSimdGetUpperNode(divType, op2Dup, simdBaseJitType, simdSize);
21274+
GenTree* divUpper =
21275+
gtNewSimdBinOpNode(GT_DIV, divType, op1Upper, op2Upper, simdBaseJitType, simdSize / 2);
21276+
GenTree* divResult = gtNewSimdWithUpperNode(type, divLower, divUpper, simdBaseJitType, simdSize);
21277+
return divResult;
21278+
}
21279+
21280+
if (varTypeIsSmall(simdBaseType))
21281+
{
21282+
assert(simdSize == 16);
21283+
if (compOpportunisticallyDependsOn(InstructionSet_AVX512))
21284+
{
21285+
CorInfoType cvtBaseType =
21286+
varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_INT;
21287+
NamedIntrinsic widenCvtIntrinsic =
21288+
varTypeIsByte(simdBaseType)
21289+
? (varTypeIsSigned(simdBaseType) ? NI_AVX512_ConvertToVector512Int32
21290+
: NI_AVX512_ConvertToVector512UInt32)
21291+
: NI_AVX2_ConvertToVector256Int32;
21292+
NamedIntrinsic narrowCvtIntrinsic =
21293+
varTypeIsByte(simdBaseType)
21294+
? (varTypeIsSigned(simdBaseType) ? NI_AVX512_ConvertToVector128SByte
21295+
: NI_AVX512_ConvertToVector128Byte)
21296+
: (varTypeIsSigned(simdBaseType) ? NI_AVX512_ConvertToVector128Int16
21297+
: NI_AVX512_ConvertToVector128UInt16);
21298+
var_types cvtType = varTypeIsByte(simdBaseType) ? TYP_SIMD64 : TYP_SIMD32;
21299+
int cvtSize = varTypeIsByte(simdBaseType) ? 64 : 32;
21300+
21301+
op1 = gtNewSimdHWIntrinsicNode(cvtType, op1, widenCvtIntrinsic, simdBaseJitType, cvtSize);
21302+
op2 = gtNewSimdHWIntrinsicNode(cvtType, op2, widenCvtIntrinsic, simdBaseJitType, cvtSize);
21303+
GenTree* div = gtNewSimdBinOpNode(GT_DIV, cvtType, op1, op2, cvtBaseType, cvtSize);
21304+
return gtNewSimdHWIntrinsicNode(type, div, narrowCvtIntrinsic, cvtBaseType, cvtSize);
21305+
}
21306+
CorInfoType signedType = varTypeIsShort(simdBaseType) ? CORINFO_TYPE_INT : CORINFO_TYPE_SHORT;
21307+
CorInfoType unsignedType = varTypeIsShort(simdBaseType) ? CORINFO_TYPE_UINT : CORINFO_TYPE_USHORT;
21308+
CorInfoType cvtType = varTypeIsSigned(simdBaseType) ? signedType : unsignedType;
21309+
GenTree* op1Dup = fgMakeMultiUse(&op1);
21310+
GenTree* op2Dup = fgMakeMultiUse(&op2);
21311+
GenTree* op1LowerWiden = gtNewSimdWidenLowerNode(type, op1, simdBaseJitType, simdSize);
21312+
GenTree* op2LowerWiden = gtNewSimdWidenLowerNode(type, op2, simdBaseJitType, simdSize);
21313+
GenTree* divLower =
21314+
gtNewSimdBinOpNode(GT_DIV, type, op1LowerWiden, op2LowerWiden, cvtType, simdSize);
21315+
GenTree* op1UpperWiden = gtNewSimdWidenUpperNode(type, op1Dup, simdBaseJitType, simdSize);
21316+
GenTree* op2UpperWiden = gtNewSimdWidenUpperNode(type, op2Dup, simdBaseJitType, simdSize);
21317+
GenTree* divUpper =
21318+
gtNewSimdBinOpNode(GT_DIV, type, op1UpperWiden, op2UpperWiden, cvtType, simdSize);
21319+
return gtNewSimdNarrowNode(type, divLower, divUpper, simdBaseJitType, simdSize);
21320+
}
21321+
21322+
assert(varTypeIsInt(simdBaseType));
21323+
21324+
if (compOpportunisticallyDependsOn(InstructionSet_AVX512) && simdSize == 32)
21325+
{
21326+
return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_Vector256_op_Division, simdBaseJitType,
21327+
simdSize);
21328+
}
2126121329

21262-
NamedIntrinsic divIntrinsic = simdSize == 16 ? NI_Vector128_op_Division : NI_Vector256_op_Division;
21263-
unsigned int divideOpSimdSize = simdSize * 2;
21330+
assert(simdSize == 16);
21331+
21332+
if (compOpportunisticallyDependsOn(InstructionSet_AVX))
21333+
{
21334+
return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_Vector128_op_Division, simdBaseJitType,
21335+
simdSize);
21336+
}
2126421337

21265-
GenTree* divOp =
21266-
gtNewSimdHWIntrinsicNode(op1->TypeGet(), op1, op2, divIntrinsic, simdBaseJitType, divideOpSimdSize);
21267-
return divOp;
21338+
GenTree* op1Dup = fgMakeMultiUse(&op1);
21339+
GenTree* op2Dup = fgMakeMultiUse(&op2);
21340+
GenTree* op1Dup2 = fgMakeMultiUse(&op1Dup);
21341+
GenTree* op2Dup2 = fgMakeMultiUse(&op2Dup);
21342+
GenTree* op1Hi =
21343+
gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_X86Base_MoveHighToLow, CORINFO_TYPE_FLOAT, simdSize);
21344+
GenTree* op2Hi =
21345+
gtNewSimdHWIntrinsicNode(type, op2, op2Dup, NI_X86Base_MoveHighToLow, CORINFO_TYPE_FLOAT, simdSize);
21346+
GenTree* divLo = gtNewSimdHWIntrinsicNode(type, op1Dup2, op2Dup2, NI_Vector128_op_Division,
21347+
simdBaseJitType, simdSize);
21348+
GenTree* divHi =
21349+
gtNewSimdHWIntrinsicNode(type, op1Hi, op2Hi, NI_Vector128_op_Division, simdBaseJitType, simdSize);
21350+
GenTree* div = gtNewSimdHWIntrinsicNode(type, divHi, divLo, NI_X86Base_MoveLowToHigh,
21351+
CORINFO_TYPE_FLOAT, simdSize);
21352+
return gtNewSimdHWIntrinsicNode(type, div, gtNewIconNode(0x4E), NI_X86Base_Shuffle, simdBaseJitType,
21353+
simdSize);
2126821354
}
2126921355
unreached();
2127021356
}
@@ -29727,7 +29813,7 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
2972729813
case GT_DIV:
2972829814
{
2972929815
#if defined(TARGET_XARCH)
29730-
assert(varTypeIsFloating(simdBaseType) || varTypeIsInt(simdBaseType));
29816+
assert(varTypeIsFloating(simdBaseType) || !varTypeIsLong(simdBaseType));
2973129817
#else
2973229818
assert(varTypeIsFloating(simdBaseType));
2973329819
#endif

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 116 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2330,40 +2330,132 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
23302330
// Vector256<long> div_i64 = Vector256.ConvertToInt64(div_f64);
23312331
// Vector128<int> div_i32 = Vector256.Narrow(div_i64.GetLower(), div_i64.GetUpper());
23322332
// return div_i32;
2333-
regNumber op2Reg = op2->GetRegNum();
2334-
regNumber tmpReg1 = internalRegisters.Extract(node, RBM_ALLFLOAT);
2333+
regNumber op2Reg = op2->GetRegNum();
2334+
regNumber tmpReg1 = REG_NA;
2335+
if (!compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
2336+
{
2337+
tmpReg1 = internalRegisters.Extract(node, compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
2338+
? RBM_ALLFLOAT
2339+
: SRBM_XMM0);
2340+
}
23352341
regNumber tmpReg2 = internalRegisters.Extract(node, RBM_ALLFLOAT);
2336-
emitAttr typeSize = emitTypeSize(node->TypeGet());
2342+
regNumber tmpReg3 = internalRegisters.Extract(node, RBM_ALLFLOAT);
2343+
var_types nodeType = node->TypeGet();
2344+
emitAttr typeSize = emitTypeSize(nodeType);
23372345
noway_assert(typeSize == EA_16BYTE || typeSize == EA_32BYTE);
2338-
emitAttr divTypeSize = typeSize == EA_16BYTE ? EA_32BYTE : EA_64BYTE;
2346+
emitAttr divTypeSize = typeSize;
23392347

2340-
simd_t negOneIntVec = simd_t::AllBitsSet();
2341-
simd_t minValueInt{};
2342-
int numElements = genTypeSize(node->TypeGet()) / 4;
2343-
for (int i = 0; i < numElements; i++)
2348+
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
23442349
{
2345-
minValueInt.i32[i] = INT_MIN;
2350+
divTypeSize = typeSize == EA_16BYTE ? EA_32BYTE : EA_64BYTE;
23462351
}
2347-
CORINFO_FIELD_HANDLE minValueFld = emit->emitSimdConst(&minValueInt, typeSize);
2348-
CORINFO_FIELD_HANDLE negOneFld = emit->emitSimdConst(&negOneIntVec, typeSize);
2352+
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX) && typeSize == EA_16BYTE)
2353+
{
2354+
divTypeSize = EA_32BYTE;
2355+
}
2356+
simd_t negOneIntVec = simd_t::AllBitsSet();
2357+
CORINFO_FIELD_HANDLE negOneFld = emit->emitSimdConst(&negOneIntVec, typeSize);
23492358

23502359
// div-by-zero check
2351-
emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions);
2352-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, typeSize, tmpReg1, tmpReg1, op2Reg, instOptions);
2353-
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions);
2360+
emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg2, tmpReg2, tmpReg2, instOptions);
2361+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, typeSize, tmpReg2, tmpReg2, op2Reg, instOptions);
2362+
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg2, tmpReg2, instOptions);
23542363
genJumpToThrowHlpBlk(EJ_jne, SCK_DIV_BY_ZERO);
23552364

23562365
// overflow check
2357-
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg1, op1Reg, minValueFld, 0, instOptions);
2358-
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg2, op2Reg, negOneFld, 0, instOptions);
2359-
emit->emitIns_SIMD_R_R_R(INS_pandd, typeSize, tmpReg1, tmpReg1, tmpReg2, instOptions);
2360-
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions);
2361-
genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
2362-
2363-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2364-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg2, op2Reg, instOptions);
2365-
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, targetReg, tmpReg1, tmpReg2, instOptions);
2366-
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, targetReg, targetReg, instOptions);
2366+
if (varTypeIsSigned(baseType))
2367+
{
2368+
simd_t minValueInt{};
2369+
int numElements = genTypeSize(nodeType) / 4;
2370+
for (int i = 0; i < numElements; i++)
2371+
{
2372+
minValueInt.i32[i] = INT_MIN;
2373+
}
2374+
CORINFO_FIELD_HANDLE minValueFld = emit->emitSimdConst(&minValueInt, typeSize);
2375+
2376+
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg2, op1Reg, minValueFld, 0, instOptions);
2377+
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg3, op2Reg, negOneFld, 0, instOptions);
2378+
emit->emitIns_SIMD_R_R_R(INS_pandd, typeSize, tmpReg2, tmpReg2, tmpReg3, instOptions);
2379+
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg2, tmpReg2, instOptions);
2380+
genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
2381+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg2, op1Reg, instOptions);
2382+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg3, op2Reg, instOptions);
2383+
}
2384+
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
2385+
{
2386+
emit->emitIns_R_R(INS_vcvtudq2pd, divTypeSize, tmpReg2, op1Reg, instOptions);
2387+
emit->emitIns_R_R(INS_vcvtudq2pd, divTypeSize, tmpReg3, op2Reg, instOptions);
2388+
}
2389+
else
2390+
{
2391+
simd_t double2To32Const{};
2392+
int numElements = genTypeSize(nodeType) / 2;
2393+
for (int i = 0; i < numElements; i++)
2394+
{
2395+
double2To32Const.f64[i] = 4294967296.0; // 2^32
2396+
}
2397+
CORINFO_FIELD_HANDLE double2To32ConstFld = emit->emitSimdConst(&double2To32Const, divTypeSize);
2398+
2399+
// Convert uint -> double
2400+
// tmpReg2 = double(op1Reg)
2401+
// tmpReg3 = double(op2Reg)
2402+
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
2403+
{
2404+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2405+
emit->emitIns_Mov(INS_movups, divTypeSize, tmpReg2, tmpReg1, false, instOptions);
2406+
emit->emitIns_R_C(INS_addpd, divTypeSize, tmpReg2, double2To32ConstFld, instOptions);
2407+
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, divTypeSize, tmpReg2, tmpReg1, tmpReg2, tmpReg1,
2408+
instOptions);
2409+
2410+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op2Reg, instOptions);
2411+
emit->emitIns_Mov(INS_movups, divTypeSize, tmpReg3, tmpReg1, false, instOptions);
2412+
emit->emitIns_R_C(INS_addpd, divTypeSize, tmpReg3, double2To32ConstFld, instOptions);
2413+
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, divTypeSize, tmpReg3, tmpReg1, tmpReg3, tmpReg1,
2414+
instOptions);
2415+
}
2416+
else
2417+
{
2418+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2419+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, tmpReg1, false, instOptions);
2420+
emit->emitIns_R_C(INS_addpd, typeSize, tmpReg2, double2To32ConstFld, instOptions);
2421+
emit->emitIns_R_R(INS_blendvpd, typeSize, tmpReg1, tmpReg2, instOptions);
2422+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, tmpReg1, instOptions);
2423+
2424+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op2Reg, instOptions);
2425+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, tmpReg1, false, instOptions);
2426+
emit->emitIns_R_C(INS_addpd, typeSize, tmpReg3, double2To32ConstFld, instOptions);
2427+
emit->emitIns_R_R(INS_blendvpd, typeSize, tmpReg1, tmpReg3, instOptions);
2428+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, tmpReg1, instOptions);
2429+
}
2430+
}
2431+
2432+
if (varTypeIsSigned(baseType) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
2433+
{
2434+
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, targetReg, tmpReg2, tmpReg3, instOptions);
2435+
emit->emitIns_R_R(varTypeIsSigned(baseType) ? INS_cvttpd2dq : INS_vcvttpd2udq, divTypeSize, targetReg,
2436+
targetReg, instOptions);
2437+
}
2438+
else
2439+
{
2440+
assert(varTypeIsUnsigned(baseType));
2441+
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, tmpReg1, tmpReg2, tmpReg3, instOptions);
2442+
2443+
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
2444+
{
2445+
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, tmpReg3, tmpReg1, instOptions);
2446+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg1, op1Reg, instOptions);
2447+
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, typeSize, targetReg, tmpReg3, tmpReg1, tmpReg3,
2448+
instOptions);
2449+
}
2450+
else
2451+
{
2452+
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, tmpReg1, tmpReg1, instOptions);
2453+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, op1Reg, instOptions);
2454+
emit->emitIns_R_R(INS_blendvpd, typeSize, tmpReg1, tmpReg2, instOptions);
2455+
emit->emitIns_Mov(INS_movups, typeSize, targetReg, tmpReg1, false);
2456+
}
2457+
}
2458+
23672459
break;
23682460
}
23692461

src/coreclr/jit/hwintrinsicxarch.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2270,9 +2270,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
22702270
{
22712271
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
22722272
// Check to see if it is possible to emulate the integer division
2273-
if (!(simdBaseType == TYP_INT &&
2274-
((simdSize == 16 && compOpportunisticallyDependsOn(InstructionSet_AVX)) ||
2275-
(simdSize == 32 && compOpportunisticallyDependsOn(InstructionSet_AVX512)))))
2273+
if (varTypeIsLong(simdBaseType))
22762274
{
22772275
break;
22782276
}

src/coreclr/jit/lsraxarch.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2838,6 +2838,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
28382838

28392839
// get a tmp register for overflow check
28402840
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
2841+
2842+
if (!compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
2843+
{
2844+
// If AVX is not supported, we need to specifically allocate XMM0 because we will eventually
2845+
// generate a blendvpd, which requires XMM0 specifically for the mask register.
2846+
buildInternalFloatRegisterDefForNode(intrinsicTree,
2847+
compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
2848+
? lowSIMDRegs()
2849+
: SRBM_XMM0);
2850+
}
28412851
setInternalRegsDelayFree = true;
28422852

28432853
buildUses = false;

0 commit comments

Comments
 (0)