Skip to content

Commit c2751de

Browse files
author
Alex Covington (Advanced Micro Devices Inc)
committed
Fix bug with registor allocation, make sure to allocate XMM0 when AVX is not supported
1 parent f81f950 commit c2751de

File tree

3 files changed

+65
-70
lines changed

3 files changed

+65
-70
lines changed

src/coreclr/jit/gentree.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21427,22 +21427,21 @@ GenTree* Compiler::gtNewSimdBinOpNode(
2142721427

2142821428
assert(compIsaSupportedDebugOnly(InstructionSet_SSE42));
2142921429
assert(simdSize == 16);
21430-
GenTree* op1Dup = fgMakeMultiUse(&op1);
21431-
GenTree* op2Dup = fgMakeMultiUse(&op2);
21430+
GenTree* op1Dup = fgMakeMultiUse(&op1);
21431+
GenTree* op2Dup = fgMakeMultiUse(&op2);
21432+
GenTree* op1Dup2 = fgMakeMultiUse(&op1Dup);
21433+
GenTree* op2Dup2 = fgMakeMultiUse(&op2Dup);
2143221434
GenTree* op1Hi =
2143321435
gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_X86Base_MoveHighToLow, CORINFO_TYPE_FLOAT, simdSize);
2143421436
GenTree* op2Hi =
2143521437
gtNewSimdHWIntrinsicNode(type, op2, op2Dup, NI_X86Base_MoveHighToLow, CORINFO_TYPE_FLOAT, simdSize);
21436-
GenTree* op1Dup2 = fgMakeMultiUse(&op1Dup);
21437-
GenTree* op2Dup2 = fgMakeMultiUse(&op2Dup);
21438-
GenTree* divHi =
21439-
gtNewSimdHWIntrinsicNode(type, op1Hi, op2Hi, NI_Vector128_op_Division, simdBaseJitType, simdSize);
2144021438
GenTree* divLo = gtNewSimdHWIntrinsicNode(type, op1Dup2, op2Dup2, NI_Vector128_op_Division,
2144121439
simdBaseJitType, simdSize);
21442-
21440+
GenTree* divHi =
21441+
gtNewSimdHWIntrinsicNode(type, op1Hi, op2Hi, NI_Vector128_op_Division, simdBaseJitType, simdSize);
2144321442
GenTree* div = gtNewSimdHWIntrinsicNode(type, divHi, divLo, NI_X86Base_MoveLowToHigh,
2144421443
CORINFO_TYPE_FLOAT, simdSize);
21445-
return gtNewSimdHWIntrinsicNode(type, div, gtNewIconNode(0x4E), NI_X86Base_Shuffle, CORINFO_TYPE_INT,
21444+
return gtNewSimdHWIntrinsicNode(type, div, gtNewIconNode(0x4E), NI_X86Base_Shuffle, simdBaseJitType,
2144621445
simdSize);
2144721446
}
2144821447
unreached();

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 51 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2360,10 +2360,16 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
23602360
// Vector256<long> div_i64 = Vector256.ConvertToInt64(div_f64);
23612361
// Vector128<int> div_i32 = Vector256.Narrow(div_i64.GetLower(), div_i64.GetUpper());
23622362
// return div_i32;
2363-
regNumber op2Reg = op2->GetRegNum();
2364-
regNumber tmpReg1 = internalRegisters.Extract(node, RBM_ALLFLOAT);
2363+
regNumber op2Reg = op2->GetRegNum();
2364+
regNumber tmpReg1 = REG_NA;
2365+
if (!compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
2366+
{
2367+
tmpReg1 = internalRegisters.Extract(node, compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
2368+
? RBM_ALLFLOAT
2369+
: SRBM_XMM0);
2370+
}
23652371
regNumber tmpReg2 = internalRegisters.Extract(node, RBM_ALLFLOAT);
2366-
regNumber tmpReg3 = targetReg;
2372+
regNumber tmpReg3 = internalRegisters.Extract(node, RBM_ALLFLOAT);
23672373
var_types nodeType = node->TypeGet();
23682374
emitAttr typeSize = emitTypeSize(nodeType);
23692375
noway_assert(typeSize == EA_16BYTE || typeSize == EA_32BYTE);
@@ -2381,9 +2387,9 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
23812387
CORINFO_FIELD_HANDLE negOneFld = emit->emitSimdConst(&negOneIntVec, typeSize);
23822388

23832389
// div-by-zero check
2384-
emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg1, tmpReg1, tmpReg1, instOptions);
2385-
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, typeSize, tmpReg1, tmpReg1, op2Reg, instOptions);
2386-
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions);
2390+
emit->emitIns_SIMD_R_R_R(INS_xorpd, typeSize, tmpReg2, tmpReg2, tmpReg2, instOptions);
2391+
emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, typeSize, tmpReg2, tmpReg2, op2Reg, instOptions);
2392+
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg2, tmpReg2, instOptions);
23872393
genJumpToThrowHlpBlk(EJ_jne, SCK_DIV_BY_ZERO);
23882394

23892395
// overflow check
@@ -2397,18 +2403,18 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
23972403
}
23982404
CORINFO_FIELD_HANDLE minValueFld = emit->emitSimdConst(&minValueInt, typeSize);
23992405

2400-
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg1, op1Reg, minValueFld, 0, instOptions);
2401-
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg2, op2Reg, negOneFld, 0, instOptions);
2402-
emit->emitIns_SIMD_R_R_R(INS_pandd, typeSize, tmpReg1, tmpReg1, tmpReg2, instOptions);
2403-
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg1, tmpReg1, instOptions);
2406+
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg2, op1Reg, minValueFld, 0, instOptions);
2407+
emit->emitIns_SIMD_R_R_C(INS_pcmpeqd, typeSize, tmpReg3, op2Reg, negOneFld, 0, instOptions);
2408+
emit->emitIns_SIMD_R_R_R(INS_pandd, typeSize, tmpReg2, tmpReg2, tmpReg3, instOptions);
2409+
emit->emitIns_R_R(INS_ptest, typeSize, tmpReg2, tmpReg2, instOptions);
24042410
genJumpToThrowHlpBlk(EJ_jne, SCK_OVERFLOW);
2405-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2406-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg2, op2Reg, instOptions);
2411+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg2, op1Reg, instOptions);
2412+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg3, op2Reg, instOptions);
24072413
}
24082414
else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
24092415
{
2410-
emit->emitIns_R_R(INS_vcvtudq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2411-
emit->emitIns_R_R(INS_vcvtudq2pd, divTypeSize, tmpReg2, op2Reg, instOptions);
2416+
emit->emitIns_R_R(INS_vcvtudq2pd, divTypeSize, tmpReg2, op1Reg, instOptions);
2417+
emit->emitIns_R_R(INS_vcvtudq2pd, divTypeSize, tmpReg3, op2Reg, instOptions);
24122418
}
24132419
else
24142420
{
@@ -2419,79 +2425,64 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions)
24192425
double2To32Const.f64[i] = 4294967296.0; // 2^32
24202426
}
24212427
CORINFO_FIELD_HANDLE double2To32ConstFld = emit->emitSimdConst(&double2To32Const, divTypeSize);
2422-
tmpReg3 = internalRegisters.Extract(node, RBM_ALLFLOAT);
24232428

2429+
// Convert uint -> double
2430+
// tmpReg2 = double(op1Reg)
2431+
// tmpReg3 = double(op2Reg)
24242432
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
24252433
{
2426-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg3, op1Reg, instOptions);
2427-
emit->emitIns_Mov(INS_movups, divTypeSize, tmpReg1, tmpReg3, false, instOptions);
2428-
emit->emitIns_R_C(INS_addpd, divTypeSize, tmpReg1, double2To32ConstFld, instOptions);
2429-
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, divTypeSize, tmpReg1, tmpReg3, tmpReg1, tmpReg3,
2434+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2435+
emit->emitIns_Mov(INS_movups, divTypeSize, tmpReg2, tmpReg1, false, instOptions);
2436+
emit->emitIns_R_C(INS_addpd, divTypeSize, tmpReg2, double2To32ConstFld, instOptions);
2437+
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, divTypeSize, tmpReg2, tmpReg1, tmpReg2, tmpReg1,
24302438
instOptions);
24312439

2432-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg3, op2Reg, instOptions);
2433-
emit->emitIns_Mov(INS_movups, divTypeSize, tmpReg2, tmpReg3, false, instOptions);
2434-
emit->emitIns_R_C(INS_addpd, divTypeSize, tmpReg2, double2To32ConstFld, instOptions);
2435-
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, divTypeSize, tmpReg2, tmpReg3, tmpReg2, tmpReg3,
2440+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op2Reg, instOptions);
2441+
emit->emitIns_Mov(INS_movups, divTypeSize, tmpReg3, tmpReg1, false, instOptions);
2442+
emit->emitIns_R_C(INS_addpd, divTypeSize, tmpReg3, double2To32ConstFld, instOptions);
2443+
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, divTypeSize, tmpReg3, tmpReg1, tmpReg3, tmpReg1,
24362444
instOptions);
24372445
}
24382446
else
24392447
{
2440-
// blendvpd requires that the mask value is in XMM0. Preserve XMM0 in a tmp reg while we convert
2441-
// from uint -> long -> double, then restore XMM0 when we're done
2442-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, JITREG_XMM0, false);
2443-
2444-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, JITREG_XMM0, op1Reg, instOptions);
2445-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg1, JITREG_XMM0, false, instOptions);
2446-
emit->emitIns_R_C(INS_addpd, typeSize, tmpReg1, double2To32ConstFld, instOptions);
2447-
emit->emitIns_R_R(INS_blendvpd, typeSize, JITREG_XMM0, tmpReg1, instOptions);
2448-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg1, JITREG_XMM0, instOptions);
2449-
2450-
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, JITREG_XMM0, op2Reg, instOptions);
2451-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, JITREG_XMM0, false, instOptions);
2448+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op1Reg, instOptions);
2449+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, tmpReg1, false, instOptions);
24522450
emit->emitIns_R_C(INS_addpd, typeSize, tmpReg2, double2To32ConstFld, instOptions);
2453-
emit->emitIns_R_R(INS_blendvpd, typeSize, JITREG_XMM0, tmpReg2, instOptions);
2454-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, JITREG_XMM0, instOptions);
2455-
2456-
emit->emitIns_Mov(INS_movups, typeSize, JITREG_XMM0, tmpReg3, false);
2451+
emit->emitIns_R_R(INS_blendvpd, typeSize, tmpReg1, tmpReg2, instOptions);
2452+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, tmpReg1, instOptions);
2453+
2454+
emit->emitIns_R_R(INS_cvtdq2pd, divTypeSize, tmpReg1, op2Reg, instOptions);
2455+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, tmpReg1, false, instOptions);
2456+
emit->emitIns_R_C(INS_addpd, typeSize, tmpReg3, double2To32ConstFld, instOptions);
2457+
emit->emitIns_R_R(INS_blendvpd, typeSize, tmpReg1, tmpReg3, instOptions);
2458+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, tmpReg1, instOptions);
24572459
}
24582460
}
24592461

2460-
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, tmpReg3, tmpReg1, tmpReg2, instOptions);
2461-
24622462
if (varTypeIsSigned(baseType) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
24632463
{
2464+
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, targetReg, tmpReg2, tmpReg3, instOptions);
24642465
emit->emitIns_R_R(varTypeIsSigned(baseType) ? INS_cvttpd2dq : INS_vcvttpd2udq, divTypeSize, targetReg,
2465-
tmpReg3, instOptions);
2466+
targetReg, instOptions);
24662467
}
24672468
else
24682469
{
24692470
assert(varTypeIsUnsigned(baseType));
2471+
emit->emitIns_SIMD_R_R_R(INS_divpd, divTypeSize, tmpReg1, tmpReg2, tmpReg3, instOptions);
24702472

24712473
if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
24722474
{
2473-
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, tmpReg2, tmpReg3, instOptions);
2474-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, op1Reg, instOptions);
2475-
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, typeSize, targetReg, tmpReg2, tmpReg3, tmpReg2,
2475+
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, tmpReg3, tmpReg1, instOptions);
2476+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg1, op1Reg, instOptions);
2477+
emit->emitIns_SIMD_R_R_R_R(INS_blendvpd, typeSize, targetReg, tmpReg3, tmpReg1, tmpReg3,
24762478
instOptions);
24772479
}
24782480
else
24792481
{
2480-
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, tmpReg2, tmpReg3, instOptions);
2481-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg3, op1Reg, instOptions);
2482-
if (targetReg != JITREG_XMM0)
2483-
{
2484-
emit->emitIns_Mov(INS_movups, typeSize, tmpReg1, JITREG_XMM0, false);
2485-
}
2486-
2487-
emit->emitIns_Mov(INS_movups, typeSize, JITREG_XMM0, tmpReg2, instOptions);
2488-
emit->emitIns_R_R(INS_blendvpd, typeSize, JITREG_XMM0, tmpReg3, instOptions);
2489-
2490-
if (targetReg != JITREG_XMM0)
2491-
{
2492-
emit->emitIns_Mov(INS_movups, typeSize, targetReg, JITREG_XMM0, false);
2493-
emit->emitIns_Mov(INS_movups, typeSize, JITREG_XMM0, tmpReg1, false);
2494-
}
2482+
emit->emitIns_R_R(INS_cvttpd2dq, divTypeSize, tmpReg1, tmpReg1, instOptions);
2483+
emit->emitIns_Mov(INS_movups, typeSize, tmpReg2, op1Reg, instOptions);
2484+
emit->emitIns_R_R(INS_blendvpd, typeSize, tmpReg1, tmpReg2, instOptions);
2485+
emit->emitIns_Mov(INS_movups, typeSize, targetReg, tmpReg1, false);
24952486
}
24962487
}
24972488

src/coreclr/jit/lsraxarch.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2858,9 +2858,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
28582858
// get a tmp register for overflow check
28592859
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
28602860

2861-
if (baseType == TYP_UINT && !compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
2861+
if (!compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512))
28622862
{
2863-
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
2863+
// If AVX is not supported, we need to specifically allocate XMM0 because we will eventually
2864+
// generate a pblendvpd, which requires XMM0 specifically for the mask register.
2865+
buildInternalFloatRegisterDefForNode(intrinsicTree,
2866+
compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
2867+
? lowSIMDRegs()
2868+
: SRBM_XMM0);
28642869
}
28652870
setInternalRegsDelayFree = true;
28662871

0 commit comments

Comments
 (0)