Skip to content

Commit fdafc7c

Browse files
Adding support for X86Base.Pause() and ArmBase.Yield() (#61065)
* Adding support for X86Base.Pause() and ArmBase.Yield()
* Applying formatting patch
* Ensure NI_ArmBase_Yield actually gets through to codegen on arm64
1 parent a93e0d2 commit fdafc7c

25 files changed

+273
-38
lines changed

src/coreclr/jit/compiler.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3318,6 +3318,7 @@ class Compiler
33183318
unsigned simdSize,
33193319
bool isSimdAsHWIntrinsic);
33203320

3321+
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID);
33213322
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID);
33223323
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(var_types type,
33233324
GenTree* op1,

src/coreclr/jit/emit.h

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,21 +1240,22 @@ class emitter
12401240

12411241
#define PERFSCORE_THROUGHPUT_1C 1.0f // Single Issue
12421242

1243-
#define PERFSCORE_THROUGHPUT_2C 2.0f // slower - 2 cycles
1244-
#define PERFSCORE_THROUGHPUT_3C 3.0f // slower - 3 cycles
1245-
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
1246-
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
1247-
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
1248-
#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles
1249-
#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles
1250-
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
1251-
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
1252-
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
1253-
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
1254-
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
1255-
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
1256-
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
1257-
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
1243+
#define PERFSCORE_THROUGHPUT_2C 2.0f // slower - 2 cycles
1244+
#define PERFSCORE_THROUGHPUT_3C 3.0f // slower - 3 cycles
1245+
#define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles
1246+
#define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles
1247+
#define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles
1248+
#define PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles
1249+
#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles
1250+
#define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles
1251+
#define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
1252+
#define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
1253+
#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
1254+
#define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
1255+
#define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
1256+
#define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
1257+
#define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
1258+
#define PERFSCORE_THROUGHPUT_140C 140.0f // slower - 140 cycles
12581259

12591260
#define PERFSCORE_LATENCY_ILLEGAL -1024.0f
12601261

@@ -1281,6 +1282,7 @@ class emitter
12811282
#define PERFSCORE_LATENCY_26C 26.0f
12821283
#define PERFSCORE_LATENCY_62C 62.0f
12831284
#define PERFSCORE_LATENCY_69C 69.0f
1285+
#define PERFSCORE_LATENCY_140C 140.0f
12841286
#define PERFSCORE_LATENCY_400C 400.0f // Intel microcode issue with these instructions
12851287

12861288
#define PERFSCORE_LATENCY_BRANCH_DIRECT 1.0f // cost of an unconditional branch

src/coreclr/jit/emitarm64.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14588,6 +14588,12 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1458814588
result.insThroughput = PERFSCORE_THROUGHPUT_ZERO;
1458914589
result.insLatency = PERFSCORE_LATENCY_ZERO;
1459014590
}
14591+
else if (ins == INS_yield)
14592+
{
14593+
// @ToDo - find out the actual latency, match x86/x64 for now
14594+
result.insThroughput = PERFSCORE_THROUGHPUT_140C;
14595+
result.insLatency = PERFSCORE_LATENCY_140C;
14596+
}
1459114597
else
1459214598
{
1459314599
result.insThroughput = PERFSCORE_THROUGHPUT_2X;

src/coreclr/jit/emitxarch.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2898,7 +2898,8 @@ void emitter::emitIns(instruction ins)
28982898
ins == INS_r_movsp || ins == INS_r_stosb || ins == INS_r_stosd || ins == INS_r_stosp || ins == INS_ret ||
28992899
ins == INS_sahf || ins == INS_stosb || ins == INS_stosd || ins == INS_stosp
29002900
// These instructions take zero operands
2901-
|| ins == INS_vzeroupper || ins == INS_lfence || ins == INS_mfence || ins == INS_sfence);
2901+
|| ins == INS_vzeroupper || ins == INS_lfence || ins == INS_mfence || ins == INS_sfence ||
2902+
ins == INS_pause);
29022903

29032904
assert(assertCond);
29042905
}
@@ -12333,8 +12334,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id)
1233312334
// Due to elided register moves, we can't have the following assert.
1233412335
// For example, consider:
1233512336
// t85 = LCL_VAR byref V01 arg1 rdx (last use) REG rdx
12336-
// /--* t85 byref
12337-
// * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
12337+
// /--* t85 byref
12338+
// * STORE_LCL_VAR byref V40 tmp31 rdx REG rdx
1233812339
// Here, V01 is type `long` on entry, then is stored as a byref. But because
1233912340
// the register allocator assigned the same register, no instruction was
1234012341
// generated, and we only (currently) make gcref/byref changes in emitter GC info
@@ -16104,6 +16105,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
1610416105
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
1610516106
break;
1610616107

16108+
case INS_pause:
16109+
{
16110+
result.insLatency = PERFSCORE_LATENCY_140C;
16111+
result.insThroughput = PERFSCORE_THROUGHPUT_140C;
16112+
break;
16113+
}
16114+
1610716115
default:
1610816116
// unhandled instruction insFmt combination
1610916117
perfScoreUnhandledInstruction(id, &result);

src/coreclr/jit/gentree.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21798,6 +21798,12 @@ GenTree* Compiler::gtNewSimdZeroNode(var_types type,
2179821798
return gtNewSimdHWIntrinsicNode(type, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic);
2179921799
}
2180021800

21801+
//------------------------------------------------------------------------
// gtNewScalarHWIntrinsicNode: Create a scalar hardware intrinsic node that takes no operands.
//
// Arguments:
//    type          - the type given to the new node
//    hwIntrinsicID - the intrinsic the node represents
//
// Return Value:
//    A newly allocated GT_HWINTRINSIC node with no operands.
//
// Notes:
//    The node is constructed with CORINFO_TYPE_UNDEF as its base JIT type and with
//    isSimdAsHWIntrinsic set to false. The literal 0 is presumably the SIMD size
//    (TODO confirm against the GenTreeHWIntrinsic constructor).
//
GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID)
21802+
{
21803+
return new (this, GT_HWINTRINSIC)
21804+
GenTreeHWIntrinsic(type, hwIntrinsicID, CORINFO_TYPE_UNDEF, 0, /* isSimdAsHWIntrinsic */ false);
21805+
}
21806+
2180121807
GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID)
2180221808
{
2180321809
SetOpLclRelatedToSIMDIntrinsic(op1);

src/coreclr/jit/hwintrinsic.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -804,7 +804,7 @@ struct HWIntrinsic final
804804

805805
if (baseType == TYP_UNKNOWN)
806806
{
807-
assert(category == HW_Category_Scalar);
807+
assert((category == HW_Category_Scalar) || (category == HW_Category_Special));
808808

809809
if (HWIntrinsicInfo::BaseTypeFromFirstArg(id))
810810
{

src/coreclr/jit/hwintrinsicarm64.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -308,17 +308,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
308308
var_types retType,
309309
unsigned simdSize)
310310
{
311-
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
312-
int numArgs = sig->numArgs;
313-
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
311+
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
312+
int numArgs = sig->numArgs;
314313

315314
if (!featureSIMD || !IsBaselineSimdIsaSupported())
316315
{
317316
return nullptr;
318317
}
319318

320319
assert(numArgs >= 0);
321-
assert(varTypeIsArithmetic(simdBaseType));
320+
321+
var_types simdBaseType = TYP_UNKNOWN;
322+
323+
if (intrinsic != NI_ArmBase_Yield)
324+
{
325+
simdBaseType = JitType2PreciseVarType(simdBaseJitType);
326+
assert(varTypeIsArithmetic(simdBaseType));
327+
}
322328

323329
GenTree* retNode = nullptr;
324330
GenTree* op1 = nullptr;
@@ -327,6 +333,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
327333

328334
switch (intrinsic)
329335
{
336+
case NI_ArmBase_Yield:
337+
{
338+
assert(sig->numArgs == 0);
339+
assert(JITtype2varType(sig->retType) == TYP_VOID);
340+
assert(simdSize == 0);
341+
342+
retNode = gtNewScalarHWIntrinsicNode(TYP_VOID, intrinsic);
343+
break;
344+
}
345+
330346
case NI_Vector64_Abs:
331347
case NI_Vector128_Abs:
332348
{

src/coreclr/jit/hwintrinsiccodegenarm64.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
255255
emitSize = emitActualTypeSize(intrin.baseType);
256256
opt = INS_OPTS_NONE;
257257
}
258+
else if (intrin.category == HW_Category_Special)
259+
{
260+
assert(intrin.id == NI_ArmBase_Yield);
261+
262+
emitSize = EA_UNKNOWN;
263+
opt = INS_OPTS_NONE;
264+
}
258265
else
259266
{
260267
emitSize = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize()));
@@ -443,6 +450,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
443450
}
444451
break;
445452

453+
case NI_ArmBase_Yield:
454+
{
455+
ins = INS_yield;
456+
break;
457+
}
458+
446459
default:
447460
ins = HWIntrinsicInfo::lookupIns(intrin.id, intrin.baseType);
448461
break;
@@ -735,6 +748,12 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
735748
}
736749
break;
737750

751+
case NI_ArmBase_Yield:
752+
{
753+
GetEmitter()->emitIns(ins);
754+
break;
755+
}
756+
738757
// mvni doesn't support the range of element types, so hard code the 'opts' value.
739758
case NI_Vector64_get_Zero:
740759
case NI_Vector64_get_AllBitsSet:

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,6 +1447,8 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node)
14471447
{
14481448
NamedIntrinsic intrinsicId = node->gtHWIntrinsicId;
14491449

1450+
genConsumeOperands(node);
1451+
14501452
switch (intrinsicId)
14511453
{
14521454
case NI_X86Base_BitScanForward:
@@ -1459,16 +1461,25 @@ void CodeGen::genX86BaseIntrinsic(GenTreeHWIntrinsic* node)
14591461
var_types targetType = node->TypeGet();
14601462
instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, targetType);
14611463

1462-
genConsumeOperands(node);
14631464
genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType), targetReg, op1);
1464-
genProduceReg(node);
1465+
break;
1466+
}
1467+
1468+
case NI_X86Base_Pause:
1469+
{
1470+
assert(node->GetSimdBaseType() == TYP_UNKNOWN);
1471+
assert(node->gtGetOp1() == nullptr);
1472+
assert(node->gtGetOp2() == nullptr);
1473+
GetEmitter()->emitIns(INS_pause);
14651474
break;
14661475
}
14671476

14681477
default:
14691478
unreached();
14701479
break;
14711480
}
1481+
1482+
genProduceReg(node);
14721483
}
14731484

14741485
//------------------------------------------------------------------------
@@ -1532,7 +1543,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
15321543

15331544
case NI_SSE_StoreFence:
15341545
{
1535-
assert(baseType == TYP_VOID);
1546+
assert(baseType == TYP_UNKNOWN);
15361547
assert(op1 == nullptr);
15371548
assert(op2 == nullptr);
15381549
emit->emitIns(INS_sfence);
@@ -1617,7 +1628,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
16171628

16181629
case NI_SSE2_LoadFence:
16191630
{
1620-
assert(baseType == TYP_VOID);
1631+
assert(baseType == TYP_UNKNOWN);
16211632
assert(op1 == nullptr);
16221633
assert(op2 == nullptr);
16231634
emit->emitIns(INS_lfence);
@@ -1626,7 +1637,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
16261637

16271638
case NI_SSE2_MemoryFence:
16281639
{
1629-
assert(baseType == TYP_VOID);
1640+
assert(baseType == TYP_UNKNOWN);
16301641
assert(op1 == nullptr);
16311642
assert(op2 == nullptr);
16321643
emit->emitIns(INS_mfence);

src/coreclr/jit/hwintrinsiclistarm64.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,7 @@ HARDWARE_INTRINSIC(Aes, PolynomialMultiplyWideningUpper,
627627
// Base Intrinsics
628628
HARDWARE_INTRINSIC(ArmBase, LeadingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_clz, INS_clz, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoFloatingPointUsed)
629629
HARDWARE_INTRINSIC(ArmBase, ReverseElementBits, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rbit, INS_rbit, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed)
630+
HARDWARE_INTRINSIC(ArmBase, Yield, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
630631

631632
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
632633
// ISA Function name SIMD size Number of arguments Instructions Category Flags

0 commit comments

Comments
 (0)