Skip to content

Commit 991ae97

Browse files
TIHan, tannergooding, kunalspathak, SwapnilGaikwad
authored
JIT: Added SVE GetFfr, SetFfr, LoadVectorFirstFaulting, GatherVectorFirstFaulting (#104502)
* Initial work * FirstFaulting partially works * Added template * Trying to test first-faulting behavior * Using BoundedMemory to test FirstFaulting behavior for LoadVector. * Fix size in validation * Added more helper functions. Added conditional select tests for LoadVectorFirstFaulting. * Added first-faulting behavior tests for GatherVectorFirstFaulting * Added GetFfr suffix-style APIs * Fixing GatherVector tests * Formatting * Feedback * Feedback * Ensure the P/Invokes are blittable * Fix build * Remove checking for zeroes after the fault * Added GatherVectorFirstFaultingVectorBases test template, but currently without the FirstFaulting test. Added SveFfrTest template. * Mark GetFfr methods as side-effectful * Verifying expected fault result. Test tweaks. * Fix build * Add tracking of FFR register somewhat workable code cleanup Remove FFR Add all the GetFfr* wip Work with MskCns() model Use physReg approach Remove commented prototypes working Remove bunch of unnecessary code Remove SpecialImport from GetFFR/SetFFR/LoadFirstFaulting some more code cleanup some fixup * Change condition for PhysReg * jit format * Fix PoisonPage configuration while creating BoundedMemory * Use mmap() instead of memalign() for memory allocation * review feedback * unspill for LoadVectorFirstFaulting as well * Show error codes on failing failure * Feedback * Feedback * Feedback * Feedback * Handle FFR correctly * reuse some of the code * Handle the special effect for SetFfr * some fixes + test coverage * do not zero init lvaFfrRegister * reverted local change * fix build break --------- Co-authored-by: Tanner Gooding <tagoo@outlook.com> Co-authored-by: Kunal Pathak <Kunal.Pathak@microsoft.com> Co-authored-by: Swapnil Gaikwad <swapnil.gaikwad@arm.com>
1 parent 67d5c92 commit 991ae97

22 files changed

+3217
-164
lines changed

src/coreclr/jit/compiler.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4245,6 +4245,10 @@ bool Compiler::fgVarIsNeverZeroInitializedInProlog(unsigned varNum)
42454245
bool result = varDsc->lvIsParam || lvaIsOSRLocal(varNum) || (varNum == lvaGSSecurityCookie) ||
42464246
(varNum == lvaInlinedPInvokeFrameVar) || (varNum == lvaStubArgumentVar) || (varNum == lvaRetAddrVar);
42474247

4248+
#ifdef TARGET_ARM64
4249+
result = result || (varNum == lvaFfrRegister);
4250+
#endif
4251+
42484252
#if FEATURE_FIXED_OUT_ARGS
42494253
result = result || (varNum == lvaOutgoingArgSpaceVar);
42504254
#endif

src/coreclr/jit/fgdiagnostic.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3428,14 +3428,15 @@ void Compiler::fgDebugCheckFlags(GenTree* tree, BasicBlock* block)
34283428

34293429
#if defined(TARGET_ARM64)
34303430
case NI_ArmBase_Yield:
3431-
case NI_Sve_PrefetchBytes:
3432-
case NI_Sve_PrefetchInt16:
3433-
case NI_Sve_PrefetchInt32:
3434-
case NI_Sve_PrefetchInt64:
34353431
case NI_Sve_GatherPrefetch16Bit:
34363432
case NI_Sve_GatherPrefetch32Bit:
34373433
case NI_Sve_GatherPrefetch64Bit:
34383434
case NI_Sve_GatherPrefetch8Bit:
3435+
case NI_Sve_PrefetchBytes:
3436+
case NI_Sve_PrefetchInt16:
3437+
case NI_Sve_PrefetchInt32:
3438+
case NI_Sve_PrefetchInt64:
3439+
case NI_Sve_SetFfr:
34393440
{
34403441
assert(tree->OperRequiresCallFlag(this));
34413442
expectedFlags |= GTF_GLOB_REF;

src/coreclr/jit/gentree.cpp

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26732,6 +26732,18 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
2673226732
addr = Op(3);
2673326733
break;
2673426734

26735+
case NI_Sve_GatherVector:
26736+
case NI_Sve_GatherVectorByteZeroExtend:
26737+
case NI_Sve_GatherVectorFirstFaulting:
26738+
case NI_Sve_GatherVectorInt16SignExtend:
26739+
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
26740+
case NI_Sve_GatherVectorInt32SignExtend:
26741+
case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
26742+
case NI_Sve_GatherVectorSByteSignExtend:
26743+
case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
26744+
case NI_Sve_GatherVectorUInt16ZeroExtend:
26745+
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
26746+
case NI_Sve_GatherVectorUInt32ZeroExtend:
2673526747
case NI_Sve_GatherVectorWithByteOffsets:
2673626748
case NI_Sve_LoadVector:
2673726749
case NI_Sve_LoadVectorNonTemporal:
@@ -26742,6 +26754,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
2674226754
case NI_Sve_LoadVectorByteZeroExtendToUInt16:
2674326755
case NI_Sve_LoadVectorByteZeroExtendToUInt32:
2674426756
case NI_Sve_LoadVectorByteZeroExtendToUInt64:
26757+
case NI_Sve_LoadVectorFirstFaulting:
2674526758
case NI_Sve_LoadVectorInt16SignExtendToInt32:
2674626759
case NI_Sve_LoadVectorInt16SignExtendToInt64:
2674726760
case NI_Sve_LoadVectorInt16SignExtendToUInt32:
@@ -26766,20 +26779,6 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
2676626779
addr = Op(2);
2676726780
break;
2676826781

26769-
case NI_Sve_GatherVector:
26770-
case NI_Sve_GatherVectorByteZeroExtend:
26771-
case NI_Sve_GatherVectorInt16SignExtend:
26772-
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
26773-
case NI_Sve_GatherVectorInt32SignExtend:
26774-
case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
26775-
case NI_Sve_GatherVectorSByteSignExtend:
26776-
case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
26777-
case NI_Sve_GatherVectorUInt16ZeroExtend:
26778-
case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
26779-
case NI_Sve_GatherVectorUInt32ZeroExtend:
26780-
addr = Op(2);
26781-
break;
26782-
2678326782
#endif // TARGET_ARM64
2678426783

2678526784
default:
@@ -26859,11 +26858,12 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
2685926858
{
2686026859
#ifdef TARGET_ARM64
2686126860
static_assert_no_msg(
26862-
AreContiguous(NI_Sve_GatherVector, NI_Sve_GatherVectorByteZeroExtend, NI_Sve_GatherVectorInt16SignExtend,
26863-
NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend, NI_Sve_GatherVectorInt32SignExtend,
26864-
NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend, NI_Sve_GatherVectorSByteSignExtend,
26865-
NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend, NI_Sve_GatherVectorUInt16ZeroExtend,
26866-
NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend, NI_Sve_GatherVectorUInt32ZeroExtend));
26861+
AreContiguous(NI_Sve_GatherVector, NI_Sve_GatherVectorByteZeroExtend, NI_Sve_GatherVectorFirstFaulting,
26862+
NI_Sve_GatherVectorInt16SignExtend, NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend,
26863+
NI_Sve_GatherVectorInt32SignExtend, NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend,
26864+
NI_Sve_GatherVectorSByteSignExtend, NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend,
26865+
NI_Sve_GatherVectorUInt16ZeroExtend, NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend,
26866+
NI_Sve_GatherVectorUInt32ZeroExtend));
2686726867
assert(varTypeIsI(addr) || (varTypeIsSIMD(addr) && ((intrinsicId >= NI_Sve_GatherVector) &&
2686826868
(intrinsicId <= NI_Sve_GatherVectorUInt32ZeroExtend))));
2686926869
#else
@@ -27281,6 +27281,7 @@ bool GenTreeHWIntrinsic::OperRequiresCallFlag() const
2728127281
case NI_Sve_GatherPrefetch32Bit:
2728227282
case NI_Sve_GatherPrefetch64Bit:
2728327283
case NI_Sve_GatherPrefetch8Bit:
27284+
case NI_Sve_SetFfr:
2728427285
{
2728527286
return true;
2728627287
}
@@ -27463,14 +27464,15 @@ void GenTreeHWIntrinsic::Initialize(NamedIntrinsic intrinsicId)
2746327464

2746427465
#if defined(TARGET_ARM64)
2746527466
case NI_ArmBase_Yield:
27466-
case NI_Sve_PrefetchBytes:
27467-
case NI_Sve_PrefetchInt16:
27468-
case NI_Sve_PrefetchInt32:
27469-
case NI_Sve_PrefetchInt64:
2747027467
case NI_Sve_GatherPrefetch16Bit:
2747127468
case NI_Sve_GatherPrefetch32Bit:
2747227469
case NI_Sve_GatherPrefetch64Bit:
2747327470
case NI_Sve_GatherPrefetch8Bit:
27471+
case NI_Sve_PrefetchBytes:
27472+
case NI_Sve_PrefetchInt16:
27473+
case NI_Sve_PrefetchInt32:
27474+
case NI_Sve_PrefetchInt64:
27475+
case NI_Sve_SetFfr:
2747427476
{
2747527477
// Mark as a call and global reference, much as is done for GT_KEEPALIVE
2747627478
gtFlags |= (GTF_CALL | GTF_GLOB_REF);

src/coreclr/jit/hwintrinsic.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2217,6 +2217,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
22172217
#elif defined(TARGET_ARM64)
22182218
case NI_Sve_GatherVector:
22192219
case NI_Sve_GatherVectorByteZeroExtend:
2220+
case NI_Sve_GatherVectorFirstFaulting:
22202221
case NI_Sve_GatherVectorInt16SignExtend:
22212222
case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
22222223
case NI_Sve_GatherVectorInt32SignExtend:

src/coreclr/jit/hwintrinsiccodegenarm64.cpp

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2049,6 +2049,34 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
20492049
break;
20502050
}
20512051

2052+
case NI_Sve_GatherVectorFirstFaulting:
2053+
{
2054+
if (node->GetAuxiliaryType() == TYP_UNKNOWN)
2055+
{
2056+
if (intrin.numOperands == 3)
2057+
{
2058+
// We have extra argument which means there is a "use" of FFR here. Restore it back in FFR
2059+
// register.
2060+
assert(op3Reg != REG_NA);
2061+
GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op3Reg, opt);
2062+
}
2063+
}
2064+
else
2065+
{
2066+
// AuxiliaryType is added only for numOperands == 3. If there is an extra argument, we need to
2067+
// "use" FFR here. Restore it back in FFR register.
2068+
2069+
if (intrin.numOperands == 4)
2070+
{
2071+
// We have extra argument which means there is a "use" of FFR here. Restore it back in FFR
2072+
// register.
2073+
assert(op4Reg != REG_NA);
2074+
GetEmitter()->emitIns_R(INS_sve_wrffr, emitSize, op4Reg, opt);
2075+
}
2076+
}
2077+
2078+
FALLTHROUGH;
2079+
}
20522080
case NI_Sve_GatherVector:
20532081
case NI_Sve_GatherVectorByteZeroExtend:
20542082
case NI_Sve_GatherVectorInt16SignExtend:
@@ -2065,25 +2093,24 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
20652093
{
20662094
// GatherVector...(Vector<T> mask, T* address, Vector<T2> indices)
20672095

2068-
assert(intrin.numOperands == 3);
2069-
emitAttr baseSize = emitActualTypeSize(intrin.baseType);
2070-
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;
2096+
emitAttr baseSize = emitActualTypeSize(intrin.baseType);
2097+
bool isLoadingBytes = ((ins == INS_sve_ld1b) || (ins == INS_sve_ld1sb) || (ins == INS_sve_ldff1b) ||
2098+
(ins == INS_sve_ldff1sb));
2099+
insScalableOpts sopt = INS_SCALABLE_OPTS_NONE;
20712100

2072-
if (baseSize == EA_8BYTE)
2073-
{
2074-
// Index is multiplied.
2075-
sopt = (ins == INS_sve_ld1b || ins == INS_sve_ld1sb) ? INS_SCALABLE_OPTS_NONE
2076-
: INS_SCALABLE_OPTS_LSL_N;
2077-
}
2078-
else
2101+
if (baseSize == EA_4BYTE)
20792102
{
20802103
// Index is sign or zero extended to 64bits, then multiplied.
2081-
assert(baseSize == EA_4BYTE);
20822104
opt = varTypeIsUnsigned(node->GetAuxiliaryType()) ? INS_OPTS_SCALABLE_S_UXTW
20832105
: INS_OPTS_SCALABLE_S_SXTW;
20842106

2085-
sopt = (ins == INS_sve_ld1b || ins == INS_sve_ld1sb) ? INS_SCALABLE_OPTS_NONE
2086-
: INS_SCALABLE_OPTS_MOD_N;
2107+
sopt = isLoadingBytes ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_MOD_N;
2108+
}
2109+
else
2110+
{
2111+
// Index is multiplied.
2112+
assert(baseSize == EA_8BYTE);
2113+
sopt = isLoadingBytes ? INS_SCALABLE_OPTS_NONE : INS_SCALABLE_OPTS_LSL_N;
20872114
}
20882115

20892116
GetEmitter()->emitIns_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, opt, sopt);
@@ -2092,7 +2119,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
20922119
{
20932120
// GatherVector...(Vector<T> mask, Vector<T2> addresses)
20942121

2095-
assert(intrin.numOperands == 2);
20962122
GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt);
20972123
}
20982124

src/coreclr/jit/hwintrinsiclistarm64sve.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ HARDWARE_INTRINSIC(Sve, GatherPrefetch64Bit,
111111
HARDWARE_INTRINSIC(Sve, GatherPrefetch8Bit, -1, -1, false, {INS_sve_prfb, INS_sve_prfb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand|HW_Flag_SpecialSideEffect_Other)
112112
HARDWARE_INTRINSIC(Sve, GatherVector, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_sve_ld1w, INS_sve_ld1d, INS_sve_ld1d, INS_sve_ld1w, INS_sve_ld1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
113113
HARDWARE_INTRINSIC(Sve, GatherVectorByteZeroExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1b, INS_sve_ld1b, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
114+
HARDWARE_INTRINSIC(Sve, GatherVectorFirstFaulting, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldff1w, INS_sve_ldff1w, INS_sve_ldff1d, INS_sve_ldff1d, INS_sve_ldff1w, INS_sve_ldff1d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_SpecialSideEffectMask)
114115
HARDWARE_INTRINSIC(Sve, GatherVectorInt16SignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
115116
HARDWARE_INTRINSIC(Sve, GatherVectorInt16WithByteOffsetsSignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_sve_ld1sh, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
116117
HARDWARE_INTRINSIC(Sve, GatherVectorInt32SignExtend, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1sw, INS_sve_ld1sw, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)

src/coreclr/jit/lowerarmarch.cpp

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,6 +1775,50 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
17751775

17761776
break;
17771777
}
1778+
case NI_Sve_GatherVectorFirstFaulting:
1779+
{
1780+
LIR::Use use;
1781+
bool foundUse = BlockRange().TryGetUse(node, &use);
1782+
1783+
if (m_ffrTrashed)
1784+
{
1785+
// Consume the FFR register value from local variable to simulate "use" of FFR,
1786+
// only if it was trashed. If it was not trashed, we do not have to reload the
1787+
// contents of the FFR register.
1788+
1789+
unsigned lclNum = comp->getFFRegisterVarNum();
1790+
GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK);
1791+
BlockRange().InsertBefore(node, lclVar);
1792+
LowerNode(lclVar);
1793+
1794+
if (node->GetOperandCount() == 3)
1795+
{
1796+
assert(node->GetAuxiliaryType() != TYP_UNKNOWN);
1797+
node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), node->Op(3), lclVar);
1798+
}
1799+
else
1800+
{
1801+
assert(node->GetOperandCount() == 2);
1802+
node->ResetHWIntrinsicId(intrinsicId, comp, node->Op(1), node->Op(2), lclVar);
1803+
}
1804+
}
1805+
1806+
if (foundUse)
1807+
{
1808+
unsigned tmpNum = comp->lvaGrabTemp(true DEBUGARG("Return value result/FFR"));
1809+
LclVarDsc* tmpVarDsc = comp->lvaGetDesc(tmpNum);
1810+
tmpVarDsc->lvType = node->TypeGet();
1811+
GenTree* storeLclVar;
1812+
use.ReplaceWithLclVar(comp, tmpNum, &storeLclVar);
1813+
}
1814+
else
1815+
{
1816+
node->SetUnusedValue();
1817+
}
1818+
1819+
StoreFFRValue(node);
1820+
break;
1821+
}
17781822
case NI_Sve_LoadVectorFirstFaulting:
17791823
{
17801824
LIR::Use use;
@@ -1786,7 +1830,8 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
17861830
// only if it was trashed. If it was not trashed, we do not have to reload the
17871831
// contents of the FFR register.
17881832

1789-
GenTree* lclVar = comp->gtNewLclvNode(comp->lvaFfrRegister, TYP_MASK);
1833+
unsigned lclNum = comp->getFFRegisterVarNum();
1834+
GenTree* lclVar = comp->gtNewLclvNode(lclNum, TYP_MASK);
17901835
BlockRange().InsertBefore(node, lclVar);
17911836
LowerNode(lclVar);
17921837

@@ -4082,8 +4127,10 @@ void Lowering::StoreFFRValue(GenTreeHWIntrinsic* node)
40824127
#ifdef DEBUG
40834128
switch (node->GetHWIntrinsicId())
40844129
{
4085-
case NI_Sve_SetFfr:
4130+
case NI_Sve_GatherVectorFirstFaulting:
40864131
case NI_Sve_LoadVectorFirstFaulting:
4132+
case NI_Sve_SetFfr:
4133+
40874134
break;
40884135
default:
40894136
assert(!"Unexpected HWIntrinsicId");

0 commit comments

Comments
 (0)