Skip to content

Commit f0463a9

Browse files
authored
Emit armv8.4 ldapur* for volatile loads with contained offsets (#89681)
1 parent ff2de36 commit f0463a9

20 files changed

+162
-44
lines changed

src/coreclr/inc/clrconfigvalues.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rdm, W("EnableArm64Rd
797797
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha1, W("EnableArm64Sha1"), 1, "Allows Arm64 Sha1+ hardware intrinsics to be disabled")
798798
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha256, W("EnableArm64Sha256"), 1, "Allows Arm64 Sha256+ hardware intrinsics to be disabled")
799799
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc, W("EnableArm64Rcpc"), 1, "Allows Arm64 Rcpc+ hardware intrinsics to be disabled")
800+
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc2, W("EnableArm64Rcpc2"), 1, "Allows Arm64 Rcpc2+ hardware intrinsics to be disabled")
800801
#endif
801802

802803
///

src/coreclr/inc/corinfoinstructionset.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,15 @@ enum CORINFO_InstructionSet
3030
InstructionSet_Dczva=12,
3131
InstructionSet_Rcpc=13,
3232
InstructionSet_VectorT128=14,
33-
InstructionSet_ArmBase_Arm64=15,
34-
InstructionSet_AdvSimd_Arm64=16,
35-
InstructionSet_Aes_Arm64=17,
36-
InstructionSet_Crc32_Arm64=18,
37-
InstructionSet_Dp_Arm64=19,
38-
InstructionSet_Rdm_Arm64=20,
39-
InstructionSet_Sha1_Arm64=21,
40-
InstructionSet_Sha256_Arm64=22,
33+
InstructionSet_Rcpc2=15,
34+
InstructionSet_ArmBase_Arm64=16,
35+
InstructionSet_AdvSimd_Arm64=17,
36+
InstructionSet_Aes_Arm64=18,
37+
InstructionSet_Crc32_Arm64=19,
38+
InstructionSet_Dp_Arm64=20,
39+
InstructionSet_Rdm_Arm64=21,
40+
InstructionSet_Sha1_Arm64=22,
41+
InstructionSet_Sha256_Arm64=23,
4142
#endif // TARGET_ARM64
4243
#ifdef TARGET_AMD64
4344
InstructionSet_X86Base=1,
@@ -761,6 +762,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
761762
return "Rcpc";
762763
case InstructionSet_VectorT128 :
763764
return "VectorT128";
765+
case InstructionSet_Rcpc2 :
766+
return "Rcpc2";
764767
#endif // TARGET_ARM64
765768
#ifdef TARGET_AMD64
766769
case InstructionSet_X86Base :
@@ -994,6 +997,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
994997
case READYTORUN_INSTRUCTION_Atomics: return InstructionSet_Atomics;
995998
case READYTORUN_INSTRUCTION_Rcpc: return InstructionSet_Rcpc;
996999
case READYTORUN_INSTRUCTION_VectorT128: return InstructionSet_VectorT128;
1000+
case READYTORUN_INSTRUCTION_Rcpc2: return InstructionSet_Rcpc2;
9971001
#endif // TARGET_ARM64
9981002
#ifdef TARGET_AMD64
9991003
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;

src/coreclr/inc/jiteeversionguid.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
4343
#define GUID_DEFINED
4444
#endif // !GUID_DEFINED
4545

46-
constexpr GUID JITEEVersionIdentifier = { /* 5bf301d6-d08e-4c74-ab9b-1d9c1975950f */
47-
0x5bf301d6,
48-
0xd08e,
49-
0x4c74,
50-
{0xab, 0x9b, 0x1d, 0x9c, 0x19, 0x75, 0x95, 0x0f}
46+
constexpr GUID JITEEVersionIdentifier = { /* a2974440-e8ee-4d95-9e6e-799a330be1a0 */
47+
0xa2974440,
48+
0xe8ee,
49+
0x4d95,
50+
{0x9e, 0x6e, 0x79, 0x9a, 0x33, 0x0b, 0xe1, 0xa0}
5151
};
5252

5353
//////////////////////////////////////////////////////////////////////////////////////////////////////////

src/coreclr/inc/readytoruninstructionset.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enum ReadyToRunInstructionSet
5050
READYTORUN_INSTRUCTION_VectorT128=39,
5151
READYTORUN_INSTRUCTION_VectorT256=40,
5252
READYTORUN_INSTRUCTION_VectorT512=41,
53+
READYTORUN_INSTRUCTION_Rcpc2=42,
5354

5455
};
5556

src/coreclr/jit/codegenarmarch.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1761,10 +1761,35 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
17611761
bool addrIsInReg = tree->Addr()->isUsedFromReg();
17621762
bool addrIsAligned = ((tree->gtFlags & GTF_IND_UNALIGNED) == 0);
17631763

1764-
// on arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid
1764+
// On arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid
17651765
// full memory barriers if mixed with STLR
17661766
bool hasRcpc = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc);
17671767

1768+
// On arm64-v8.4+ we can use ldapur* instructions with acquire/release semantics to
1769+
// avoid full memory barriers if address is contained and unscaled
1770+
bool hasRcpc2 = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc2);
1771+
1772+
bool handledWithLdapur = false;
1773+
if (hasRcpc2 && !addrIsInReg && tree->Addr()->OperIs(GT_LEA) && !tree->HasIndex() && (tree->Scale() == 1) &&
1774+
emitter::emitIns_valid_imm_for_unscaled_ldst_offset(tree->Offset()))
1775+
{
1776+
if (ins == INS_ldrb)
1777+
{
1778+
ins = INS_ldapurb;
1779+
handledWithLdapur = true;
1780+
}
1781+
else if (ins == INS_ldrh)
1782+
{
1783+
ins = INS_ldapurh;
1784+
handledWithLdapur = true;
1785+
}
1786+
else if (ins == INS_ldr)
1787+
{
1788+
ins = INS_ldapur;
1789+
handledWithLdapur = true;
1790+
}
1791+
}
1792+
17681793
if ((ins == INS_ldrb) && addrIsInReg)
17691794
{
17701795
ins = hasRcpc ? INS_ldaprb : INS_ldarb;
@@ -1777,7 +1802,7 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree)
17771802
{
17781803
ins = hasRcpc ? INS_ldapr : INS_ldar;
17791804
}
1780-
else
1805+
else if (!handledWithLdapur)
17811806
#endif // TARGET_ARM64
17821807
{
17831808
emitBarrier = true;

src/coreclr/jit/emit.h

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -628,8 +628,8 @@ class emitter
628628
#define MAX_ENCODED_SIZE 15
629629
#elif defined(TARGET_ARM64)
630630
#define INSTR_ENCODED_SIZE 4
631-
static_assert_no_msg(INS_count <= 512);
632-
instruction _idIns : 9;
631+
static_assert_no_msg(INS_count <= 1024);
632+
instruction _idIns : 10;
633633
#elif defined(TARGET_LOONGARCH64)
634634
// TODO-LoongArch64: not include SIMD-vector.
635635
static_assert_no_msg(INS_count <= 512);
@@ -712,7 +712,7 @@ class emitter
712712
// x86: 17 bits
713713
// amd64: 17 bits
714714
// arm: 16 bits
715-
// arm64: 17 bits
715+
// arm64: 18 bits
716716
// loongarch64: 14 bits
717717
// risc-v: 14 bits
718718

@@ -754,7 +754,7 @@ class emitter
754754
// x86: 38 bits
755755
// amd64: 38 bits
756756
// arm: 32 bits
757-
// arm64: 31 bits
757+
// arm64: 32 bits
758758
// loongarch64: 28 bits
759759
// risc-v: 28 bits
760760

@@ -763,10 +763,12 @@ class emitter
763763
unsigned _idLargeDsp : 1; // does a large displacement follow?
764764
unsigned _idLargeCall : 1; // large call descriptor used
765765

766-
unsigned _idBound : 1; // jump target / frame offset bound
766+
unsigned _idBound : 1; // jump target / frame offset bound
767+
#ifndef TARGET_ARMARCH
767768
unsigned _idCallRegPtr : 1; // IL indirect calls: addr in reg
768-
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
769-
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
769+
#endif
770+
unsigned _idCallAddr : 1; // IL indirect calls: can make a direct call to iiaAddr
771+
unsigned _idNoGC : 1; // Some helpers don't get recorded in GC tables
770772
#if defined(TARGET_XARCH)
771773
unsigned _idEvexbContext : 1; // does EVEX.b need to be set.
772774
#endif // TARGET_XARCH
@@ -1509,6 +1511,7 @@ class emitter
15091511
_idBound = 1;
15101512
}
15111513

1514+
#ifndef TARGET_ARMARCH
15121515
bool idIsCallRegPtr() const
15131516
{
15141517
return _idCallRegPtr != 0;
@@ -1517,6 +1520,7 @@ class emitter
15171520
{
15181521
_idCallRegPtr = 1;
15191522
}
1523+
#endif
15201524

15211525
// Only call instructions that call helper functions may be marked as "IsNoGC", indicating
15221526
// that a thread executing such a call cannot be stopped for GC. Thus, in partially-interruptible

src/coreclr/jit/emitarm.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4770,8 +4770,6 @@ void emitter::emitIns_Call(EmitCallType callType,
47704770
{
47714771
/* This is an indirect call (either a virtual call or func ptr call) */
47724772

4773-
id->idSetIsCallRegPtr();
4774-
47754773
if (isJump)
47764774
{
47774775
ins = INS_bx; // INS_bx Reg

src/coreclr/jit/emitarm64.cpp

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,7 +1157,9 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
11571157
case INS_ldrb:
11581158
case INS_strb:
11591159
case INS_ldurb:
1160+
case INS_ldapurb:
11601161
case INS_sturb:
1162+
case INS_stlurb:
11611163
result = EA_4BYTE;
11621164
break;
11631165

@@ -1172,6 +1174,8 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
11721174
case INS_strh:
11731175
case INS_ldurh:
11741176
case INS_sturh:
1177+
case INS_ldapurh:
1178+
case INS_stlurh:
11751179
result = EA_4BYTE;
11761180
break;
11771181

@@ -1209,6 +1213,8 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id)
12091213
case INS_str:
12101214
case INS_ldur:
12111215
case INS_stur:
1216+
case INS_ldapur:
1217+
case INS_stlur:
12121218
result = id->idOpSize();
12131219
break;
12141220

@@ -1237,7 +1243,9 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
12371243
case INS_ldrb:
12381244
case INS_strb:
12391245
case INS_ldurb:
1246+
case INS_ldapurb:
12401247
case INS_sturb:
1248+
case INS_stlurb:
12411249
case INS_ldrsb:
12421250
case INS_ldursb:
12431251
result = EA_1BYTE;
@@ -1252,6 +1260,8 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
12521260
case INS_sturh:
12531261
case INS_ldrsh:
12541262
case INS_ldursh:
1263+
case INS_ldapurh:
1264+
case INS_stlurh:
12551265
result = EA_2BYTE;
12561266
break;
12571267

@@ -1275,6 +1285,8 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id)
12751285
case INS_str:
12761286
case INS_ldur:
12771287
case INS_stur:
1288+
case INS_ldapur:
1289+
case INS_stlur:
12781290
result = id->idOpSize();
12791291
break;
12801292

@@ -2372,6 +2384,12 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt)
23722384
return false;
23732385
}
23742386

2387+
// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction (inclusive range -256..+255)
2388+
/*static*/ bool emitter::emitIns_valid_imm_for_unscaled_ldst_offset(INT64 imm)
2389+
{
2390+
return (imm >= -256) && (imm <= 255); // same -256..+255 span the LS_2C (simm9) instruction forms list for their offset
2391+
}
2392+
23752393
// true if this 'imm' can be encoded as the offset in a ldr/str instruction
23762394
/*static*/ bool emitter::emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr attr)
23772395
{
@@ -5505,6 +5523,8 @@ void emitter::emitIns_R_R_I(
55055523
isLdSt = true;
55065524
break;
55075525

5526+
case INS_ldapurb:
5527+
case INS_stlurb:
55085528
case INS_ldurb:
55095529
case INS_sturb:
55105530
// size is ignored
@@ -5522,7 +5542,9 @@ void emitter::emitIns_R_R_I(
55225542
break;
55235543

55245544
case INS_ldurh:
5545+
case INS_ldapurh:
55255546
case INS_sturh:
5547+
case INS_stlurh:
55265548
// size is ignored
55275549
unscaledOp = true;
55285550
scale = 0;
@@ -5550,6 +5572,8 @@ void emitter::emitIns_R_R_I(
55505572

55515573
case INS_ldur:
55525574
case INS_stur:
5575+
case INS_ldapur:
5576+
case INS_stlur:
55535577
// Is the target a vector register?
55545578
if (isVectorRegister(reg1))
55555579
{
@@ -8813,8 +8837,6 @@ void emitter::emitIns_Call(EmitCallType callType,
88138837
{
88148838
/* This is an indirect call (either a virtual call or func ptr call) */
88158839

8816-
id->idSetIsCallRegPtr();
8817-
88188840
if (isJump)
88198841
{
88208842
ins = INS_br_tail; // INS_br_tail Reg

src/coreclr/jit/emitarm64.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,9 @@ static bool emitIns_valid_imm_for_alu(INT64 imm, emitAttr size);
543543
// true if this 'imm' can be encoded as the offset in a ldr/str instruction
544544
static bool emitIns_valid_imm_for_ldst_offset(INT64 imm, emitAttr size);
545545

546+
// true if this 'imm' can be encoded as the offset in an unscaled ldr/str instruction
547+
static bool emitIns_valid_imm_for_unscaled_ldst_offset(INT64 imm);
548+
546549
// true if this 'imm' can be encoded as an input operand to a ccmp instruction
547550
static bool emitIns_valid_imm_for_ccmp(INT64 imm);
548551

src/coreclr/jit/instrsarm64.h

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1053,17 +1053,15 @@ INST1(ldarb, "ldarb", LD, IF_LS_2A, 0x08DFFC00)
10531053
INST1(ldarh, "ldarh", LD, IF_LS_2A, 0x48DFFC00)
10541054
// ldarh Rt,[Xn] LS_2A 0100100011011111 111111nnnnnttttt 48DF FC00
10551055

1056-
1057-
INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000)
1056+
INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000)
10581057
// ldapr Rt,[Xn] LS_2A 1X11100010111111 110000nnnnnttttt B8BF C000 Rm Rt Rn ARMv8.3 LRCPC
10591058

1060-
INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000)
1059+
INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000)
10611060
// ldaprb Rt,[Xn] LS_2A 0011100010111111 110000nnnnnttttt 38BF C000 Rm Rt Rn ARMv8.3 LRCPC
10621061

1063-
INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000)
1062+
INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000)
10641063
// ldaprh Rt,[Xn] LS_2A 0111100010111111 110000nnnnnttttt 78BF C000 Rm Rt Rn ARMv8.3 LRCPC
10651064

1066-
10671065
INST1(ldxr, "ldxr", LD, IF_LS_2A, 0x885F7C00)
10681066
// ldxr Rt,[Xn] LS_2A 1X00100001011111 011111nnnnnttttt 885F 7C00
10691067

@@ -1100,6 +1098,15 @@ INST1(ldursh, "ldursh", LD, IF_LS_2C, 0x78800000)
11001098
INST1(ldursw, "ldursw", LD, IF_LS_2C, 0xB8800000)
11011099
// ldursw Rt,[Xn+simm9] LS_2C 10111000100iiiii iiii00nnnnnttttt B880 0000 [Xn imm(-256..+255)]
11021100

1101+
INST1(ldapur, "ldapur", LD, IF_LS_2C, 0x99400000)
1102+
// ldapur Rt,[Xn+simm9] LS_2C 1X011001010iiiii iiii00nnnnnttttt 9940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2
1103+
1104+
INST1(ldapurb, "ldapurb", LD, IF_LS_2C, 0x19400000)
1105+
// ldapurb Rt,[Xn+simm9] LS_2C 00011001010iiiii iiii00nnnnnttttt 1940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2
1106+
1107+
INST1(ldapurh, "ldapurh", LD, IF_LS_2C, 0x59400000)
1108+
// ldapurh Rt,[Xn+simm9] LS_2C 01011001010iiiii iiii00nnnnnttttt 5940 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2
1109+
11031110
INST1(stlr, "stlr", ST, IF_LS_2A, 0x889FFC00)
11041111
// stlr Rt,[Xn] LS_2A 1X00100010011111 111111nnnnnttttt 889F FC00
11051112

@@ -1136,6 +1143,15 @@ INST1(sturb, "sturb", ST, IF_LS_2C, 0x38000000)
11361143
INST1(sturh, "sturh", ST, IF_LS_2C, 0x78000000)
11371144
// sturh Rt,[Xn+simm9] LS_2C 01111000000iiiii iiii00nnnnnttttt 7800 0000 [Xn imm(-256..+255)]
11381145

1146+
INST1(stlur, "stlur", ST, IF_LS_2C, 0x99000000)
1147+
// stlur Rt,[Xn+simm9] LS_2C 1X011001000iiiii iiii00nnnnnttttt 9900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2
1148+
1149+
INST1(stlurb, "stlurb", ST, IF_LS_2C, 0x19000000)
1150+
// stlurb Rt,[Xn+simm9] LS_2C 00011001000iiiii iiii00nnnnnttttt 1900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2
1151+
1152+
INST1(stlurh, "stlurh", ST, IF_LS_2C, 0x59000000)
1153+
// stlurh Rt,[Xn+simm9] LS_2C 01011001000iiiii iiii00nnnnnttttt 5900 0000 [Xn imm(-256..+255)] ARMv8.4 RCPC2
1154+
11391155
INST1(casb, "casb", LD|ST, IF_LS_3E, 0x08A07C00)
11401156
// casb Wm, Wt, [Xn] LS_3E 00001000101mmmmm 011111nnnnnttttt 08A0 7C00 Rm Rt Rn ARMv8.1 LSE Atomics
11411157

0 commit comments

Comments
 (0)