Skip to content

Commit 5e7cfbf

Browse files
authored
[NativeAOT] Save full ARM64 SIMD arg registers in UniversalTransition (#74888)
* Save full ARM64 SIMD arg registers in UniversalTransition * remove unused SAVE/RESTORE argument macros
1 parent ab2d195 commit 5e7cfbf

File tree

5 files changed

+46
-89
lines changed

5 files changed

+46
-89
lines changed

src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,13 +1105,13 @@ struct UniversalTransitionStackFrame
11051105
// Conservative GC reporting must be applied to everything between the base of the
11061106
// ReturnBlock and the top of the StackPassedArgs.
11071107
private:
1108-
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-0C0 (0x08 bytes) (fp)
1109-
uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0B8 (0x08 bytes) (lr)
1110-
uint64_t m_fpArgRegs[8]; // ChildSP+010 CallerSP-0B0 (0x40 bytes) (d0-d7)
1111-
uintptr_t m_returnBlock[4]; // ChildSP+050 CallerSP-070 (0x40 bytes)
1112-
uintptr_t m_intArgRegs[9]; // ChildSP+070 CallerSP-050 (0x48 bytes) (x0-x8)
1113-
uintptr_t m_alignmentPad; // ChildSP+0B8 CallerSP-008 (0x08 bytes)
1114-
uintptr_t m_stackPassedArgs[1]; // ChildSP+0C0 CallerSP+000 (unknown size)
1108+
uintptr_t m_pushedFP; // ChildSP+000 CallerSP-100 (0x08 bytes) (fp)
1109+
uintptr_t m_pushedLR; // ChildSP+008 CallerSP-0F8 (0x08 bytes) (lr)
1110+
Fp128 m_fpArgRegs[8]; // ChildSP+010 CallerSP-0F0 (0x80 bytes) (q0-q7)
1111+
uintptr_t m_returnBlock[4]; // ChildSP+090 CallerSP-070 (0x20 bytes)
1112+
uintptr_t m_intArgRegs[9]; // ChildSP+0B0 CallerSP-050 (0x48 bytes) (x0-x8)
1113+
uintptr_t m_alignmentPad; // ChildSP+0F8 CallerSP-008 (0x08 bytes)
1114+
uintptr_t m_stackPassedArgs[1]; // ChildSP+100 CallerSP+000 (unknown size)
11151115

11161116
public:
11171117
PTR_UIntNative get_CallerSP() { return GET_POINTER_TO_FIELD(m_stackPassedArgs[0]); }

src/coreclr/nativeaot/Runtime/UniversalTransitionHelpers.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
//
2222
// In the absence of trashing, such bugs can become undetectable if the code that
2323
// dispatches the call happens to never touch the impacted argument register (e.g., xmm3 on
24-
// amd64 or d5 on arm32). In such a case, the original enregistered argument will flow
24+
// amd64 or q5 on arm64). In such a case, the original enregistered argument will flow
2525
// unmodified into the eventual callee, obscuring the fact that the dispatcher failed to
2626
// propagate the transition frame copy of this register.
2727
//

src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.S

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#define RETURN_BLOCK_SIZE (32)
2424

2525
#define COUNT_FLOAT_ARG_REGISTERS (8)
26-
#define FLOAT_REGISTER_SIZE (8)
26+
#define FLOAT_REGISTER_SIZE (16)
2727
#define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE)
2828

2929
#define PUSHED_LR_SIZE (8)
@@ -50,7 +50,7 @@
5050
//
5151
// RhpUniversalTransition
5252
//
53-
// At input to this function, x0-8, d0-7 and the stack may contain any number of arguments.
53+
// At input to this function, x0-8, q0-7 and the stack may contain any number of arguments.
5454
//
5555
// In addition, there are 2 extra arguments passed in the intra-procedure-call scratch registers:
5656
// xip0 will contain the managed function that is to be called by this transition function
@@ -63,16 +63,16 @@
6363
//
6464
// Frame layout is:
6565
//
66-
// {StackPassedArgs} ChildSP+0C0 CallerSP+000
67-
// {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008
68-
// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050
69-
// {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070
66+
// {StackPassedArgs} ChildSP+100 CallerSP+000
67+
// {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
68+
// {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050
69+
// {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070
7070
// -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
7171
// in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
7272
// layout of all pieces of the frame that lie at or above the pushed floating point registers.
73-
// {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0
74-
// {PushedLR} ChildSP+008 CallerSP-0B8
75-
// {PushedFP} ChildSP+000 CallerSP-0C0
73+
// {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0
74+
// {PushedLR} ChildSP+008 CallerSP-0F8
75+
// {PushedFP} ChildSP+000 CallerSP-100
7676
//
7777
// NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure
7878
// must be updated as well.
@@ -95,10 +95,10 @@
9595
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -STACK_SIZE // ;; Push down stack pointer and store FP and LR
9696

9797
// Floating point registers
98-
stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
99-
stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
100-
stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
101-
stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
98+
stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
99+
stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
100+
stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
101+
stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]
102102

103103
// Space for return buffer data (0x20 bytes)
104104

@@ -112,10 +112,10 @@
112112
#ifdef TRASH_SAVED_ARGUMENT_REGISTERS
113113
PREPARE_EXTERNAL_VAR RhpFpTrashValues, x1
114114

115-
ldp d0,d1, [x1, 0x0]
116-
ldp d2,d3, [x1, 0x10]
117-
ldp d4,d5, [x1, 0x20]
118-
ldp d6,d7, [x1, 0x30]
115+
ldp q0,q1, [x1, 0x0]
116+
ldp q2,q3, [x1, 0x20]
117+
ldp q4,q5, [x1, 0x40]
118+
ldp q6,q7, [x1, 0x60]
119119

120120
PREPARE_EXTERNAL_VAR RhpIntegerTrashValues, x1
121121

@@ -139,10 +139,10 @@
139139
mov x12, x0
140140

141141
// Restore floating point registers
142-
ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
143-
ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
144-
ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
145-
ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
142+
ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
143+
ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
144+
ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
145+
ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]
146146

147147
// Restore the argument registers
148148
ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]

src/coreclr/nativeaot/Runtime/arm64/UniversalTransition.asm

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#define RETURN_BLOCK_SIZE (32)
2424

2525
#define COUNT_FLOAT_ARG_REGISTERS (8)
26-
#define FLOAT_REGISTER_SIZE (8)
26+
#define FLOAT_REGISTER_SIZE (16)
2727
#define FLOAT_ARG_REGISTERS_SIZE (COUNT_FLOAT_ARG_REGISTERS * FLOAT_REGISTER_SIZE)
2828

2929
#define PUSHED_LR_SIZE (8)
@@ -51,7 +51,7 @@
5151
;;
5252
;; RhpUniversalTransition
5353
;;
54-
;; At input to this function, x0-8, d0-7 and the stack may contain any number of arguments.
54+
;; At input to this function, x0-8, q0-7 and the stack may contain any number of arguments.
5555
;;
5656
;; In addition, there are 2 extra arguments passed in the intra-procedure-call scratch registers:
5757
;; xip0 will contain the managed function that is to be called by this transition function
@@ -64,16 +64,16 @@
6464
;;
6565
;; Frame layout is:
6666
;;
67-
;; {StackPassedArgs} ChildSP+0C0 CallerSP+000
68-
;; {AlignmentPad (0x8 bytes)} ChildSP+0B8 CallerSP-008
69-
;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+070 CallerSP-050
70-
;; {ReturnBlock (0x20 bytes)} ChildSP+050 CallerSP-070
67+
;; {StackPassedArgs} ChildSP+100 CallerSP+000
68+
;; {AlignmentPad (0x8 bytes)} ChildSP+0F8 CallerSP-008
69+
;; {IntArgRegs (x0-x8) (0x48 bytes)} ChildSP+0B0 CallerSP-050
70+
;; {ReturnBlock (0x20 bytes)} ChildSP+090 CallerSP-070
7171
;; -- The base address of the Return block is the TransitionBlock pointer, the floating point args are
7272
;; in the neg space of the TransitionBlock pointer. Note that the callee has knowledge of the exact
7373
;; layout of all pieces of the frame that lie at or above the pushed floating point registers.
74-
;; {FpArgRegs (d0-d7) (0x40 bytes)} ChildSP+010 CallerSP-0B0
75-
;; {PushedLR} ChildSP+008 CallerSP-0B8
76-
;; {PushedFP} ChildSP+000 CallerSP-0C0
74+
;; {FpArgRegs (q0-q7) (0x80 bytes)} ChildSP+010 CallerSP-0F0
75+
;; {PushedLR} ChildSP+008 CallerSP-0F8
76+
;; {PushedFP} ChildSP+000 CallerSP-100
7777
;;
7878
;; NOTE: If the frame layout ever changes, the C++ UniversalTransitionStackFrame structure
7979
;; must be updated as well.
@@ -97,10 +97,10 @@
9797
PROLOG_SAVE_REG_PAIR fp, lr, #-STACK_SIZE! ;; Push down stack pointer and store FP and LR
9898

9999
;; Floating point registers
100-
stp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
101-
stp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
102-
stp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
103-
stp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
100+
stp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
101+
stp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
102+
stp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
103+
stp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]
104104

105105
;; Space for return buffer data (0x20 bytes)
106106

@@ -130,10 +130,10 @@
130130
mov x12, x0
131131

132132
;; Restore floating point registers
133-
ldp d0, d1, [sp, #(FLOAT_ARG_OFFSET )]
134-
ldp d2, d3, [sp, #(FLOAT_ARG_OFFSET + 0x10)]
135-
ldp d4, d5, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
136-
ldp d6, d7, [sp, #(FLOAT_ARG_OFFSET + 0x30)]
133+
ldp q0, q1, [sp, #(FLOAT_ARG_OFFSET )]
134+
ldp q2, q3, [sp, #(FLOAT_ARG_OFFSET + 0x20)]
135+
ldp q4, q5, [sp, #(FLOAT_ARG_OFFSET + 0x40)]
136+
ldp q6, q7, [sp, #(FLOAT_ARG_OFFSET + 0x60)]
137137

138138
;; Restore the argument registers
139139
ldp x0, x1, [sp, #(ARGUMENT_REGISTERS_OFFSET )]

src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -164,49 +164,6 @@ C_FUNC(\Name):
164164
brk #0
165165
.endm
166166

167-
//-----------------------------------------------------------------------------
168-
// The Following sets of SAVE_*_REGISTERS expect the memory to be reserved and
169-
// base address to be passed in $reg
170-
//
171-
172-
// Reserve 64 bytes of memory before calling SAVE_ARGUMENT_REGISTERS
173-
.macro SAVE_ARGUMENT_REGISTERS reg, ofs
174-
175-
stp x0, x1, [\reg, #(\ofs)]
176-
stp x2, x3, [\reg, #(\ofs + 16)]
177-
stp x4, x5, [\reg, #(\ofs + 32)]
178-
stp x6, x7, [\reg, #(\ofs + 48)]
179-
180-
.endm
181-
182-
// Reserve 64 bytes of memory before calling SAVE_FLOAT_ARGUMENT_REGISTERS
183-
.macro SAVE_FLOAT_ARGUMENT_REGISTERS reg, ofs
184-
185-
stp d0, d1, [\reg, #(\ofs)]
186-
stp d2, d3, [\reg, #(\ofs + 16)]
187-
stp d4, d5, [\reg, #(\ofs + 32)]
188-
stp d6, d7, [\reg, #(\ofs + 48)]
189-
190-
.endm
191-
192-
.macro RESTORE_ARGUMENT_REGISTERS reg, ofs
193-
194-
ldp x0, x1, [\reg, #(\ofs)]
195-
ldp x2, x3, [\reg, #(\ofs + 16)]
196-
ldp x4, x5, [\reg, #(\ofs + 32)]
197-
ldp x6, x7, [\reg, #(\ofs + 48)]
198-
199-
.endm
200-
201-
.macro RESTORE_FLOAT_ARGUMENT_REGISTERS reg, ofs
202-
203-
ldp d0, d1, [\reg, #(\ofs)]
204-
ldp d2, d3, [\reg, #(\ofs + 16)]
205-
ldp d4, d5, [\reg, #(\ofs + 32)]
206-
ldp d6, d7, [\reg, #(\ofs + 48)]
207-
208-
.endm
209-
210167
.macro EPILOG_BRANCH_REG reg
211168

212169
br \reg

0 commit comments

Comments
 (0)