[ARM64EC] Fix thunks for vector args #96003
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Daniel Paoliello (dpaoliello)

Changes

The checks when building a thunk to decide if an arg needed to be cast to/from an integer or redirected via a pointer didn't match how arg types were changed in `canonicalizeThunkType`; this caused LLVM to ICE when using vector types as args due to incorrect types in a call instruction.

Instead of duplicating these checks, we should check if the arg type differs between x64 and AArch64 and then cast or redirect as appropriate.

Full diff: https://github.com/llvm/llvm-project/pull/96003.diff

3 Files Affected:
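For context, a minimal IR reproducer sketch along the lines of the new tests (the arm64ec-pc-windows-msvc triple and the llc invocation are assumptions taken from the existing thunk tests, not part of this PR; before the fix, lowering a module like this crashed while building the entry and exit thunks):

; Sketch only: mirrors the small_vector/large_vector cases added in the tests below.
; Assumed invocation: llc -mtriple=arm64ec-pc-windows-msvc < repro.ll
target triple = "arm64ec-pc-windows-msvc"

define <4 x i8> @small_vector(<4 x i8> %v) nounwind {
start:
  ret <4 x i8> %v                   ; defined function => entry thunk with a vector arg
}

declare <8 x i16> @large_vector(<8 x i16>) nounwind

define void @caller() nounwind {
  ; calling an external function with a 16-byte vector arg exercises the exit thunk
  call <8 x i16> @large_vector(<8 x i16> zeroinitializer)
  ret void
}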
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 218201f24aaab..b6cd816d78938 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -387,6 +387,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
SmallVector<Value *> Args;
// Pass the called function in x9.
+ auto X64TyOffset = 1;
Args.push_back(F->arg_begin());
Type *RetTy = Arm64Ty->getReturnType();
@@ -396,10 +397,11 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
// pointer.
if (DL.getTypeStoreSize(RetTy) > 8) {
Args.push_back(IRB.CreateAlloca(RetTy));
+ X64TyOffset++;
}
}
- for (auto &Arg : make_range(F->arg_begin() + 1, F->arg_end())) {
+ for (auto [Arg, X64ArgType] : llvm::zip_equal(make_range(F->arg_begin() + 1, F->arg_end()), make_range(X64Ty->param_begin() + X64TyOffset, X64Ty->param_end()))) {
// Translate arguments from AArch64 calling convention to x86 calling
// convention.
//
@@ -414,8 +416,7 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
// with an attribute.)
//
// The first argument is the called function, stored in x9.
- if (Arg.getType()->isArrayTy() || Arg.getType()->isStructTy() ||
- DL.getTypeStoreSize(Arg.getType()) > 8) {
+ if (Arg.getType() != X64ArgType) {
Value *Mem = IRB.CreateAlloca(Arg.getType());
IRB.CreateStore(&Arg, Mem);
if (DL.getTypeStoreSize(Arg.getType()) <= 8) {
@@ -488,8 +489,7 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) {
Value *Arg = Thunk->getArg(i);
Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset);
- if (ArgTy->isArrayTy() || ArgTy->isStructTy() ||
- DL.getTypeStoreSize(ArgTy) > 8) {
+ if (ArgTy != Arg->getType()) {
// Translate array/struct arguments to the expected type.
if (DL.getTypeStoreSize(ArgTy) <= 8) {
Value *CastAlloca = IRB.CreateAlloca(ArgTy);
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
index 0cf678f56e03c..6aeeeed94543d 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll
@@ -487,6 +487,109 @@ define void @cxx_method(ptr noundef nonnull align 8 dereferenceable(8) %0, ptr d
ret void
}
+define <4 x i8> @small_vector(<4 x i8> %0) {
+; CHECK-LABEL: .def $ientry_thunk$cdecl$m$m;
+; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m$m
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #192
+; CHECK-NEXT: .seh_stackalloc 192
+; CHECK-NEXT: stp q6, q7, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q6, 16
+; CHECK-NEXT: stp q8, q9, [sp, #48] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q8, 48
+; CHECK-NEXT: stp q10, q11, [sp, #80] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q10, 80
+; CHECK-NEXT: stp q12, q13, [sp, #112] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q12, 112
+; CHECK-NEXT: stp q14, q15, [sp, #144] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q14, 144
+; CHECK-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 176
+; CHECK-NEXT: add x29, sp, #176
+; CHECK-NEXT: .seh_add_fp 176
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: str w0, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: blr x9
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: adrp x9, __os_arm64x_dispatch_ret
+; CHECK-NEXT: str s0, [sp, #8]
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: ldr x0, [x9, :lo12:__os_arm64x_dispatch_ret]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 176
+; CHECK-NEXT: ldp q14, q15, [sp, #144] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q14, 144
+; CHECK-NEXT: ldp q12, q13, [sp, #112] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q12, 112
+; CHECK-NEXT: ldp q10, q11, [sp, #80] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q10, 80
+; CHECK-NEXT: ldp q8, q9, [sp, #48] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q8, 48
+; CHECK-NEXT: ldp q6, q7, [sp, #16] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q6, 16
+; CHECK-NEXT: add sp, sp, #192
+; CHECK-NEXT: .seh_stackalloc 192
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x0
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+start:
+ ret <4 x i8> %0
+}
+
+define <8 x i16> @large_vector(<8 x i16> %0) {
+; CHECK-LABEL: .def $ientry_thunk$cdecl$m16$m16;
+; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16$m16
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp q6, q7, [sp, #-192]! // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_px q6, 192
+; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q8, 32
+; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q10, 64
+; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q12, 96
+; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill
+; CHECK-NEXT: .seh_save_any_reg_p q14, 128
+; CHECK-NEXT: str x19, [sp, #160] // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg x19, 160
+; CHECK-NEXT: stp x29, x30, [sp, #168] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 168
+; CHECK-NEXT: add x29, sp, #168
+; CHECK-NEXT: .seh_add_fp 168
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: ldr q0, [x1]
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: blr x9
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret
+; CHECK-NEXT: str q0, [x19]
+; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #168] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 168
+; CHECK-NEXT: ldr x19, [sp, #160] // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg x19, 160
+; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q14, 128
+; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q12, 96
+; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q10, 64
+; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_p q8, 32
+; CHECK-NEXT: ldp q6, q7, [sp], #192 // 32-byte Folded Reload
+; CHECK-NEXT: .seh_save_any_reg_px q6, 192
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x0
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+start:
+ ret <8 x i16> %0
+}
; Verify the hybrid bitmap
; CHECK-LABEL: .section .hybmp$x,"yi"
@@ -523,3 +626,9 @@ define void @cxx_method(ptr noundef nonnull align 8 dereferenceable(8) %0, ptr d
; CHECK-NEXT: .symidx "#cxx_method"
; CHECK-NEXT: .symidx $ientry_thunk$cdecl$i8$i8i8
; CHECK-NEXT: .word 1
+; CHECK-NEXT: .symidx "#small_vector"
+; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m$m
+; CHECK-NEXT: .word 1
+; CHECK-NEXT: .symidx "#large_vector"
+; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m16$m16
+; CHECK-NEXT: .word 1
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
index 7a40fcd85ac58..dcc675839b714 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll
@@ -457,6 +457,109 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
; CHECK-NEXT: .seh_endfunclet
; CHECK-NEXT: .seh_endproc
+declare <4 x i8> @small_vector(<4 x i8> %0) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m$m;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m$m
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add x29, sp, #48
+; CHECK-NEXT: .seh_add_fp 48
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: stur s0, [x29, #-4]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: stur w8, [x29, #-8]
+; CHECK-NEXT: ldur s0, [x29, #-8]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#small_vector$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#small_vector$exit_thunk"
+; CHECK: .weak_anti_dep small_vector
+; CHECK: .weak_anti_dep "#small_vector"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, small_vector
+; CHECK-NEXT: add x11, x11, :lo12:small_vector
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m$m)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m$m)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+declare <8 x i16> @large_vector(<8 x i16> %0) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m16$m16;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16$m16
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: .seh_stackalloc 80
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 64
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: .seh_add_fp 64
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: add x1, sp, #32
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: str q0, [sp, #32]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: ldur q0, [x29, #-16]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 64
+; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: .seh_stackalloc 80
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#large_vector$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#large_vector$exit_thunk"
+; CHECK: .weak_anti_dep large_vector
+; CHECK: .weak_anti_dep "#large_vector"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, large_vector
+; CHECK-NEXT: add x11, x11, :lo12:large_vector
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m16$m16)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m16$m16)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
; CHECK-LABEL: .section .hybmp$x,"yi"
; CHECK-NEXT: .symidx "#func_caller"
; CHECK-NEXT: .symidx $ientry_thunk$cdecl$v$v
@@ -515,6 +618,18 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
; CHECK-NEXT: .symidx "#simple_struct$exit_thunk"
; CHECK-NEXT: .symidx simple_struct
; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx small_vector
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m$m
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#small_vector$exit_thunk"
+; CHECK-NEXT: .symidx small_vector
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx large_vector
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m16$m16
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#large_vector$exit_thunk"
+; CHECK-NEXT: .symidx large_vector
+; CHECK-NEXT: .word 0
define void @func_caller() nounwind {
call void @no_op()
@@ -529,5 +644,7 @@ define void @func_caller() nounwind {
call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x float] [float 0.0, float 0.0])
call [3 x i64] @large_array([3 x i64] [i64 0, i64 0, i64 0], [2 x double] [double 0.0, double 0.0], [2 x [2 x i64]] [[2 x i64] [i64 0, i64 0], [2 x i64] [i64 0, i64 0]])
call %T2 @simple_struct(%T1 { i16 0 }, %T2 { i32 0, float 0.0 }, %T3 { i64 0, double 0.0 }, %T4 { i64 0, double 0.0, i8 0 })
+ call <4 x i8> @small_vector(<4 x i8> <i8 0, i8 0, i8 0, i8 0>)
+ call <8 x i16> @large_vector(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
ret void
}
New approach looks fine; I'd still like an answer to my question about the correct convention for <4 x i8>
LGTM
The checks when building a thunk to decide if an arg needed to be cast to/from an integer or redirected via a pointer didn't match how arg types were changed in `canonicalizeThunkType`; this caused LLVM to ICE when using vector types as args due to incorrect types in a call instruction. Instead of duplicating these checks, we should check if the arg type differs between x64 and AArch64 and then cast or redirect as appropriate.