-
Notifications
You must be signed in to change notification settings - Fork 14.1k
[X86] Fix invalid instructions on x32 with large stack frames #124041
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Fix invalid instructions on x32 with large stack frames #124041
Conversation
`X86FrameLowering::emitSPUpdate()` assumes that 64-bit targets use a 64-bit stack pointer, but that's not true on x32. When checking the stack pointer size, we need to look at `Uses64BitFramePtr` rather than `Is64Bit`. This avoids generating invalid instructions like `add esp, rcx`. For impossibly-large stack frames (4 GiB or larger with a 32-bit stack pointer), we were also generating invalid instructions like `mov eax, 5000000000`. The inline stack probe code already had a check for that situation; I've moved the check into `emitSPUpdate()`, so any attempt to allocate a 4 GiB stack frame with a 32-bit stack pointer will now trap rather than adjusting ESP by the wrong amount. This also fixes the "can't have 32-bit 16GB stack frame" assertion, which used to be triggerable by user code but is now correct. To help catch situations like this in the future, I've added `-verify-machineinstrs` to the stack clash tests that generate large stack frames. This fixes the expensive-checks buildbot failure caused by llvm#113219.
@llvm/pr-subscribers-backend-x86 Author: None (mconst) Changes
For impossibly-large stack frames (4 GiB or larger with a 32-bit stack pointer), we were also generating invalid instructions like To help catch situations like this in the future, I've added This fixes the expensive-checks buildbot failure caused by #113219. @phoebewang @efriedma-quic Full diff: https://github.com/llvm/llvm-project/pull/124041.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 18de38b2d01597..a7b60afb7f5473 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -234,6 +234,14 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
MachineInstr::MIFlag Flag =
isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
+ if (!Uses64BitFramePtr && !isUInt<32>(Offset)) {
+ // We're being asked to adjust a 32-bit stack pointer by 4 GiB or more.
+ // This might be unreachable code, so don't complain now; just trap if
+ // it's reached at runtime.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP));
+ return;
+ }
+
uint64_t Chunk = (1LL << 31) - 1;
MachineFunction &MF = *MBB.getParent();
@@ -253,17 +261,19 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
// Rather than emit a long series of instructions for large offsets,
// load the offset into a register and do one sub/add
unsigned Reg = 0;
- unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+ unsigned Rax = (unsigned)(Uses64BitFramePtr ? X86::RAX : X86::EAX);
if (isSub && !isEAXLiveIn(MBB))
Reg = Rax;
else
- Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
+ Reg = getX86SubSuperRegister(TRI->findDeadCallerSavedReg(MBB, MBBI),
+ Uses64BitFramePtr ? 64 : 32);
- unsigned AddSubRROpc =
- isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+ unsigned AddSubRROpc = isSub ? getSUBrrOpcode(Uses64BitFramePtr)
+ : getADDrrOpcode(Uses64BitFramePtr);
if (Reg) {
- BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)
+ BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Uses64BitFramePtr, Offset)),
+ Reg)
.addImm(Offset)
.setMIFlag(Flag);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
@@ -279,7 +289,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
// addq %rsp, %rax
// xchg %rax, (%rsp)
// movq (%rsp), %rsp
- assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+ assert(Uses64BitFramePtr && "can't have 32-bit 16GB stack frame");
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
.addReg(Rax, RegState::Kill)
.setMIFlag(Flag);
@@ -289,7 +299,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
Offset = -(Offset - SlotSize);
else
Offset = Offset + SlotSize;
- BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)
+ BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Uses64BitFramePtr, Offset)),
+ Rax)
.addImm(Offset)
.setMIFlag(Flag);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
@@ -826,10 +837,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
} else {
- // We're being asked to probe a stack frame that's 4 GiB or larger,
- // but our stack pointer is only 32 bits. This might be unreachable
- // code, so don't complain now; just trap if it's reached at runtime.
- BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP));
+ llvm_unreachable("Offset too large for 32-bit stack pointer");
}
// while in the loop, use loop-invariant reg for CFI,
diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll
index e825328ccd89a2..6629811a59b23d 100644
--- a/llvm/test/CodeGen/X86/huge-stack-offset.ll
+++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll
@@ -13,11 +13,9 @@ define void @foo() nounwind {
; CHECK-64-NEXT: addq [[RAX]], %rsp
; CHECK-32-LABEL: foo:
-; CHECK-32: movl $50000000{{..}}, %eax
-; CHECK-32-NEXT: subl %eax, %esp
+; CHECK-32: ud2
; CHECK-32-NOT: subl $2147483647, %esp
-; CHECK-32: movl $50000000{{..}}, [[EAX:%e..]]
-; CHECK-32-NEXT: addl [[EAX]], %esp
+; CHECK-32: ud2
%1 = alloca [5000000000 x i8], align 16
call void @bar(ptr %1)
ret void
@@ -46,7 +44,7 @@ define i32 @foo3(i32 inreg %x) nounwind {
; CHECK-64-NEXT: subq %rax, %rsp
; CHECK-32-LABEL: foo3:
-; CHECK-32: subl $2147483647, %esp
+; CHECK-32: ud2
; CHECK-32-NOT: movl ${{.*}}, %eax
%1 = alloca [5000000000 x i8], align 16
call void @bar(ptr %1)
diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll
index 59cbcd0689fbf8..d9b20f50e9a884 100644
--- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp
-; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X64 %s
-; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86 %s
-; RUN: llc -mtriple=x86_64-linux-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s
+; RUN: llc -mtriple=x86_64-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X64 %s
+; RUN: llc -mtriple=i686-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X86 %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X32 %s
define i32 @foo() local_unnamed_addr #0 {
; CHECK-X64-LABEL: foo:
@@ -30,44 +30,22 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X86-LABEL: foo:
; CHECK-X86: # %bb.0:
; CHECK-X86-NEXT: ud2
-; CHECK-X86-NEXT: .cfi_def_cfa_register %eax
-; CHECK-X86-NEXT: .cfi_adjust_cfa_offset 4800000000
-; CHECK-X86-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
-; CHECK-X86-NEXT: subl $4096, %esp # imm = 0x1000
-; CHECK-X86-NEXT: movl $0, (%esp)
-; CHECK-X86-NEXT: cmpl %eax, %esp
-; CHECK-X86-NEXT: jne .LBB0_1
-; CHECK-X86-NEXT: # %bb.2:
-; CHECK-X86-NEXT: subl $12, %esp
-; CHECK-X86-NEXT: .cfi_def_cfa_register %esp
; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000016
; CHECK-X86-NEXT: movl $1, 392(%esp)
; CHECK-X86-NEXT: movl $1, 28792(%esp)
; CHECK-X86-NEXT: movl (%esp), %eax
-; CHECK-X86-NEXT: movl $4800000012, %ecx # imm = 0x11E1A300C
-; CHECK-X86-NEXT: addl %ecx, %esp
+; CHECK-X86-NEXT: ud2
; CHECK-X86-NEXT: .cfi_def_cfa_offset 4
; CHECK-X86-NEXT: retl
;
; CHECK-X32-LABEL: foo:
; CHECK-X32: # %bb.0:
; CHECK-X32-NEXT: ud2
-; CHECK-X32-NEXT: .cfi_def_cfa_register %r11
-; CHECK-X32-NEXT: .cfi_adjust_cfa_offset 4799995904
-; CHECK-X32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
-; CHECK-X32-NEXT: subl $4096, %esp # imm = 0x1000
-; CHECK-X32-NEXT: movq $0, (%esp)
-; CHECK-X32-NEXT: cmpl %r11d, %esp
-; CHECK-X32-NEXT: jne .LBB0_1
-; CHECK-X32-NEXT: # %bb.2:
-; CHECK-X32-NEXT: subl $3976, %esp # imm = 0xF88
-; CHECK-X32-NEXT: .cfi_def_cfa_register %rsp
; CHECK-X32-NEXT: .cfi_def_cfa_offset 4799999888
; CHECK-X32-NEXT: movl $1, 264(%esp)
; CHECK-X32-NEXT: movl $1, 28664(%esp)
; CHECK-X32-NEXT: movl -128(%esp), %eax
-; CHECK-X32-NEXT: movabsq $4799999880, %rcx # imm = 0x11E1A2F88
-; CHECK-X32-NEXT: addq %rcx, %esp
+; CHECK-X32-NEXT: ud2
; CHECK-X32-NEXT: .cfi_def_cfa_offset 8
; CHECK-X32-NEXT: retq
%a = alloca i32, i64 1200000000, align 16
diff --git a/llvm/test/CodeGen/X86/stack-clash-huge.ll b/llvm/test/CodeGen/X86/stack-clash-huge.ll
index 03f028dfc25067..c9990773201f0b 100644
--- a/llvm/test/CodeGen/X86/stack-clash-huge.ll
+++ b/llvm/test/CodeGen/X86/stack-clash-huge.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp
-; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X64 %s
-; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86 %s
-; RUN: llc -mtriple=x86_64-linux-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s
+; RUN: llc -mtriple=x86_64-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X64 %s
+; RUN: llc -mtriple=i686-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X86 %s
+; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X32 %s
define i32 @foo() local_unnamed_addr #0 {
; CHECK-X64-LABEL: foo:
@@ -69,7 +69,7 @@ define i32 @foo() local_unnamed_addr #0 {
; CHECK-X32-NEXT: movl $1, 28664(%esp)
; CHECK-X32-NEXT: movl -128(%esp), %eax
; CHECK-X32-NEXT: movl $2399999880, %ecx # imm = 0x8F0D1788
-; CHECK-X32-NEXT: addq %rcx, %esp
+; CHECK-X32-NEXT: addl %ecx, %esp
; CHECK-X32-NEXT: .cfi_def_cfa_offset 8
; CHECK-X32-NEXT: retq
%a = alloca i32, i64 600000000, align 16
|
@@ -234,6 +234,14 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, | |||
MachineInstr::MIFlag Flag = | |||
isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy; | |||
|
|||
if (!Uses64BitFramePtr && !isUInt<32>(Offset)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isUInt<31>
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe it's correct -- it's valid to have a 2.5 GB stack frame on 32-bit x86 (and x32), and I think the code handles it properly. There's even a test case for it (stack-clash-huge.ll
).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, can we split this change into another patch. We should use this to fix the failure of #113219 only.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, done. This means that for now, the stack-clash-extra-huge.ll
test will generate the invalid instruction movl $4799999880, %ecx
on x32, just like the preexisting huge-stack-offset.ll
test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
I think this is ready -- would you be up for merging it? |
@mconst I've merged the PR for you :) Please look out for any buildbot failures. |
Awesome, thanks for your help! I'll watch the buildbots to verify that this fixes the failure from #113219. |
@phoebewang Thanks for all your help reviewing this! For the code I split out (which emits a trap for impossibly large stack frames rather than using invalid instructions like |
Yeah, please. I think it's more friendly for user to get obvious error than some mysterious result. |
If you try to create a stack frame of 4 GiB or larger with a 32-bit stack pointer, we currently emit invalid instructions like `mov eax, 5000000000` (unless you specify `-fstack-clash-protection`, in which case we emit a trap instead). The trap seems nicer, so let's do that in all cases. This avoids emitting invalid instructions, and also fixes the "can't have 32-bit 16GB stack frame" assertion in `X86FrameLowering::emitSPUpdate()` (which used to be triggerable by user code, but is now correct). This was originally part of llvm#124041.
If you try to create a stack frame of 4 GiB or larger with a 32-bit stack pointer, we currently emit invalid instructions like `mov eax, 5000000000` (unless you specify `-fstack-clash-protection`, in which case we emit a trap instead). The trap seems nicer, so let's do that in all cases. This avoids emitting invalid instructions, and also fixes the "can't have 32-bit 16GB stack frame" assertion in `X86FrameLowering::emitSPUpdate()` (which used to be triggerable by user code, but is now correct). This was originally part of #124041. @phoebewang
X86FrameLowering::emitSPUpdate()
assumes that 64-bit targets use a 64-bit stack pointer, but that's not true on x32.When checking the stack pointer size, we need to look at
Uses64BitFramePtr
rather thanIs64Bit
. This avoids generating invalid instructions likeadd esp, rcx
.For impossibly-large stack frames (4 GiB or larger with a 32-bit stack pointer), we were also generating invalid instructions like
mov eax, 5000000000
. The inline stack probe code already had a check for that situation; I've moved the check intoemitSPUpdate()
, so any attempt to allocate a 4 GiB stack frame with a 32-bit stack pointer will now trap rather than adjusting ESP by the wrong amount. This also fixes the "can't have 32-bit 16GB stack frame" assertion, which used to be triggerable by user code but is now correct.To help catch situations like this in the future, I've added
-verify-machineinstrs
to the stack clash tests that generate large stack frames.This fixes the expensive-checks buildbot failure caused by #113219.
@phoebewang @efriedma-quic