-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LLD][COFF] Add support for ARM64EC import call thunks with extended range #109703
Conversation
@llvm/pr-subscribers-lld @llvm/pr-subscribers-lld-coff Author: Jacek Caban (cjacek) ChangesThe MSVC linker generates range extensions for these thunks when needed. This commit inlines the range extension into the thunk, making it both slightly more optimal and easier to implement in LLD. Full diff: https://github.com/llvm/llvm-project/pull/109703.diff 4 Files Affected:
diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp
index 6510c637ae8fe6..c6986681dffe77 100644
--- a/lld/COFF/Chunks.cpp
+++ b/lld/COFF/Chunks.cpp
@@ -1100,6 +1100,13 @@ void CHPERedirectionChunk::writeTo(uint8_t *buf) const {
ImportThunkChunkARM64EC::ImportThunkChunkARM64EC(ImportFile *file)
: ImportThunkChunk(file->ctx, file->impSym), file(file) {}
+size_t ImportThunkChunkARM64EC::getSize() const {
+ if (!extended)
+ return sizeof(importThunkARM64EC);
+ // The last instruction is replaced with an inline range extension thunk.
+ return sizeof(importThunkARM64EC) + sizeof(arm64Thunk) - sizeof(uint32_t);
+}
+
void ImportThunkChunkARM64EC::writeTo(uint8_t *buf) const {
memcpy(buf, importThunkARM64EC, sizeof(importThunkARM64EC));
applyArm64Addr(buf, file->impSym->getRVA(), rva, 12);
@@ -1116,7 +1123,29 @@ void ImportThunkChunkARM64EC::writeTo(uint8_t *buf) const {
applyArm64Imm(buf + 12, exitThunkRVA & 0xfff, 0);
Defined *helper = cast<Defined>(file->ctx.config.arm64ECIcallHelper);
- applyArm64Branch26(buf + 16, helper->getRVA() - rva - 16);
+ if (extended) {
+ // Replace last instruction with an inline range extension thunk.
+ memcpy(buf + 16, arm64Thunk, sizeof(arm64Thunk));
+ applyArm64Addr(buf + 16, helper->getRVA(), rva + 16, 12);
+ applyArm64Imm(buf + 20, helper->getRVA() & 0xfff, 0);
+ } else {
+ applyArm64Branch26(buf + 16, helper->getRVA() - rva - 16);
+ }
+}
+
+bool ImportThunkChunkARM64EC::verifyRanges() {
+ if (extended)
+ return true;
+ auto helper = cast<Defined>(file->ctx.config.arm64ECIcallHelper);
+ return isInt<28>(helper->getRVA() - rva - 16);
+}
+
+uint32_t ImportThunkChunkARM64EC::extendRanges() {
+ if (extended || verifyRanges())
+ return 0;
+ extended = true;
+ // The last instruction is replaced with an inline range extension thunk.
+ return sizeof(arm64Thunk) - sizeof(uint32_t);
}
} // namespace lld::coff
diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h
index 04a656ae0874ec..42284f485e5c07 100644
--- a/lld/COFF/Chunks.h
+++ b/lld/COFF/Chunks.h
@@ -185,6 +185,13 @@ class NonSectionChunk : public Chunk {
// bytes, so this is used only for logging or debugging.
virtual StringRef getDebugName() const { return ""; }
+ // Verify that chunk relocations are within their ranges.
+ virtual bool verifyRanges() { return true; };
+
+ // If needed, extend the chunk to ensure all relocations are within the
+ // allowed ranges. Return the additional space required for the extension.
+ virtual uint32_t extendRanges() { return 0; };
+
static bool classof(const Chunk *c) { return c->kind() >= OtherKind; }
protected:
@@ -620,12 +627,15 @@ class ImportThunkChunkARM64 : public ImportThunkChunk {
class ImportThunkChunkARM64EC : public ImportThunkChunk {
public:
explicit ImportThunkChunkARM64EC(ImportFile *file);
- size_t getSize() const override { return sizeof(importThunkARM64EC); };
+ size_t getSize() const override;
MachineTypes getMachine() const override { return ARM64EC; }
void writeTo(uint8_t *buf) const override;
+ bool verifyRanges() override;
+ uint32_t extendRanges() override;
Defined *exitThunk;
Defined *sym = nullptr;
+ bool extended = false;
private:
ImportFile *file;
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index c2765453aa964e..efab7d3e83709c 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -472,9 +472,15 @@ bool Writer::createThunks(OutputSection *os, int margin) {
// Recheck Chunks.size() each iteration, since we can insert more
// elements into it.
for (size_t i = 0; i != os->chunks.size(); ++i) {
- SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]);
- if (!sc)
+ SectionChunk *sc = dyn_cast<SectionChunk>(os->chunks[i]);
+ if (!sc) {
+ auto chunk = cast<NonSectionChunk>(os->chunks[i]);
+ if (uint32_t size = chunk->extendRanges()) {
+ thunksSize += size;
+ addressesChanged = true;
+ }
continue;
+ }
MachineTypes machine = sc->getMachine();
size_t thunkInsertionSpot = i + 1;
@@ -606,9 +612,12 @@ void Writer::createECCodeMap() {
// Verify that all relocations are in range, with no extra margin requirements.
bool Writer::verifyRanges(const std::vector<Chunk *> chunks) {
for (Chunk *c : chunks) {
- SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c);
- if (!sc)
+ SectionChunk *sc = dyn_cast<SectionChunk>(c);
+ if (!sc) {
+ if (!cast<NonSectionChunk>(c)->verifyRanges())
+ return false;
continue;
+ }
MachineTypes machine = sc->getMachine();
ArrayRef<coff_relocation> relocs = sc->getRelocs();
@@ -872,8 +881,8 @@ bool Writer::fixGnuImportChunks() {
if (!pSec->chunks.empty())
hasIdata = true;
llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) {
- SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s);
- SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t);
+ SectionChunk *sc1 = dyn_cast<SectionChunk>(s);
+ SectionChunk *sc2 = dyn_cast<SectionChunk>(t);
if (!sc1 || !sc2) {
// if SC1, order them ascending. If SC2 or both null,
// S is not less than T.
diff --git a/lld/test/COFF/arm64ec-import-range-ext.test b/lld/test/COFF/arm64ec-import-range-ext.test
new file mode 100644
index 00000000000000..701d4c11cc5646
--- /dev/null
+++ b/lld/test/COFF/arm64ec-import-range-ext.test
@@ -0,0 +1,39 @@
+REQUIRES: aarch64, x86
+RUN: split-file %s %t.dir && cd %t.dir
+
+RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj
+RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj
+RUN: llvm-lib -machine:arm64ec -def:test.def -out:test.lib
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj test.obj test.lib
+
+RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s
+DISASM: 0000000180001000 <.text>:
+DISASM-NEXT: 180001000: 52800000 mov w0, #0x0 // =0
+DISASM-NEXT: 180001004: d65f03c0 ret
+DISASM-NEXT: ...
+DISASM-NEXT: 188001008: b000000b adrp x11, 0x188002000
+DISASM-NEXT: 18800100c: f940016b ldr x11, [x11]
+DISASM-NEXT: 188001010: f0fbffea adrp x10, 0x180000000
+DISASM-NEXT: 188001014: 9100014a add x10, x10, #0x0
+DISASM-NEXT: 188001018: 90fc0010 adrp x16, 0x180001000 <.text>
+DISASM-NEXT: 18800101c: 91000210 add x16, x16, #0x0
+DISASM-NEXT: 188001020: d61f0200 br x16
+
+#--- test.s
+ .text
+ .globl __icall_helper_arm64ec
+ .p2align 2, 0x0
+__icall_helper_arm64ec:
+ mov w0, #0
+ ret
+
+ .space 0x8000000
+
+ .data
+ .rva __imp_func
+
+#--- test.def
+NAME test.dll
+EXPORTS
+ func
|
Depends on #109701. I have mixed feelings about the test. It needs to create a larger file than usual (~129MB). I could remove the file at the end of the test to reduce its footprint, but it will still be slower than typical tests. Is there some common practice for such cases? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM code wise.
.space 0x8000000 | ||
|
||
.data | ||
.rva __imp_func |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this test even require a thunk? We're only referencing __imp_func
, not func
directly, so we should only need the raw IAT entry, no thunks at all? (I guess this is some of the other arm64ec aspects that we've already been through a couple times, that I haven't really internalized yet - sorry about that.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On ARM64EC, __imp_func
references the auxiliary IAT, not the regular one. The auxiliary IAT is guaranteed to be directly callable by EC code, so the caller doesn’t need to use __icall_helper_arm64ec
, as it would for indirect calls. Initially, the auxiliary IAT is filled with __impchk_*
thunks (#107931), which handle the use of __icall_helper_arm64ec
to call via the regular IAT. The OS may later update the auxiliary IAT entries to allow direct EC calls if the callee is also EC.
For ARM64EC, func references the x86 thunk that calls the regular IAT (__imp_aux_func
), while #func
references the ARM thunk that calls the auxiliary IAT (__imp_func
). At runtime, if the imported function is x86, two thunks are involved: #func
calls __imp_func
, which points to __impchk_func
, and this ultimately invokes the imported function pointer stored in __imp_aux_func
using __icall_helper_arm64ec
.
It's unfortunate and annoying, but I guess this is just how we usually handle these cases. After |
…range The MSVC linker generates range extensions for these thunks when needed. This commit inlines the range extension into the thunk, making it both slightly more optimal and easier to implement in LLD.
6d60982
to
3715274
Compare
Merged, thanks for review! |
…range (llvm#109703) The MSVC linker generates range extensions for these thunks when needed. This commit inlines the range extension into the thunk, making it both slightly more optimal and easier to implement in LLD.
The MSVC linker generates range extensions for these thunks when needed. This commit inlines the range extension into the thunk, making it both slightly more optimal and easier to implement in LLD.