-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[LOH] Emit hints for LDP/STP instructions #141297
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Ellis Hoag (ellishg) ChangesSupport more load/store instructions for llvm-project/lld/MachO/Arch/ARM64.cpp Lines 283 to 314 in 1695e8b
In a large binary, we saw Full diff: https://github.com/llvm/llvm-project/pull/141297.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index c3370cd6e946c..ee95efdfbee1e 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -193,6 +193,7 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
switch (MI.getOpcode()) {
default:
return false;
+ // STR
case AArch64::STRBBui:
case AArch64::STRHHui:
case AArch64::STRBui:
@@ -202,12 +203,37 @@ static bool isCandidateStore(const MachineInstr &MI, const MachineOperand &MO) {
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
+ // STUR
+ case AArch64::STURBi:
+ case AArch64::STURBBi:
+ case AArch64::STURHi:
+ case AArch64::STURHHi:
+ case AArch64::STURWi:
+ case AArch64::STURXi:
+ case AArch64::STURSi:
+ case AArch64::STURDi:
+ case AArch64::STURQi:
// We can only optimize the index operand.
// In case we have str xA, [xA, #imm], this is two different uses
// of xA and we cannot fold, otherwise the xA stored may be wrong,
// even if #imm == 0.
return MO.getOperandNo() == 1 &&
MI.getOperand(0).getReg() != MI.getOperand(1).getReg();
+ // STP
+ case AArch64::STPWi:
+ case AArch64::STPXi:
+ case AArch64::STPSi:
+ case AArch64::STPDi:
+ case AArch64::STPQi:
+ // STNP
+ case AArch64::STNPWi:
+ case AArch64::STNPXi:
+ case AArch64::STNPSi:
+ case AArch64::STNPDi:
+ case AArch64::STNPQi:
+ return MO.getOperandNo() == 2 &&
+ MI.getOperand(0).getReg() != MI.getOperand(2).getReg() &&
+ MI.getOperand(1).getReg() != MI.getOperand(2).getReg();
}
}
@@ -217,6 +243,7 @@ static bool isCandidateLoad(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return false;
+ // LDR
case AArch64::LDRSBWui:
case AArch64::LDRSBXui:
case AArch64::LDRSHWui:
@@ -229,11 +256,40 @@ static bool isCandidateLoad(const MachineInstr &MI) {
case AArch64::LDRSui:
case AArch64::LDRDui:
case AArch64::LDRQui:
+ // LDUR
+ case AArch64::LDURBBi:
+ case AArch64::LDURBi:
+ case AArch64::LDURDi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURHi:
+ case AArch64::LDURQi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
return !(MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT);
+ // LDP
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ // LDNP
+ case AArch64::LDNPSi:
+ case AArch64::LDNPDi:
+ case AArch64::LDNPQi:
+ case AArch64::LDNPWi:
+ case AArch64::LDNPXi:
+ return !(MI.getOperand(3).getTargetFlags() & AArch64II::MO_GOT);
}
}
-/// Check whether the given instruction can load a litteral.
+/// Check whether the given instruction can load a literal.
static bool supportLoadFromLiteral(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
index acc0df12a94e8..f8b469efe5afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -1,5 +1,5 @@
-; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s
-; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
+; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s --implicit-check-not=AdrpAddStr
; Test case for <rdar://problem/15942912>.
; AdrpAddStr cannot be used when the store uses same
; register as address and value. Indeed, the related
@@ -7,18 +7,26 @@
; at least provide a wrong one (with the offset folded
; into the definition).
-%struct.anon = type { ptr, ptr }
+@A = internal global i32 0, align 4
-@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
-
-; CHECK-LABEL: _pptp_wan_init
-; CHECK: ret
-; CHECK-NOT: AdrpAddStr
-define i32 @pptp_wan_init() {
+define void @str() {
entry:
- store ptr null, ptr @pptp_wan_head, align 8
- store ptr @pptp_wan_head, ptr getelementptr inbounds (%struct.anon, ptr @pptp_wan_head, i64 0, i32 1), align 8
- ret i32 0
+ store ptr @A, ptr @A, align 4
+ ret void
}
+define void @stp0(i64 %t) {
+entry:
+ %addr = getelementptr inbounds i64, ptr @A, i32 1
+ store ptr @A, ptr @A, align 4
+ store i64 %t, ptr %addr, align 4
+ ret void
+}
+define void @stp1(i64 %t) {
+entry:
+ %addr = getelementptr inbounds i64, ptr @A, i32 1
+ store i64 %t, ptr @A, align 4
+ store ptr @A, ptr %addr, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index 7f2bebf584d8f..6ac899fb41896 100644
--- a/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -71,6 +71,34 @@ define i32 @getC() {
ret i32 %res
}
+; CHECK-LABEL: _getCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCPair() {
+ %res = load <8 x i32>, ptr @C, align 4
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getCNontemporalPair() {
+ %res = load <8 x i32>, ptr @C, align 4, !nontemporal !0
+ ret <8 x i32> %res
+}
+
; LDRSW supports loading from a literal.
; Make sure we emit AdrpLdrGotLdr for those.
; CHECK-LABEL: _getSExtC
@@ -126,6 +154,36 @@ entry:
ret void
}
+; CHECK-LABEL: _setCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCPair(<8 x i32> %t) {
+entry:
+ store <8 x i32> %t, ptr @C, align 4
+ ret void
+}
+
+; CHECK-LABEL: _setCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], [[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [x[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setCNontemporalPair(<8 x i32> %t) {
+entry:
+ store <8 x i32> %t, ptr @C, align 4, !nontemporal !0
+ ret void
+}
+
; Perform the same tests for internal global and a displacement
; in the addressing mode.
; Indeed we will get an ADD for those instead of LOADGot.
@@ -148,6 +206,51 @@ define i32 @getInternalCPlus4() {
ret i32 %res
}
+; CHECK-LABEL: _getInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getInternalCUnscaled() {
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+ %res = load i32, ptr %addr, align 4
+ ret i32 %res
+}
+
+; CHECK-LABEL: _getInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCPair() {
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ %res = load <8 x i32>, ptr %addr, align 4
+ ret <8 x i32> %res
+}
+
+; CHECK-LABEL: _getInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define <8 x i32> @getInternalCNontemporalPair() {
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ %res = load <8 x i32>, ptr %addr, align 4, !nontemporal !0
+ ret <8 x i32> %res
+}
+
; LDRSW supports loading from a literal.
; Make sure we emit AdrpLdrGotLdr for those.
; CHECK-LABEL: _getSExtInternalCPlus4
@@ -206,6 +309,54 @@ entry:
ret void
}
+; CHECK-LABEL: _setInternalCUnscaled
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stur w0, [[[ADDGOT_REG]], #-4]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCUnscaled(i32 %t) {
+entry:
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 -1
+ store i32 %t, ptr %addr, align 4
+ ret void
+}
+
+; CHECK-LABEL: _setInternalCPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCPair(<8 x i32> %t) {
+entry:
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ store <8 x i32> %t, ptr %addr, align 4
+ ret void
+}
+
+; CHECK-LABEL: _setInternalCNontemporalPair
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: stnp q0, q1, [[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @_setInternalCNontemporalPair(<8 x i32> %t) {
+entry:
+ %addr = getelementptr inbounds i32, ptr @InternalC, i32 4
+ store <8 x i32> %t, ptr %addr, align 4, !nontemporal !0
+ ret void
+}
+
; Check that we catch AdrpAddLdr case when we have a simple chain:
; adrp -> ldr.
; CHECK-LABEL: _getInternalC
@@ -679,4 +830,6 @@ if.end.i:
}
declare void @callee(ptr nocapture readonly, ...)
+!0 = !{ i32 1 }
+
attributes #0 = { "target-cpu"="cyclone" }
|
I also noticed that we can get even more
I plan to first implement ldp/stp in LLD, then I will consider implementing this case. |
case AArch64::STNPSi: | ||
case AArch64::STNPDi: | ||
case AArch64::STNPQi: | ||
return MO.getOperandNo() == 2 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment similar to the above including the format of these pair store instructions and what operands mean?
6f31722
to
a342803
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Support more load/store instructions for
.loh
directives. Note that these new instructions are not supported in LLD yet, so they will be skipped for now.llvm-project/lld/MachO/Arch/ARM64.cpp
Lines 283 to 314 in 1695e8b
In a large binary, we saw
NumADDToLDR
increase from 3587 to 4511. I believe this shows the potential for improvement.