-
Notifications
You must be signed in to change notification settings - Fork 13.5k
MachineScheduler: Reset next cluster candidate for each node #139513
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
When a node is picked, we should reset its next cluster candidate to null before releasing its successors/predecessors.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-backend-powerpc Author: Ruiling, Song (ruiling) Changes: When a node is picked, we should reset its next cluster candidate to null before releasing its successors/predecessors. Patch is 1.82 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/139513.diff 110 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 236c55cb04142..e283cf0f392f1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -967,6 +967,12 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
+ // Reset the next successor. For example, we want to cluster A B C.
+ // After A is picked, we will set B as next cluster succ, but if we pick
+ // D instead of B after A, then we need to reset the next cluster succ because
+ // we have decided to not pick the cluster candidate B during pickNode().
+ // Leaving B as the NextClusterSucc just makes things messy.
+ NextClusterSucc = nullptr;
for (SDep &Succ : SU->Succs)
releaseSucc(SU, &Succ);
}
@@ -1004,6 +1010,7 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
+ NextClusterPred = nullptr;
for (SDep &Pred : SU->Preds)
releasePred(SU, &Pred);
}
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ed2e09c6b4d4..7ca6adb1338d3 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -8,11 +8,11 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-NEXT: fmov s0, wzr
; CHECK-NEXT: ldr x11, [sp]
; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x9, x10, [sp, #8]
+; CHECK-NEXT: ldp x8, x10, [sp, #8]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel x8, x5, x9, ne
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x8, x5, x8, ne
; CHECK-NEXT: csel x9, x4, x11, ne
; CHECK-NEXT: stp x9, x8, [x10, #16]
; CHECK-NEXT: csel x8, x3, x7, ne
@@ -36,14 +36,14 @@ define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-NEXT: ldr x10, [sp, #16]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: ldp x9, x8, [sp]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: ldp x8, x9, [sp]
; CHECK-NEXT: csel x11, x2, x6, ne
; CHECK-NEXT: str x11, [x10]
-; CHECK-NEXT: csel x9, x4, x9, ne
-; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: stur x9, [x10, #12]
+; CHECK-NEXT: csel x8, x4, x8, ne
+; CHECK-NEXT: stur x8, [x10, #12]
+; CHECK-NEXT: csel x8, x5, x9, ne
; CHECK-NEXT: csel x9, x3, x7, ne
; CHECK-NEXT: str w8, [x10, #20]
; CHECK-NEXT: str w9, [x10, #8]
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 82114d60c4a93..75f3ffc9515e5 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -667,30 +667,30 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x10, x3, #12
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT: ldr s3, [x0, #12]
-; CHECK-NEXT: ldp s2, s7, [x0, #4]
+; CHECK-NEXT: ldr s4, [x0, #12]
+; CHECK-NEXT: ldp s5, s2, [x2, #4]
; CHECK-NEXT: ldr s6, [x2, #12]
-; CHECK-NEXT: ldp s5, s4, [x2, #4]
-; CHECK-NEXT: ld1 { v3.s }[1], [x11]
+; CHECK-NEXT: ldp s3, s7, [x0, #4]
+; CHECK-NEXT: ld1 { v4.s }[1], [x11]
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
-; CHECK-NEXT: ld1 { v2.s }[1], [x9]
-; CHECK-NEXT: ld1 { v4.s }[1], [x8]
+; CHECK-NEXT: ld1 { v2.s }[1], [x8]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
; CHECK-NEXT: add x8, x1, #8
+; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
-; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
-; CHECK-NEXT: ushll v4.8h, v4.8b, #0
-; CHECK-NEXT: uaddl v3.8h, v5.8b, v6.8b
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: uaddl v3.8h, v3.8b, v4.8b
+; CHECK-NEXT: uaddl v4.8h, v5.8b, v6.8b
; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
-; CHECK-NEXT: uaddw2 v4.8h, v4.8h, v0.16b
-; CHECK-NEXT: ushll v0.4s, v2.4h, #3
-; CHECK-NEXT: ushll v5.4s, v3.4h, #3
+; CHECK-NEXT: uaddw2 v2.8h, v2.8h, v0.16b
+; CHECK-NEXT: ushll v0.4s, v3.4h, #3
+; CHECK-NEXT: ushll v5.4s, v4.4h, #3
+; CHECK-NEXT: ushll2 v4.4s, v4.8h, #3
; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v4.8h
-; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h
+; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v2.8h
+; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
store <4 x i8> %lp1, ptr %z
@@ -1073,24 +1073,24 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
-; CHECK-NEXT: uaddl v16.8h, v2.8b, v3.8b
-; CHECK-NEXT: uaddl v3.8h, v1.8b, v6.8b
-; CHECK-NEXT: uaddl v2.8h, v4.8b, v5.8b
+; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
+; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: uaddl v4.8h, v0.8b, v7.8b
-; CHECK-NEXT: ushll v0.4s, v16.4h, #3
-; CHECK-NEXT: ushll2 v1.4s, v16.8h, #3
-; CHECK-NEXT: ushll2 v18.4s, v16.8h, #0
-; CHECK-NEXT: ushll v6.4s, v2.4h, #3
-; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3
-; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0
+; CHECK-NEXT: ushll2 v0.4s, v2.8h, #0
+; CHECK-NEXT: ushll v5.4s, v2.4h, #3
+; CHECK-NEXT: ushll2 v16.4s, v2.8h, #3
+; CHECK-NEXT: ushll v6.4s, v3.4h, #3
+; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
-; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h
+; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
+; CHECK-NEXT: ushll v19.4s, v3.4h, #0
+; CHECK-NEXT: stp q17, q0, [x4]
+; CHECK-NEXT: uaddw v0.4s, v5.4s, v1.4h
+; CHECK-NEXT: uaddw2 v1.4s, v16.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v4.8h
; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h
-; CHECK-NEXT: ushll v4.4s, v16.4h, #0
-; CHECK-NEXT: stp q17, q5, [x4, #32]
-; CHECK-NEXT: stp q4, q18, [x4]
+; CHECK-NEXT: stp q19, q18, [x4, #32]
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
@@ -1176,19 +1176,20 @@ define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
-; CHECK-NEXT: uaddl v4.8h, v1.8b, v6.8b
-; CHECK-NEXT: ushll v5.4s, v2.4h, #3
-; CHECK-NEXT: ushll2 v6.4s, v2.8h, #3
-; CHECK-NEXT: uaddl v2.8h, v0.8b, v7.8b
-; CHECK-NEXT: ushll v7.4s, v3.4h, #3
-; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3
-; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v4.8h
-; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h
-; CHECK-NEXT: stp q5, q6, [x4]
-; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h
-; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h
-; CHECK-NEXT: stp q7, q16, [x4, #32]
+; CHECK-NEXT: uaddl v5.8h, v0.8b, v7.8b
+; CHECK-NEXT: ushll v4.4s, v2.4h, #3
+; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
+; CHECK-NEXT: ushll v6.4s, v3.4h, #3
+; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
+; CHECK-NEXT: uaddw v0.4s, v4.4s, v1.4h
+; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-NEXT: str q4, [x4]
+; CHECK-NEXT: stp q2, q6, [x4, #16]
+; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v5.8h
+; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h
+; CHECK-NEXT: str q7, [x4, #48]
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 9c4f0207b84ce..ae3b6a54a1f7f 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2825,42 +2825,42 @@ define <32 x i64> @fptos_v32f32_v32i64(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s
; CHECK-SD-NEXT: fcvtl2 v17.2d, v6.4s
; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s
-; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
-; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
+; CHECK-SD-NEXT: fcvtl2 v21.2d, v2.4s
+; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
; CHECK-SD-NEXT: fcvtl2 v19.2d, v4.4s
; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s
+; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
; CHECK-SD-NEXT: fcvtl2 v20.2d, v3.4s
+; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzs v19.2d, v19.2d
+; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
+; CHECK-SD-NEXT: fcvtzs v20.2d, v20.2d
; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
-; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-SD-NEXT: stp q7, q16, [x8, #224]
-; CHECK-SD-NEXT: fcvtl2 v7.2d, v2.4s
-; CHECK-SD-NEXT: fcvtzs v16.2d, v19.2d
-; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
-; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-SD-NEXT: fcvtl2 v5.2d, v0.4s
+; CHECK-SD-NEXT: fcvtzs v16.2d, v21.2d
; CHECK-SD-NEXT: stp q6, q17, [x8, #192]
-; CHECK-SD-NEXT: fcvtl2 v6.2d, v1.4s
-; CHECK-SD-NEXT: fcvtzs v17.2d, v20.2d
+; CHECK-SD-NEXT: fcvtl2 v17.2d, v1.4s
; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s
+; CHECK-SD-NEXT: stp q4, q19, [x8, #128]
+; CHECK-SD-NEXT: stp q3, q20, [x8, #96]
+; CHECK-SD-NEXT: stp q2, q16, [x8, #64]
+; CHECK-SD-NEXT: fcvtl2 v16.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: stp q4, q16, [x8, #128]
-; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v4.2d, v6.2d
-; CHECK-SD-NEXT: stp q3, q17, [x8, #96]
-; CHECK-SD-NEXT: fcvtzs v3.2d, v5.2d
+; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
+; CHECK-SD-NEXT: fcvtzs v6.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-SD-NEXT: fcvtzs v4.2d, v16.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: stp q2, q7, [x8, #64]
-; CHECK-SD-NEXT: stp q0, q3, [x8]
-; CHECK-SD-NEXT: stp q1, q4, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q6, [x8, #32]
+; CHECK-SD-NEXT: stp q0, q4, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f32_v32i64:
@@ -2918,42 +2918,42 @@ define <32 x i64> @fptou_v32f32_v32i64(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s
; CHECK-SD-NEXT: fcvtl2 v17.2d, v6.4s
; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s
-; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
-; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
+; CHECK-SD-NEXT: fcvtl2 v21.2d, v2.4s
+; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
; CHECK-SD-NEXT: fcvtl2 v19.2d, v4.4s
; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s
+; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
; CHECK-SD-NEXT: fcvtl2 v20.2d, v3.4s
+; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s
; CHECK-SD-NEXT: fcvtzu v16.2d, v16.2d
; CHECK-SD-NEXT: fcvtzu v7.2d, v7.2d
; CHECK-SD-NEXT: fcvtzu v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzu v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzu v2.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzu v19.2d, v19.2d
+; CHECK-SD-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzu v18.2d, v18.2d
+; CHECK-SD-NEXT: fcvtzu v20.2d, v20.2d
; CHECK-SD-NEXT: fcvtzu v5.2d, v5.2d
-; CHECK-SD-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzu v3.2d, v3.2d
; CHECK-SD-NEXT: stp q7, q16, [x8, #224]
-; CHECK-SD-NEXT: fcvtl2 v7.2d, v2.4s
-; CHECK-SD-NEXT: fcvtzu v16.2d, v19.2d
-; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
-; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-SD-NEXT: fcvtl2 v5.2d, v0.4s
+; CHECK-SD-NEXT: fcvtzu v16.2d, v21.2d
; CHECK-SD-NEXT: stp q6, q17, [x8, #192]
-; CHECK-SD-NEXT: fcvtl2 v6.2d, v1.4s
-; CHECK-SD-NEXT: fcvtzu v17.2d, v20.2d
+; CHECK-SD-NEXT: fcvtl2 v17.2d, v1.4s
; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s
+; CHECK-SD-NEXT: stp q4, q19, [x8, #128]
+; CHECK-SD-NEXT: stp q3, q20, [x8, #96]
+; CHECK-SD-NEXT: stp q2, q16, [x8, #64]
+; CHECK-SD-NEXT: fcvtl2 v16.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: stp q4, q16, [x8, #128]
-; CHECK-SD-NEXT: fcvtzu v7.2d, v7.2d
-; CHECK-SD-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzu v4.2d, v6.2d
-; CHECK-SD-NEXT: stp q3, q17, [x8, #96]
-; CHECK-SD-NEXT: fcvtzu v3.2d, v5.2d
+; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
+; CHECK-SD-NEXT: fcvtzu v6.2d, v17.2d
; CHECK-SD-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-SD-NEXT: fcvtzu v4.2d, v16.2d
; CHECK-SD-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-SD-NEXT: stp q2, q7, [x8, #64]
-; CHECK-SD-NEXT: stp q0, q3, [x8]
-; CHECK-SD-NEXT: stp q1, q4, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q6, [x8, #32]
+; CHECK-SD-NEXT: stp q0, q4, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v32f32_v32i64:
@@ -5244,45 +5244,45 @@ define <32 x i64> @fptos_v32f16_v32i64(<32 x half> %a) {
; CHECK-GI-FP16-NEXT: mov v17.d[1], v23.d[0]
; CHECK-GI-FP16-NEXT: mov v1.d[1], v29.d[0]
; CHECK-GI-FP16-NEXT: mov v19.d[1], v30.d[0]
-; CHECK-GI-FP16-NEXT: mov h21, v3.h[1]
+; CHECK-GI-FP16-NEXT: mov h16, v3.h[1]
; CHECK-GI-FP16-NEXT: stp q6, q5, [x8, #32]
; CHECK-GI-FP16-NEXT: mov v20.d[1], v22.d[0]
-; CHECK-GI-FP16-NEXT: mov h16, v3.h[2]
+; CHECK-GI-FP16-NEXT: mov h21, v3.h[2]
; CHECK-GI-FP16-NEXT: mov h7, v3.h[3]
; CHECK-GI-FP16-NEXT: mov h22, v3.h[4]
-; CHECK-GI-FP16-NEXT: mov h23, v3.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v3.h[6]
+; CHECK-GI-FP16-NEXT: mov h6, v3.h[5]
+; CHECK-GI-FP16-NEXT: mov h23, v3.h[6]
; CHECK-GI-FP16-NEXT: mov h5, v3.h[7]
; CHECK-GI-FP16-NEXT: mov v18.d[1], v24.d[0]
; CHECK-GI-FP16-NEXT: mov v2.d[1], v25.d[0]
; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d21, h21
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-GI-FP16-NEXT: fcvt d16, h16
+; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-GI-FP16-NEXT: fcvt d21, h21
; CHECK-GI-FP16-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-GI-FP16-NEXT: fcvt d7, h7
; CHECK-GI-FP16-NEXT: fcvt d22, h22
-; CHECK-GI-FP16-NEXT: fcvt d23, h23
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: fcvt d6, h6
+; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: fcvt d23, h23
; CHECK-GI-FP16-NEXT: fcvt d5, h5
; CHECK-GI-FP16-NEXT: fcvtzs v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v21.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzs v20.2d, v20.2d
+; CHECK-GI-FP16-NEXT: mov v3.d[1], v16.d[0]
+; CHECK-GI-FP16-NEXT: fcvtzs v16.2d, v20.2d
; CHECK-GI-FP16-NEXT: stp q0, q4, [x8, #64]
; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v17.2d
; CHECK-GI-FP16-NEXT: fcvtzs v4.2d, v18.2d
-; CHECK-GI-FP16-NEXT: mov v16.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v22.d[1], v23.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v5.d[0]
+; CHECK-GI-FP16-NEXT: mov v21.d[1], v7.d[0]
+; CHECK-GI-FP16-NEXT: mov v22.d[1], v6.d[0]
+; CHECK-GI-FP16-NEXT: mov v23.d[1], v5.d[0]
; CHECK-GI-FP16-NEXT: stp q1, q19, [x8, #96]
; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-GI-FP16-NEXT: fcvtzs v2.2d, v3.2d
-; CHECK-GI-FP16-NEXT: stp q20, q0, [x8, #128]
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v16.2d
+; CHECK-GI-FP16-NEXT: stp q16, q0, [x8, #128]
+; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v21.2d
; CHECK-GI-FP16-NEXT: fcvtzs v3.2d, v22.2d
; CHECK-GI-FP16-NEXT: stp q4, q1, [x8, #160]
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v6.2d
+; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v23.2d
; CHECK-GI-FP16-NEXT: stp q2, q0, [x8, #192]
; CHECK-GI-FP16-NEXT: stp q3, q1, [x8, #224]
; CHECK-GI-FP16-NEXT: ret
@@ -5645,45 +5645,45 @@ define <32 x i64> @fptou_v32f16_v32i64(<32 x half> %a) {
; CHECK-GI-FP16-NEXT: mov v17.d[1], v23.d[0]
; CHECK-GI-FP16-NEXT: mov v1.d[1], v29.d[0]
; CHECK-GI-FP16-NEXT: mov v19.d[1], v30.d[0]
-; CHECK-GI-FP16-NEXT: mov h21, v3.h[1]
+; CHECK-GI-FP16-NEXT: mov h16, v3.h[1]
; CHECK-GI-FP16-NEXT: stp q6, q5, [x8, #32]
; CHECK-GI-FP16-NEXT: mov v20.d[1], v22.d[0]
-; CHECK-GI-FP16-NEXT: mov h16, v3.h[2]
+; CHECK-GI-FP16-NEXT: mov h21, v3.h[2]
; CHECK-GI-FP16-NEXT: mov h7, v3.h[3]
; CHECK-GI-FP16-NEXT: mov h22, v3.h[4]
-; CHECK-GI-FP16-NEXT: mov h23, v3.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v3.h[6]
+; CHECK-GI-FP16-NEXT: mov h6, v3.h[5]
+; CHECK-GI-FP16-NEXT: mov h23, v3.h[6]
; CHECK-GI-FP16-NEXT: mov h5, v3.h[7]
; CHECK-GI-FP16-NEXT: mov v18.d[1], v24.d[0]
; CHECK-GI-FP16-NEXT: mov v2.d[1], v25.d[0]
; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d21, h21
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-FP16-NEXT: fcvt d16, h16
+; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-GI-FP16-NEXT: fcvt d21, h21
; CHECK-GI-FP16-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-GI-FP16-NEXT: fcvt d7, h7
; CHECK-GI-FP16-NEXT: fcvt d22, h22
-; CHECK-GI-FP16-NEXT: fcvt d23, h23
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: fcvt d6, h6
+; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: fcvt d23, h23
; CHECK-GI-FP16-NEXT: fcvt d5, h5
; CHECK-GI-FP16-NEXT: fcvtzu v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v21.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzu v20.2d, v20.2d
+; CHECK-GI-FP16-NEXT: mov v3.d[1], v16.d[0]
+; CHECK-GI-FP16-NEXT: fcvtzu v16.2d, v20.2d
; CHECK-GI-FP16-NEXT: stp q0, q4, [x8, #64]
; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v17.2d
; CHECK-GI-FP16-NEXT: fcvtzu v4.2d, v18.2d
-; CHECK-GI-FP16-NEXT: mov v16.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v22.d[1], v23.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v5.d[0]
+; CHECK-GI-FP16-NEXT: mov v21.d[1], v7.d[0]
+; CHECK-GI-FP16-NEXT: mov v22.d[1], v6.d[0]
+; CHECK-GI-FP16-NEXT: mov v23.d[1], v5.d[0]
; CHECK-GI-FP16-NEXT: stp q1, q19, [x8, #96]
; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d
; CHECK-GI-FP16-NEXT: fcvtzu v2.2d, v3.2d
-; CHECK-GI-FP16-NEXT: stp q20, q0, [x8, #128]
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v16.2d
+; CHECK-GI-FP16-NEXT: stp q16, q0, [x8, #128]
+; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v21.2d
; CHECK-GI-FP16-NEXT: fcvtzu v3.2d, v22.2d
; CHECK-GI-FP16-NEXT: stp q4, q1, [x8, #160]
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v6.2d
+; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v23.2d
; CHECK-GI-FP16-NEXT: stp q2, q0, [x8, #192]
; CHECK-GI-FP16-NEXT: stp q3, q1, [x8, #224]
; CHECK-GI-FP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index f2c4e976b8c16..b1b5154a57b4d 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -3521,31 +3521,31 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: extr x8, x21, x27, #28
-; CHECK-NEXT: extr x9, x29, x20, #28
+; CHECK-NEXT: str x24, [x19]
+; CHECK-NEXT: bfi x22, x20, #36, #28
; CHECK-NEXT: stur x28, [x19, #75]
+; CHECK-NEXT: extr x9, x29, x20, #28
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: bfi x22, x20, #36, #28
-; CHECK-NEXT: lsr x11, x29, #28
; CHECK-NEXT: stur x8, [x19, #41]
-; CHECK-NEXT: str x9, [x19, #16]
-; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: stp x22, x9, [x19, #8]
+; CHECK-NEXT: lsr x9, x29, #28
; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: csel x10, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: stp x24, x22, [x19]
-; CHECK-NEXT: stur x10, [x19, #50]
-; CHECK-NEXT: lsr x10, x21, #28
-; CHECK-NEX...
[truncated]
|
@llvm/pr-subscribers-backend-aarch64 Author: Ruiling, Song (ruiling) Changes: When a node is picked, we should reset its next cluster candidate to null before releasing its successors/predecessors. Patch is 1.82 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/139513.diff 110 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 236c55cb04142..e283cf0f392f1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -967,6 +967,12 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
+ // Reset the next successor. For example, we want to cluster A B C.
+ // After A is picked, we will set B as next cluster succ, but if we pick
+ // D instead of B after A, then we need to reset the next cluster succ because
+ // we have decided to not pick the cluster candidate B during pickNode().
+ // Leaving B as the NextClusterSucc just makes things messy.
+ NextClusterSucc = nullptr;
for (SDep &Succ : SU->Succs)
releaseSucc(SU, &Succ);
}
@@ -1004,6 +1010,7 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
+ NextClusterPred = nullptr;
for (SDep &Pred : SU->Preds)
releasePred(SU, &Pred);
}
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ed2e09c6b4d4..7ca6adb1338d3 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -8,11 +8,11 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-NEXT: fmov s0, wzr
; CHECK-NEXT: ldr x11, [sp]
; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x9, x10, [sp, #8]
+; CHECK-NEXT: ldp x8, x10, [sp, #8]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel x8, x5, x9, ne
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x8, x5, x8, ne
; CHECK-NEXT: csel x9, x4, x11, ne
; CHECK-NEXT: stp x9, x8, [x10, #16]
; CHECK-NEXT: csel x8, x3, x7, ne
@@ -36,14 +36,14 @@ define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-NEXT: ldr x10, [sp, #16]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: ldp x9, x8, [sp]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: ldp x8, x9, [sp]
; CHECK-NEXT: csel x11, x2, x6, ne
; CHECK-NEXT: str x11, [x10]
-; CHECK-NEXT: csel x9, x4, x9, ne
-; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: stur x9, [x10, #12]
+; CHECK-NEXT: csel x8, x4, x8, ne
+; CHECK-NEXT: stur x8, [x10, #12]
+; CHECK-NEXT: csel x8, x5, x9, ne
; CHECK-NEXT: csel x9, x3, x7, ne
; CHECK-NEXT: str w8, [x10, #20]
; CHECK-NEXT: str w9, [x10, #8]
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 82114d60c4a93..75f3ffc9515e5 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -667,30 +667,30 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x10, x3, #12
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT: ldr s3, [x0, #12]
-; CHECK-NEXT: ldp s2, s7, [x0, #4]
+; CHECK-NEXT: ldr s4, [x0, #12]
+; CHECK-NEXT: ldp s5, s2, [x2, #4]
; CHECK-NEXT: ldr s6, [x2, #12]
-; CHECK-NEXT: ldp s5, s4, [x2, #4]
-; CHECK-NEXT: ld1 { v3.s }[1], [x11]
+; CHECK-NEXT: ldp s3, s7, [x0, #4]
+; CHECK-NEXT: ld1 { v4.s }[1], [x11]
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
-; CHECK-NEXT: ld1 { v2.s }[1], [x9]
-; CHECK-NEXT: ld1 { v4.s }[1], [x8]
+; CHECK-NEXT: ld1 { v2.s }[1], [x8]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
; CHECK-NEXT: add x8, x1, #8
+; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
-; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
-; CHECK-NEXT: ushll v4.8h, v4.8b, #0
-; CHECK-NEXT: uaddl v3.8h, v5.8b, v6.8b
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: uaddl v3.8h, v3.8b, v4.8b
+; CHECK-NEXT: uaddl v4.8h, v5.8b, v6.8b
; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
-; CHECK-NEXT: uaddw2 v4.8h, v4.8h, v0.16b
-; CHECK-NEXT: ushll v0.4s, v2.4h, #3
-; CHECK-NEXT: ushll v5.4s, v3.4h, #3
+; CHECK-NEXT: uaddw2 v2.8h, v2.8h, v0.16b
+; CHECK-NEXT: ushll v0.4s, v3.4h, #3
+; CHECK-NEXT: ushll v5.4s, v4.4h, #3
+; CHECK-NEXT: ushll2 v4.4s, v4.8h, #3
; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v4.8h
-; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h
+; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v2.8h
+; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
store <4 x i8> %lp1, ptr %z
@@ -1073,24 +1073,24 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
-; CHECK-NEXT: uaddl v16.8h, v2.8b, v3.8b
-; CHECK-NEXT: uaddl v3.8h, v1.8b, v6.8b
-; CHECK-NEXT: uaddl v2.8h, v4.8b, v5.8b
+; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
+; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: uaddl v4.8h, v0.8b, v7.8b
-; CHECK-NEXT: ushll v0.4s, v16.4h, #3
-; CHECK-NEXT: ushll2 v1.4s, v16.8h, #3
-; CHECK-NEXT: ushll2 v18.4s, v16.8h, #0
-; CHECK-NEXT: ushll v6.4s, v2.4h, #3
-; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3
-; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0
+; CHECK-NEXT: ushll2 v0.4s, v2.8h, #0
+; CHECK-NEXT: ushll v5.4s, v2.4h, #3
+; CHECK-NEXT: ushll2 v16.4s, v2.8h, #3
+; CHECK-NEXT: ushll v6.4s, v3.4h, #3
+; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
-; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h
+; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
+; CHECK-NEXT: ushll v19.4s, v3.4h, #0
+; CHECK-NEXT: stp q17, q0, [x4]
+; CHECK-NEXT: uaddw v0.4s, v5.4s, v1.4h
+; CHECK-NEXT: uaddw2 v1.4s, v16.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v4.8h
; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h
-; CHECK-NEXT: ushll v4.4s, v16.4h, #0
-; CHECK-NEXT: stp q17, q5, [x4, #32]
-; CHECK-NEXT: stp q4, q18, [x4]
+; CHECK-NEXT: stp q19, q18, [x4, #32]
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
@@ -1176,19 +1176,20 @@ define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
-; CHECK-NEXT: uaddl v4.8h, v1.8b, v6.8b
-; CHECK-NEXT: ushll v5.4s, v2.4h, #3
-; CHECK-NEXT: ushll2 v6.4s, v2.8h, #3
-; CHECK-NEXT: uaddl v2.8h, v0.8b, v7.8b
-; CHECK-NEXT: ushll v7.4s, v3.4h, #3
-; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3
-; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v4.8h
-; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h
-; CHECK-NEXT: stp q5, q6, [x4]
-; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h
-; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h
-; CHECK-NEXT: stp q7, q16, [x4, #32]
+; CHECK-NEXT: uaddl v5.8h, v0.8b, v7.8b
+; CHECK-NEXT: ushll v4.4s, v2.4h, #3
+; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
+; CHECK-NEXT: ushll v6.4s, v3.4h, #3
+; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
+; CHECK-NEXT: uaddw v0.4s, v4.4s, v1.4h
+; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-NEXT: str q4, [x4]
+; CHECK-NEXT: stp q2, q6, [x4, #16]
+; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v5.8h
+; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h
+; CHECK-NEXT: str q7, [x4, #48]
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 9c4f0207b84ce..ae3b6a54a1f7f 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2825,42 +2825,42 @@ define <32 x i64> @fptos_v32f32_v32i64(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s
; CHECK-SD-NEXT: fcvtl2 v17.2d, v6.4s
; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s
-; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
-; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
+; CHECK-SD-NEXT: fcvtl2 v21.2d, v2.4s
+; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
; CHECK-SD-NEXT: fcvtl2 v19.2d, v4.4s
; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s
+; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
; CHECK-SD-NEXT: fcvtl2 v20.2d, v3.4s
+; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzs v19.2d, v19.2d
+; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
+; CHECK-SD-NEXT: fcvtzs v20.2d, v20.2d
; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
-; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-SD-NEXT: stp q7, q16, [x8, #224]
-; CHECK-SD-NEXT: fcvtl2 v7.2d, v2.4s
-; CHECK-SD-NEXT: fcvtzs v16.2d, v19.2d
-; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
-; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-SD-NEXT: fcvtl2 v5.2d, v0.4s
+; CHECK-SD-NEXT: fcvtzs v16.2d, v21.2d
; CHECK-SD-NEXT: stp q6, q17, [x8, #192]
-; CHECK-SD-NEXT: fcvtl2 v6.2d, v1.4s
-; CHECK-SD-NEXT: fcvtzs v17.2d, v20.2d
+; CHECK-SD-NEXT: fcvtl2 v17.2d, v1.4s
; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s
+; CHECK-SD-NEXT: stp q4, q19, [x8, #128]
+; CHECK-SD-NEXT: stp q3, q20, [x8, #96]
+; CHECK-SD-NEXT: stp q2, q16, [x8, #64]
+; CHECK-SD-NEXT: fcvtl2 v16.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: stp q4, q16, [x8, #128]
-; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v4.2d, v6.2d
-; CHECK-SD-NEXT: stp q3, q17, [x8, #96]
-; CHECK-SD-NEXT: fcvtzs v3.2d, v5.2d
+; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
+; CHECK-SD-NEXT: fcvtzs v6.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-SD-NEXT: fcvtzs v4.2d, v16.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: stp q2, q7, [x8, #64]
-; CHECK-SD-NEXT: stp q0, q3, [x8]
-; CHECK-SD-NEXT: stp q1, q4, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q6, [x8, #32]
+; CHECK-SD-NEXT: stp q0, q4, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f32_v32i64:
@@ -2918,42 +2918,42 @@ define <32 x i64> @fptou_v32f32_v32i64(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s
; CHECK-SD-NEXT: fcvtl2 v17.2d, v6.4s
; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s
-; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
-; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
+; CHECK-SD-NEXT: fcvtl2 v21.2d, v2.4s
+; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
; CHECK-SD-NEXT: fcvtl2 v19.2d, v4.4s
; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s
+; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
; CHECK-SD-NEXT: fcvtl2 v20.2d, v3.4s
+; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s
; CHECK-SD-NEXT: fcvtzu v16.2d, v16.2d
; CHECK-SD-NEXT: fcvtzu v7.2d, v7.2d
; CHECK-SD-NEXT: fcvtzu v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzu v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzu v2.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzu v19.2d, v19.2d
+; CHECK-SD-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzu v18.2d, v18.2d
+; CHECK-SD-NEXT: fcvtzu v20.2d, v20.2d
; CHECK-SD-NEXT: fcvtzu v5.2d, v5.2d
-; CHECK-SD-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzu v3.2d, v3.2d
; CHECK-SD-NEXT: stp q7, q16, [x8, #224]
-; CHECK-SD-NEXT: fcvtl2 v7.2d, v2.4s
-; CHECK-SD-NEXT: fcvtzu v16.2d, v19.2d
-; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
-; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-SD-NEXT: fcvtl2 v5.2d, v0.4s
+; CHECK-SD-NEXT: fcvtzu v16.2d, v21.2d
; CHECK-SD-NEXT: stp q6, q17, [x8, #192]
-; CHECK-SD-NEXT: fcvtl2 v6.2d, v1.4s
-; CHECK-SD-NEXT: fcvtzu v17.2d, v20.2d
+; CHECK-SD-NEXT: fcvtl2 v17.2d, v1.4s
; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s
+; CHECK-SD-NEXT: stp q4, q19, [x8, #128]
+; CHECK-SD-NEXT: stp q3, q20, [x8, #96]
+; CHECK-SD-NEXT: stp q2, q16, [x8, #64]
+; CHECK-SD-NEXT: fcvtl2 v16.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: stp q4, q16, [x8, #128]
-; CHECK-SD-NEXT: fcvtzu v7.2d, v7.2d
-; CHECK-SD-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzu v4.2d, v6.2d
-; CHECK-SD-NEXT: stp q3, q17, [x8, #96]
-; CHECK-SD-NEXT: fcvtzu v3.2d, v5.2d
+; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
+; CHECK-SD-NEXT: fcvtzu v6.2d, v17.2d
; CHECK-SD-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-SD-NEXT: fcvtzu v4.2d, v16.2d
; CHECK-SD-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-SD-NEXT: stp q2, q7, [x8, #64]
-; CHECK-SD-NEXT: stp q0, q3, [x8]
-; CHECK-SD-NEXT: stp q1, q4, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q6, [x8, #32]
+; CHECK-SD-NEXT: stp q0, q4, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v32f32_v32i64:
@@ -5244,45 +5244,45 @@ define <32 x i64> @fptos_v32f16_v32i64(<32 x half> %a) {
; CHECK-GI-FP16-NEXT: mov v17.d[1], v23.d[0]
; CHECK-GI-FP16-NEXT: mov v1.d[1], v29.d[0]
; CHECK-GI-FP16-NEXT: mov v19.d[1], v30.d[0]
-; CHECK-GI-FP16-NEXT: mov h21, v3.h[1]
+; CHECK-GI-FP16-NEXT: mov h16, v3.h[1]
; CHECK-GI-FP16-NEXT: stp q6, q5, [x8, #32]
; CHECK-GI-FP16-NEXT: mov v20.d[1], v22.d[0]
-; CHECK-GI-FP16-NEXT: mov h16, v3.h[2]
+; CHECK-GI-FP16-NEXT: mov h21, v3.h[2]
; CHECK-GI-FP16-NEXT: mov h7, v3.h[3]
; CHECK-GI-FP16-NEXT: mov h22, v3.h[4]
-; CHECK-GI-FP16-NEXT: mov h23, v3.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v3.h[6]
+; CHECK-GI-FP16-NEXT: mov h6, v3.h[5]
+; CHECK-GI-FP16-NEXT: mov h23, v3.h[6]
; CHECK-GI-FP16-NEXT: mov h5, v3.h[7]
; CHECK-GI-FP16-NEXT: mov v18.d[1], v24.d[0]
; CHECK-GI-FP16-NEXT: mov v2.d[1], v25.d[0]
; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d21, h21
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-GI-FP16-NEXT: fcvt d16, h16
+; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-GI-FP16-NEXT: fcvt d21, h21
; CHECK-GI-FP16-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-GI-FP16-NEXT: fcvt d7, h7
; CHECK-GI-FP16-NEXT: fcvt d22, h22
-; CHECK-GI-FP16-NEXT: fcvt d23, h23
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: fcvt d6, h6
+; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: fcvt d23, h23
; CHECK-GI-FP16-NEXT: fcvt d5, h5
; CHECK-GI-FP16-NEXT: fcvtzs v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v21.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzs v20.2d, v20.2d
+; CHECK-GI-FP16-NEXT: mov v3.d[1], v16.d[0]
+; CHECK-GI-FP16-NEXT: fcvtzs v16.2d, v20.2d
; CHECK-GI-FP16-NEXT: stp q0, q4, [x8, #64]
; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v17.2d
; CHECK-GI-FP16-NEXT: fcvtzs v4.2d, v18.2d
-; CHECK-GI-FP16-NEXT: mov v16.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v22.d[1], v23.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v5.d[0]
+; CHECK-GI-FP16-NEXT: mov v21.d[1], v7.d[0]
+; CHECK-GI-FP16-NEXT: mov v22.d[1], v6.d[0]
+; CHECK-GI-FP16-NEXT: mov v23.d[1], v5.d[0]
; CHECK-GI-FP16-NEXT: stp q1, q19, [x8, #96]
; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-GI-FP16-NEXT: fcvtzs v2.2d, v3.2d
-; CHECK-GI-FP16-NEXT: stp q20, q0, [x8, #128]
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v16.2d
+; CHECK-GI-FP16-NEXT: stp q16, q0, [x8, #128]
+; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v21.2d
; CHECK-GI-FP16-NEXT: fcvtzs v3.2d, v22.2d
; CHECK-GI-FP16-NEXT: stp q4, q1, [x8, #160]
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v6.2d
+; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v23.2d
; CHECK-GI-FP16-NEXT: stp q2, q0, [x8, #192]
; CHECK-GI-FP16-NEXT: stp q3, q1, [x8, #224]
; CHECK-GI-FP16-NEXT: ret
@@ -5645,45 +5645,45 @@ define <32 x i64> @fptou_v32f16_v32i64(<32 x half> %a) {
; CHECK-GI-FP16-NEXT: mov v17.d[1], v23.d[0]
; CHECK-GI-FP16-NEXT: mov v1.d[1], v29.d[0]
; CHECK-GI-FP16-NEXT: mov v19.d[1], v30.d[0]
-; CHECK-GI-FP16-NEXT: mov h21, v3.h[1]
+; CHECK-GI-FP16-NEXT: mov h16, v3.h[1]
; CHECK-GI-FP16-NEXT: stp q6, q5, [x8, #32]
; CHECK-GI-FP16-NEXT: mov v20.d[1], v22.d[0]
-; CHECK-GI-FP16-NEXT: mov h16, v3.h[2]
+; CHECK-GI-FP16-NEXT: mov h21, v3.h[2]
; CHECK-GI-FP16-NEXT: mov h7, v3.h[3]
; CHECK-GI-FP16-NEXT: mov h22, v3.h[4]
-; CHECK-GI-FP16-NEXT: mov h23, v3.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v3.h[6]
+; CHECK-GI-FP16-NEXT: mov h6, v3.h[5]
+; CHECK-GI-FP16-NEXT: mov h23, v3.h[6]
; CHECK-GI-FP16-NEXT: mov h5, v3.h[7]
; CHECK-GI-FP16-NEXT: mov v18.d[1], v24.d[0]
; CHECK-GI-FP16-NEXT: mov v2.d[1], v25.d[0]
; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d21, h21
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-FP16-NEXT: fcvt d16, h16
+; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-GI-FP16-NEXT: fcvt d21, h21
; CHECK-GI-FP16-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-GI-FP16-NEXT: fcvt d7, h7
; CHECK-GI-FP16-NEXT: fcvt d22, h22
-; CHECK-GI-FP16-NEXT: fcvt d23, h23
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: fcvt d6, h6
+; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: fcvt d23, h23
; CHECK-GI-FP16-NEXT: fcvt d5, h5
; CHECK-GI-FP16-NEXT: fcvtzu v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v21.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzu v20.2d, v20.2d
+; CHECK-GI-FP16-NEXT: mov v3.d[1], v16.d[0]
+; CHECK-GI-FP16-NEXT: fcvtzu v16.2d, v20.2d
; CHECK-GI-FP16-NEXT: stp q0, q4, [x8, #64]
; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v17.2d
; CHECK-GI-FP16-NEXT: fcvtzu v4.2d, v18.2d
-; CHECK-GI-FP16-NEXT: mov v16.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v22.d[1], v23.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v5.d[0]
+; CHECK-GI-FP16-NEXT: mov v21.d[1], v7.d[0]
+; CHECK-GI-FP16-NEXT: mov v22.d[1], v6.d[0]
+; CHECK-GI-FP16-NEXT: mov v23.d[1], v5.d[0]
; CHECK-GI-FP16-NEXT: stp q1, q19, [x8, #96]
; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d
; CHECK-GI-FP16-NEXT: fcvtzu v2.2d, v3.2d
-; CHECK-GI-FP16-NEXT: stp q20, q0, [x8, #128]
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v16.2d
+; CHECK-GI-FP16-NEXT: stp q16, q0, [x8, #128]
+; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v21.2d
; CHECK-GI-FP16-NEXT: fcvtzu v3.2d, v22.2d
; CHECK-GI-FP16-NEXT: stp q4, q1, [x8, #160]
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v6.2d
+; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v23.2d
; CHECK-GI-FP16-NEXT: stp q2, q0, [x8, #192]
; CHECK-GI-FP16-NEXT: stp q3, q1, [x8, #224]
; CHECK-GI-FP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index f2c4e976b8c16..b1b5154a57b4d 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -3521,31 +3521,31 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: extr x8, x21, x27, #28
-; CHECK-NEXT: extr x9, x29, x20, #28
+; CHECK-NEXT: str x24, [x19]
+; CHECK-NEXT: bfi x22, x20, #36, #28
; CHECK-NEXT: stur x28, [x19, #75]
+; CHECK-NEXT: extr x9, x29, x20, #28
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: bfi x22, x20, #36, #28
-; CHECK-NEXT: lsr x11, x29, #28
; CHECK-NEXT: stur x8, [x19, #41]
-; CHECK-NEXT: str x9, [x19, #16]
-; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: stp x22, x9, [x19, #8]
+; CHECK-NEXT: lsr x9, x29, #28
; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: csel x10, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: stp x24, x22, [x19]
-; CHECK-NEXT: stur x10, [x19, #50]
-; CHECK-NEXT: lsr x10, x21, #28
-; CHECK-NEX...
[truncated]
Any concern with fixing this unreasonable behavior?
When a node is picked, we should reset its next cluster candidate to null before releasing its successors/predecessors.