Skip to content

Commit ff0ef6a

Browse files
committed
[ARM][LowOverheadLoops] Make some stack spills valid for tail predication
This patch makes vector spills valid for tail predication when all loads from the same stack slot are within the loop. Differential Revision: https://reviews.llvm.org/D105443
1 parent de39566 commit ff0ef6a

File tree

2 files changed

+246
-2
lines changed

2 files changed

+246
-2
lines changed

llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,7 +1084,85 @@ bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
10841084
return true;
10851085
}
10861086

1087-
bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
1087+
/// Decide whether the store \p MI may stay unpredicated when the loop \p ML
/// is converted to a tail-predicated loop.
///
/// Only an MVE_VSTRWU32 addressed off SP, carrying exactly one memory operand
/// that refers to a spill-slot frame index, is considered. Such a store is
/// accepted when no MVE_VLDRWU32 from the same frame index can be reached from
/// the loop's exit blocks without first passing an MVE_VSTRWU32 to that slot.
///
/// \returns true if the store is acceptable, false otherwise.
static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {

  // Frame index referenced by a fixed-stack memory operand, or -1 when the
  // operand does not refer to a fixed stack object.
  auto GetFrameIndex = [](MachineMemOperand *Operand) {
    const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
    if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
      if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
        return FS->getFrameIndex();
      }
    }
    return -1;
  };

  // True for an MVE_VSTRWU32 / MVE_VLDRWU32 whose base operand is SP and whose
  // single memory operand resolves to a non-negative frame index.
  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
    switch (I->getOpcode()) {
    case ARM::MVE_VSTRWU32:
    case ARM::MVE_VLDRWU32: {
      return I->getOperand(1).getReg() == ARM::SP &&
             I->memoperands().size() == 1 &&
             GetFrameIndex(I->memoperands().front()) >= 0;
    }
    default:
      return false;
    }
  };

  // An unpredicated vector register spill is allowed if all of the uses of the
  // stack slot are within the loop
  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
    return false;

  // IsStackOp has already established that MI has exactly one memory operand,
  // so front() is safe here.
  const int FI = GetFrameIndex(MI->memoperands().front());

  // Bind by reference: only a query is needed, so the frame info must not be
  // copied.
  const MachineFrameInfo &FrameInfo =
      MI->getParent()->getParent()->getFrameInfo();
  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
    return false;

  // Search all blocks after the loop for accesses to the same stack slot.
  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
  // live-out (which sp never is) to know what blocks to look in
  SmallVector<MachineBasicBlock *> Frontier;
  ML->getExitBlocks(Frontier);

  // Blocks that must not be (re)added to the worklist: the block holding the
  // store itself plus everything that has already been enqueued.
  SmallPtrSet<MachineBasicBlock *, 4> Enqueued{MI->getParent()};
  Enqueued.insert(Frontier.begin(), Frontier.end());

  // Frontier grows while it is being walked, so iterate by index.
  for (unsigned Idx = 0; Idx < Frontier.size(); ++Idx) {
    MachineBasicBlock *BB = Frontier[Idx];
    bool LookAtSuccessors = true;
    for (MachineInstr &I : *BB) {
      // Skip anything that is not a stack vector load/store of this slot.
      if (!IsStackOp(&I) || GetFrameIndex(I.memoperands().front()) != FI)
        continue;
      // If this block has a store to the stack slot before any loads then we
      // can ignore the block
      if (I.getOpcode() == ARM::MVE_VSTRWU32) {
        LookAtSuccessors = false;
        break;
      }
      // If the store and the load are using the same stack slot then the
      // store isn't valid for tail predication
      if (I.getOpcode() == ARM::MVE_VLDRWU32)
        return false;
    }

    if (!LookAtSuccessors)
      continue;

    for (MachineBasicBlock *Succ : BB->successors()) {
      if (Enqueued.insert(Succ).second)
        Frontier.push_back(Succ);
    }
  }

  return true;
}
1164+
1165+
bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
10881166
if (CannotTailPredicate)
10891167
return false;
10901168

@@ -1140,7 +1218,7 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
11401218

11411219
// If the instruction is already explicitly predicated, then the conversion
11421220
// will be fine, but ensure that all store operations are predicated.
1143-
if (MI->mayStore())
1221+
if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
11441222
return IsUse;
11451223

11461224
// If this instruction defines the VPR, update the predicate for the
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
3+
--- |
4+
define hidden void @vector_spill_in_loop() {
5+
entry:
6+
ret void
7+
}
8+
9+
define hidden void @vector_spill_load_outside() {
10+
entry:
11+
ret void
12+
}
13+
...
14+
---
15+
name: vector_spill_in_loop
16+
tracksRegLiveness: true
17+
stack:
18+
- { id: 0, name: '', type: spill-slot, offset: -120, size: 16, alignment: 8,
19+
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
20+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
21+
body: |
22+
; CHECK-LABEL: name: vector_spill_in_loop
23+
; CHECK: bb.0:
24+
; CHECK: successors: %bb.1(0x80000000)
25+
; CHECK: liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
26+
; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
27+
; CHECK: $r0 = tMOVr $r12, 14 /* CC::al */, $noreg
28+
; CHECK: $lr = MVE_DLSTP_16 renamable $r3
29+
; CHECK: bb.1:
30+
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
31+
; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r10, $r11, $r12
32+
; CHECK: renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 0, $noreg
33+
; CHECK: renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 0, $noreg
34+
; CHECK: renamable $q5 = MVE_VSHR_immu16 killed renamable $q3, 11, 0, $noreg, undef renamable $q5
35+
; CHECK: MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
36+
; CHECK: dead renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
37+
; CHECK: dead renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 0, killed $noreg
38+
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
39+
bb.0:
40+
successors: %bb.1(0x80000000)
41+
liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
42+
43+
$r6 = tMOVr $r2, 14 /* CC::al */, $noreg
44+
$r0 = tMOVr $r12, 14 /* CC::al */, $noreg
45+
$r9 = tMOVr $r3, 14 /* CC::al */, $noreg
46+
renamable $lr = t2DoLoopStartTP renamable $r1, renamable $r3
47+
48+
bb.1:
49+
successors: %bb.1(0x7c000000), %bb.2(0x04000000)
50+
liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12
51+
52+
renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg
53+
MVE_VPST 8, implicit $vpr
54+
renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr
55+
renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr
56+
MVE_VPST 2, implicit $vpr
57+
renamable $q5 = MVE_VSHR_immu16 renamable $q3, 11, 1, renamable $vpr, undef renamable $q5
58+
renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg
59+
MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
60+
MVE_VPST 8, implicit $vpr
61+
renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
62+
MVE_VPST 1, implicit $vpr
63+
renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr
64+
renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr
65+
tB %bb.2, 14 /* CC::al */, $noreg
66+
67+
bb.2:
68+
successors: %bb.3(0x04000000), %bb.0(0x7c000000)
69+
liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
70+
71+
renamable $r0 = tLDRspi $sp, 1, 14 /* CC::al */, $noreg
72+
renamable $r10 = nuw t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
73+
renamable $r12 = t2ADDrs killed renamable $r12, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
74+
renamable $r0 = tLDRspi $sp, 3, 14 /* CC::al */, $noreg
75+
renamable $r2 = t2ADDrs killed renamable $r2, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
76+
renamable $r0 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg
77+
tCMPhir renamable $r10, killed renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
78+
tBcc %bb.0, 1 /* CC::ne */, killed $cpsr
79+
80+
bb.3:
81+
$sp = frame-destroy tADDspi $sp, 24, 14 /* CC::al */, $noreg
82+
$sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9, def $d10, def $d11, def $d12, def $d13, def $d14, def $d15
83+
$sp = frame-destroy tADDspi $sp, 1, 14 /* CC::al */, $noreg
84+
$sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc
85+
...
86+
---
87+
name: vector_spill_load_outside
88+
stack:
89+
- { id: 0, name: '', type: spill-slot, offset: -120, size: 16, alignment: 8,
90+
stack-id: default, callee-saved-register: '', callee-saved-restored: true,
91+
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
92+
tracksRegLiveness: true
93+
body: |
94+
; CHECK-LABEL: name: vector_spill_load_outside
95+
; CHECK: bb.0:
96+
; CHECK: successors: %bb.1(0x80000000)
97+
; CHECK: liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
98+
; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
99+
; CHECK: $r0 = tMOVr $r12, 14 /* CC::al */, $noreg
100+
; CHECK: $r9 = tMOVr $r3, 14 /* CC::al */, $noreg
101+
; CHECK: $lr = t2DLS renamable $r1
102+
; CHECK: bb.1:
103+
; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
104+
; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12
105+
; CHECK: renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg
106+
; CHECK: MVE_VPST 8, implicit $vpr
107+
; CHECK: renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr
108+
; CHECK: renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr
109+
; CHECK: MVE_VPST 2, implicit $vpr
110+
; CHECK: renamable $q5 = MVE_VSHR_immu16 killed renamable $q3, 11, 1, renamable $vpr, undef renamable $q5
111+
; CHECK: renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg
112+
; CHECK: MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
113+
; CHECK: MVE_VPST 8, implicit $vpr
114+
; CHECK: dead renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
115+
; CHECK: MVE_VPST 1, implicit $vpr
116+
; CHECK: dead renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr
117+
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1
118+
bb.0:
119+
successors: %bb.1(0x80000000)
120+
liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
121+
122+
$r6 = tMOVr $r2, 14 /* CC::al */, $noreg
123+
$r0 = tMOVr $r12, 14 /* CC::al */, $noreg
124+
$r9 = tMOVr $r3, 14 /* CC::al */, $noreg
125+
renamable $lr = t2DoLoopStartTP renamable $r1, renamable $r3
126+
127+
bb.1:
128+
successors: %bb.1(0x7c000000), %bb.2(0x04000000)
129+
liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12
130+
131+
renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg
132+
MVE_VPST 8, implicit $vpr
133+
renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr
134+
renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr
135+
MVE_VPST 2, implicit $vpr
136+
renamable $q5 = MVE_VSHR_immu16 renamable $q3, 11, 1, renamable $vpr, undef renamable $q5
137+
renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg
138+
MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
139+
MVE_VPST 8, implicit $vpr
140+
renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
141+
MVE_VPST 1, implicit $vpr
142+
renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr
143+
renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr
144+
tB %bb.2, 14 /* CC::al */, $noreg
145+
146+
bb.2:
147+
successors: %bb.3(0x04000000), %bb.0(0x7c000000)
148+
liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
149+
150+
renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
151+
renamable $r0 = tLDRspi $sp, 1, 14 /* CC::al */, $noreg
152+
renamable $r10 = nuw t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
153+
renamable $r12 = t2ADDrs killed renamable $r12, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
154+
renamable $r0 = tLDRspi $sp, 3, 14 /* CC::al */, $noreg
155+
renamable $r2 = t2ADDrs killed renamable $r2, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
156+
renamable $r0 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg
157+
tCMPhir renamable $r10, killed renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
158+
tBcc %bb.0, 1 /* CC::ne */, killed $cpsr
159+
160+
bb.3:
161+
$sp = frame-destroy tADDspi $sp, 24, 14 /* CC::al */, $noreg
162+
$sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9, def $d10, def $d11, def $d12, def $d13, def $d14, def $d15
163+
$sp = frame-destroy tADDspi $sp, 1, 14 /* CC::al */, $noreg
164+
$sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc
165+
166+
...

0 commit comments

Comments
 (0)