Skip to content

Commit 4792ee5

Browse files
committed
[MachineLICM] Workaround - apply RegMasks conservatively
Fixes a test failure on AArch64, at the cost of a small regression for AMDGPU which I will investigate. In the meantime, correctness prevails.
1 parent 3ca1744 commit 4792ee5

File tree

3 files changed

+57
-10
lines changed

3 files changed

+57
-10
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -426,28 +426,26 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
426426
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
427427
BitVector &RUs,
428428
const uint32_t *Mask) {
429-
BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
429+
// FIXME: This is overly conservative when applying regmasks from, e.g., calls.
430+
// See `test/CodeGen/AMDGPU/indirect-call.ll` regression.
431+
BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
430432
const unsigned NumRegs = TRI.getNumRegs();
431433
const unsigned MaskWords = (NumRegs + 31) / 32;
432434
for (unsigned K = 0; K < MaskWords; ++K) {
433435
const uint32_t Word = Mask[K];
434-
if (!Word)
435-
continue;
436-
437436
for (unsigned Bit = 0; Bit < 32; ++Bit) {
438437
const unsigned PhysReg = (K * 32) + Bit;
439438
if (PhysReg == NumRegs)
440439
break;
441440

442-
// Check if we have a valid PhysReg that is set in the mask.
443-
if ((Word >> Bit) & 1) {
441+
if (PhysReg && !((Word >> Bit) & 1)) {
444442
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
445-
ClobberedRUs.reset(*RUI);
443+
RUsFromRegsNotInMask.set(*RUI);
446444
}
447445
}
448446
}
449447

450-
RUs |= ClobberedRUs;
448+
RUs |= RUsFromRegsNotInMask;
451449
}
452450

453451
/// Examine the instruction for a potential LICM candidate. Also
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=aarch64-unknown-linux-gnu -run-pass=greedy,machinelicm -verify-machineinstrs -debug -o - %s | FileCheck %s
3+
4+
# FIXME: Running RA is needed otherwise it runs pre-RA LICM.
5+
---
6+
name: test
7+
tracksRegLiveness: true
8+
body: |
9+
; CHECK-LABEL: name: test
10+
; CHECK: bb.0:
11+
; CHECK-NEXT: successors: %bb.1(0x80000000)
12+
; CHECK-NEXT: liveins: $x0, $w1, $x2
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: B %bb.1
15+
; CHECK-NEXT: {{ $}}
16+
; CHECK-NEXT: bb.1:
17+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
18+
; CHECK-NEXT: liveins: $x0, $w1, $x2
19+
; CHECK-NEXT: {{ $}}
20+
; CHECK-NEXT: renamable $q11 = MOVIv4i32 2, 8
21+
; CHECK-NEXT: BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
22+
; CHECK-NEXT: renamable $q10 = MVNIv4i32 4, 0
23+
; CHECK-NEXT: $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
24+
; CHECK-NEXT: Bcc 11, %bb.1, implicit $nzcv
25+
; CHECK-NEXT: B %bb.2
26+
; CHECK-NEXT: {{ $}}
27+
; CHECK-NEXT: bb.2:
28+
; CHECK-NEXT: liveins: $q10, $q11
29+
; CHECK-NEXT: {{ $}}
30+
; CHECK-NEXT: $q0 = COPY $q10
31+
; CHECK-NEXT: $q1 = COPY $q11
32+
bb.0:
33+
liveins: $x0, $w1, $x2
34+
B %bb.1
35+
36+
bb.1:
37+
liveins: $x0, $w1, $x2
38+
renamable $q11 = MOVIv4i32 2, 8
39+
BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
40+
renamable $q10 = MVNIv4i32 4, 0
41+
$xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
42+
Bcc 11, %bb.1, implicit $nzcv
43+
B %bb.2
44+
45+
bb.2:
46+
liveins: $q10, $q11
47+
$q0 = COPY $q10
48+
$q1 = COPY $q11
49+
...

llvm/test/CodeGen/AMDGPU/indirect-call.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
886886
; GCN-NEXT: v_writelane_b32 v40, s62, 30
887887
; GCN-NEXT: v_writelane_b32 v40, s63, 31
888888
; GCN-NEXT: s_mov_b64 s[6:7], exec
889-
; GCN-NEXT: s_movk_i32 s4, 0x7b
890889
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
891890
; GCN-NEXT: v_readfirstlane_b32 s8, v0
892891
; GCN-NEXT: v_readfirstlane_b32 s9, v1
893892
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
894893
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
894+
; GCN-NEXT: s_movk_i32 s4, 0x7b
895895
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
896896
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
897897
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
980980
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
981981
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
982982
; GISEL-NEXT: s_mov_b64 s[6:7], exec
983-
; GISEL-NEXT: s_movk_i32 s4, 0x7b
984983
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
985984
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
986985
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
987986
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
988987
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
988+
; GISEL-NEXT: s_movk_i32 s4, 0x7b
989989
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
990990
; GISEL-NEXT: ; implicit-def: $vgpr0
991991
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]

0 commit comments

Comments
 (0)