Skip to content

Commit f8da46e

Browse files
committed
[MachineLICM] Work-around Incomplete RegUnits
Fixes a miscompile on AArch64, at the cost of a small regression on AMDGPU. #96146 opened to investigate the issue.
1 parent b9ad0b6 commit f8da46e

File tree

3 files changed

+85
-10
lines changed

3 files changed

+85
-10
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -426,28 +426,54 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
426426
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
427427
BitVector &RUs,
428428
const uint32_t *Mask) {
429-
BitVector ClobberedRUs(TRI.getNumRegUnits(), true);
429+
// FIXME: This intentionally works in reverse due to some issues with the
430+
// Register Units infrastructure.
431+
//
432+
// This is used to apply callee-saved-register masks to the clobbered regunits
433+
// mask.
434+
//
435+
// The right way to approach this is to start with a BitVector full of ones,
436+
// then reset all the bits of the regunits of each register that is set in the
437+
// mask (registers preserved), then OR the resulting bits with the Clobbers
438+
// mask. This correctly prioritizes the saved registers, so if a RU is shared
439+
// between a register that is preserved, and one that is NOT preserved, that
440+
// RU will not be set in the output vector (the clobbers).
441+
//
442+
// What we have to do for now is the opposite: we have to assume that the
443+
// regunits of all registers that are NOT preserved are clobbered, even if
444+
// those regunits are preserved by another register. So if a RU is shared
445+
// like described previously, that RU will be set.
446+
//
447+
// This is to work around an issue which appears in AArch64, but isn't
448+
// exclusive to that target: AArch64's Qn registers (128 bits) have Dn
449+
// register (lower 64 bits). A few Dn registers are preserved by some calling
450+
// conventions, but Qn and Dn share exactly the same reg units.
451+
//
452+
// If we do this the right way, Qn will be marked as NOT clobbered even though
453+
// its upper 64 bits are NOT preserved. The conservative approach handles this
454+
// correctly at the cost of some missed optimizations on other targets.
455+
//
456+
// This is caused by how RegUnits are handled within TableGen. Ideally, Qn
457+
// should have an extra RegUnit to model the "unknown" bits not covered by the
458+
// subregs.
459+
BitVector RUsFromRegsNotInMask(TRI.getNumRegUnits());
430460
const unsigned NumRegs = TRI.getNumRegs();
431461
const unsigned MaskWords = (NumRegs + 31) / 32;
432462
for (unsigned K = 0; K < MaskWords; ++K) {
433463
const uint32_t Word = Mask[K];
434-
if (!Word)
435-
continue;
436-
437464
for (unsigned Bit = 0; Bit < 32; ++Bit) {
438465
const unsigned PhysReg = (K * 32) + Bit;
439466
if (PhysReg == NumRegs)
440467
break;
441468

442-
// Check if we have a valid PhysReg that is set in the mask.
443-
if ((Word >> Bit) & 1) {
469+
if (PhysReg && !((Word >> Bit) & 1)) {
444470
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
445-
ClobberedRUs.reset(*RUI);
471+
RUsFromRegsNotInMask.set(*RUI);
446472
}
447473
}
448474
}
449475

450-
RUs |= ClobberedRUs;
476+
RUs |= RUsFromRegsNotInMask;
451477
}
452478

453479
/// Examine the instruction for potentai LICM candidate. Also
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=aarch64-unknown-linux-gnu -run-pass=greedy,machinelicm -verify-machineinstrs -debug -o - %s | FileCheck %s
3+
4+
# FIXME: Running RA is needed otherwise it runs pre-RA LICM.
5+
---
6+
name: test
7+
tracksRegLiveness: true
8+
body: |
9+
; CHECK-LABEL: name: test
10+
; CHECK: bb.0:
11+
; CHECK-NEXT: successors: %bb.1(0x80000000)
12+
; CHECK-NEXT: liveins: $x0, $w1, $x2
13+
; CHECK-NEXT: {{ $}}
14+
; CHECK-NEXT: B %bb.1
15+
; CHECK-NEXT: {{ $}}
16+
; CHECK-NEXT: bb.1:
17+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
18+
; CHECK-NEXT: liveins: $x0, $w1, $x2
19+
; CHECK-NEXT: {{ $}}
20+
; CHECK-NEXT: renamable $q11 = MOVIv4i32 2, 8
21+
; CHECK-NEXT: BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
22+
; CHECK-NEXT: renamable $q10 = MVNIv4i32 4, 0
23+
; CHECK-NEXT: $xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
24+
; CHECK-NEXT: Bcc 11, %bb.1, implicit $nzcv
25+
; CHECK-NEXT: B %bb.2
26+
; CHECK-NEXT: {{ $}}
27+
; CHECK-NEXT: bb.2:
28+
; CHECK-NEXT: liveins: $q10, $q11
29+
; CHECK-NEXT: {{ $}}
30+
; CHECK-NEXT: $q0 = COPY $q10
31+
; CHECK-NEXT: $q1 = COPY $q11
32+
bb.0:
33+
liveins: $x0, $w1, $x2
34+
B %bb.1
35+
36+
bb.1:
37+
liveins: $x0, $w1, $x2
38+
renamable $q11 = MOVIv4i32 2, 8
39+
BL &memset, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $w1, implicit $x2, implicit-def $sp, implicit-def $x0
40+
renamable $q10 = MVNIv4i32 4, 0
41+
$xzr = SUBSXri $x0, 1, 0, implicit-def $nzcv
42+
Bcc 11, %bb.1, implicit $nzcv
43+
B %bb.2
44+
45+
bb.2:
46+
liveins: $q10, $q11
47+
$q0 = COPY $q10
48+
$q1 = COPY $q11
49+
...

llvm/test/CodeGen/AMDGPU/indirect-call.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
886886
; GCN-NEXT: v_writelane_b32 v40, s62, 30
887887
; GCN-NEXT: v_writelane_b32 v40, s63, 31
888888
; GCN-NEXT: s_mov_b64 s[6:7], exec
889-
; GCN-NEXT: s_movk_i32 s4, 0x7b
890889
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
891890
; GCN-NEXT: v_readfirstlane_b32 s8, v0
892891
; GCN-NEXT: v_readfirstlane_b32 s9, v1
893892
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
894893
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
894+
; GCN-NEXT: s_movk_i32 s4, 0x7b
895895
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
896896
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
897897
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
980980
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
981981
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
982982
; GISEL-NEXT: s_mov_b64 s[6:7], exec
983-
; GISEL-NEXT: s_movk_i32 s4, 0x7b
984983
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
985984
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
986985
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
987986
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
988987
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
988+
; GISEL-NEXT: s_movk_i32 s4, 0x7b
989989
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
990990
; GISEL-NEXT: ; implicit-def: $vgpr0
991991
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]

0 commit comments

Comments
 (0)