Skip to content

Commit 64d7853

Browse files
AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize
1 parent 6144d82 commit 64d7853

File tree

3 files changed

+125
-100
lines changed

3 files changed

+125
-100
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 104 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "GCNSubtarget.h"
2424
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
2525
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
26+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
2627
#include "llvm/CodeGen/MachineFunctionPass.h"
2728
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2829
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -137,7 +138,109 @@ class AMDGPURegBankLegalizeCombiner {
137138
return {MatchMI, MatchMI->getOperand(1).getReg()};
138139
}
139140

141+
/// Match the pattern
///   ..., RALSrc, ... = G_UNMERGE_VALUES ...
///   Src = G_AMDGPU_READANYLANE RALSrc
///
/// \param Src register whose defining instruction is inspected.
/// \returns the unmerge found for the read-any-lane input together with the
/// def operand index of that input within the unmerge (as reported by
/// findRegisterDefOperandIdx), or {nullptr, -1} when Src is not defined by
/// G_AMDGPU_READANYLANE or no unmerge is found for its input.
std::pair<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src) {
  const std::pair<GUnmerge *, int> NoMatch{nullptr, -1};

  MachineInstr *SrcDef = MRI.getVRegDef(Src);
  if (SrcDef->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
    return NoMatch;

  // Operand 1 of the read-any-lane is the value being read.
  Register LaneInput = SrcDef->getOperand(1).getReg();
  GUnmerge *Producer = getOpcodeDef<GUnmerge>(LaneInput, MRI);
  if (!Producer)
    return NoMatch;

  return {Producer, Producer->findRegisterDefOperandIdx(LaneInput, nullptr)};
}
150+
151+
/// Find the register that was fed into the G_AMDGPU_READANYLANE instruction(s)
/// that ultimately produced \p Src.
///
/// Three shapes are recognised:
///  1. Src is defined directly by a read-any-lane.
///  2. Src is a merge of read-any-lanes of every piece, in order, of a single
///     unmerge; the unmerge source is returned.
///  3. Src is one piece of an unmerge of a merge whose corresponding source is
///     a read-any-lane; that read-any-lane's input is returned.
///
/// \returns the matched source register, or a default-constructed (invalid)
/// Register when none of the patterns match.
Register getReadAnyLaneSrc(Register Src) {
  // Src = G_AMDGPU_READANYLANE RALSrc
  auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
  if (RAL)
    return RALSrc;

  // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
  // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
  // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
  // Src = G_MERGE_VALUES LoSgpr, HiSgpr
  if (auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI)) {
    unsigned NumElts = Merge->getNumSources();
    auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
    if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
      return {};

    // Check that all elements come from the same unmerge and that there is no
    // shuffling: merge source i must be unmerge def i.
    for (unsigned i = 1; i < NumElts; ++i) {
      auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
      if (UnmergeI != Unmerge || (unsigned)IdxI != i)
        return {};
    }
    return Unmerge->getSourceReg();
  }

  // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
  // SgprI = G_AMDGPU_READANYLANE VgprI
  // SgprLarge = G_MERGE_VALUES ..., SgprI, ...
  // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
  auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
  if (!UnMerge)
    return {};

  // findRegisterDefOperandIdx reports -1 when Src is not itself a def operand
  // of UnMerge; there is nothing to index with in that case.
  int Idx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
  if (Idx < 0)
    return {};

  // The unmerge def index is only a meaningful merge source index when the
  // unmerge splits the value into exactly as many pieces as the merge
  // combined. Otherwise Idx may select the wrong piece or run past the
  // merge's operands.
  auto *SrcMerge = getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
  if (!SrcMerge || UnMerge->getNumDefs() != SrcMerge->getNumSources())
    return {};

  auto [RALEl, RALElSrc] =
      tryMatch(SrcMerge->getSourceReg(Idx), AMDGPU::G_AMDGPU_READANYLANE);
  if (RALEl)
    return RALElSrc;

  return {};
}
195+
196+
/// Make \p Dst take its value from \p Src.
///
/// A virtual Dst has all of its uses rewritten to Src through
/// MRI.replaceRegWith. A non-virtual Dst cannot be renamed, so a COPY from
/// Src into Dst is emitted with the builder B at its current insertion point.
void replaceRegWithOrBuildCopy(Register Dst, Register Src) {
  if (!Dst.isVirtual()) {
    B.buildCopy(Dst, Src);
    return;
  }
  MRI.replaceRegWith(Dst, Src);
}
202+
203+
/// Try to fold away a read-any-lane (optionally followed by a G_BITCAST) that
/// feeds \p Copy, so that the copy destination uses the read-any-lane source
/// directly.
///
/// \param Copy instruction whose operand 0 is the destination and operand 1
/// the source register.
/// \returns true if the combine was applied and \p Copy was erased, false if
/// nothing was changed.
bool tryEliminateReadAnyLane(MachineInstr &Copy) {
  Register Dst = Copy.getOperand(0).getReg();
  Register Src = Copy.getOperand(1).getReg();

  // The value found by getReadAnyLaneSrc is the input of a read-any-lane,
  // which the patterns below treat as a VGPR. Forwarding it is only valid when
  // the destination is a VGPR as well; skip every other destination.
  if (Dst.isVirtual()) {
    if (MRI.getRegBankOrNull(Dst) != VgprRB)
      return false;
  } else {
    const auto *RegInfo =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (!RegInfo->isVGPR(MRI, Dst))
      return false;
  }

  // Physical sources have no defining instruction to look through.
  if (!Src.isVirtual())
    return false;

  // Look through a single G_BITCAST between the read-any-lane result and the
  // copy source.
  Register RALDst = Src;
  MachineInstr &SrcMI = *MRI.getVRegDef(Src);
  const bool ThroughBitcast = SrcMI.getOpcode() == AMDGPU::G_BITCAST;
  if (ThroughBitcast)
    RALDst = SrcMI.getOperand(1).getReg();

  Register RALSrc = getReadAnyLaneSrc(RALDst);
  if (!RALSrc)
    return false;

  B.setInstr(Copy);
  if (!ThroughBitcast) {
    // Src = READANYLANE RALSrc     Src = READANYLANE RALSrc
    // Dst = Copy Src               $Dst = Copy Src
    // ->                           ->
    // Dst = RALSrc                 $Dst = Copy RALSrc
    replaceRegWithOrBuildCopy(Dst, RALSrc);
  } else {
    // RALDst = READANYLANE RALSrc  RALDst = READANYLANE RALSrc
    // Src = G_BITCAST RALDst       Src = G_BITCAST RALDst
    // Dst = Copy Src               $Dst = Copy Src
    // ->                           ->
    // NewVgpr = G_BITCAST RALSrc   NewVgpr = G_BITCAST RALSrc
    // Dst = NewVgpr                $Dst = Copy NewVgpr
    auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
    replaceRegWithOrBuildCopy(Dst, Bitcast.getReg(0));
  }

  eraseInstr(Copy, MRI, nullptr);
  return true;
}
239+
140240
void tryCombineCopy(MachineInstr &MI) {
241+
if (tryEliminateReadAnyLane(MI))
242+
return;
243+
141244
Register Dst = MI.getOperand(0).getReg();
142245
Register Src = MI.getOperand(1).getReg();
143246
// Skip copies of physical registers.
@@ -160,24 +263,7 @@ class AMDGPURegBankLegalizeCombiner {
160263
auto One = B.buildConstant({SgprRB, S32}, 1);
161264
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
162265
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
163-
cleanUpAfterCombine(MI, Trunc);
164-
return;
165-
}
166-
167-
// Src = G_AMDGPU_READANYLANE RALSrc
168-
// Dst = COPY Src
169-
// ->
170-
// Dst = RALSrc
171-
if (MRI.getRegBankOrNull(Dst) == VgprRB &&
172-
MRI.getRegBankOrNull(Src) == SgprRB) {
173-
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
174-
if (!RAL)
175-
return;
176-
177-
assert(MRI.getRegBank(RALSrc) == VgprRB);
178-
MRI.replaceRegWith(Dst, RALSrc);
179-
cleanUpAfterCombine(MI, RAL);
180-
return;
266+
eraseInstr(MI, MRI, nullptr);
181267
}
182268
}
183269

llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ define amdgpu_ps float @readanylane_to_physical_vgpr(ptr addrspace(1) inreg %ptr
2020
; CHECK-NEXT: v_mov_b32_e32 v0, 0
2121
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
2222
; CHECK-NEXT: s_waitcnt vmcnt(0)
23-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
24-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
2523
; CHECK-NEXT: ; return to shader part epilog
2624
%load = load volatile float, ptr addrspace(1) %ptr
2725
ret float %load
@@ -33,8 +31,6 @@ define amdgpu_ps void @readanylane_to_bitcast_to_virtual_vgpr(ptr addrspace(1) i
3331
; CHECK-NEXT: v_mov_b32_e32 v0, 0
3432
; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
3533
; CHECK-NEXT: s_waitcnt vmcnt(0)
36-
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
37-
; CHECK-NEXT: v_mov_b32_e32 v1, s0
3834
; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
3935
; CHECK-NEXT: s_endpgm
4036
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
@@ -49,8 +45,6 @@ define amdgpu_ps float @readanylane_to_bitcast_to_physical_vgpr(ptr addrspace(1)
4945
; CHECK-NEXT: v_mov_b32_e32 v0, 0
5046
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
5147
; CHECK-NEXT: s_waitcnt vmcnt(0)
52-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
53-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
5448
; CHECK-NEXT: ; return to shader part epilog
5549
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
5650
%bitcast = bitcast <2 x i16> %load to float
@@ -63,10 +57,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_to_virtual_vgpr(ptr addrspace(1
6357
; CHECK-NEXT: v_mov_b32_e32 v2, 0
6458
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
6559
; CHECK-NEXT: s_waitcnt vmcnt(0)
66-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
67-
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
68-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
69-
; CHECK-NEXT: v_mov_b32_e32 v1, s1
7060
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
7161
; CHECK-NEXT: s_endpgm
7262
%load = load volatile i64, ptr addrspace(1) %ptr0
@@ -85,10 +75,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_bitcast_to_virtual_vgpr(ptr add
8575
; CHECK-NEXT: v_mov_b32_e32 v2, 0
8676
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
8777
; CHECK-NEXT: s_waitcnt vmcnt(0)
88-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
89-
; CHECK-NEXT: v_readfirstlane_b32 s1, v1
90-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
91-
; CHECK-NEXT: v_mov_b32_e32 v1, s1
9278
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
9379
; CHECK-NEXT: s_endpgm
9480
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
@@ -109,9 +95,7 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_to_virtual_vgpr(ptr add
10995
; CHECK-NEXT: v_mov_b32_e32 v2, 0
11096
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
11197
; CHECK-NEXT: s_waitcnt vmcnt(0)
112-
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
113-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
114-
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
98+
; CHECK-NEXT: global_store_dword v2, v1, s[2:3]
11599
; CHECK-NEXT: s_endpgm
116100
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
117101
%extracted = extractelement <2 x i32> %load, i32 1
@@ -125,8 +109,7 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_to_physical_vgpr(ptr a
125109
; CHECK-NEXT: v_mov_b32_e32 v0, 0
126110
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
127111
; CHECK-NEXT: s_waitcnt vmcnt(0)
128-
; CHECK-NEXT: v_readfirstlane_b32 s0, v1
129-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
112+
; CHECK-NEXT: v_mov_b32_e32 v0, v1
130113
; CHECK-NEXT: ; return to shader part epilog
131114
%load = load volatile <2 x float>, ptr addrspace(1) %ptr0
132115
%extracted = extractelement <2 x float> %load, i32 1
@@ -139,8 +122,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
139122
; CHECK-NEXT: v_mov_b32_e32 v2, 0
140123
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
141124
; CHECK-NEXT: s_waitcnt vmcnt(0)
142-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
143-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
144125
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
145126
; CHECK-NEXT: s_endpgm
146127
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
@@ -156,8 +137,6 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_bitcast_to_physical_vg
156137
; CHECK-NEXT: v_mov_b32_e32 v0, 0
157138
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
158139
; CHECK-NEXT: s_waitcnt vmcnt(0)
159-
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
160-
; CHECK-NEXT: v_mov_b32_e32 v0, s0
161140
; CHECK-NEXT: ; return to shader part epilog
162141
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
163142
%extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>

0 commit comments

Comments
 (0)