Skip to content

Commit 1baa896

Browse files
committed
[AMDGPU] Optimize LDS DMA soft waitcnt
This patch adds support for optimizing `S_WAITCNT_VMCNT_LDS_DMA_soft` pseudo instructions: a soft wait of this kind is relaxed, and can then be removed, when no LDS DMA operations are outstanding. These optimizations are a precursor to a dependent patch in which the memory legalizer will actually emit these waitcnt pseudos. Adding the waitcnt to the memory model first, without any optimization, would incur too large a performance penalty.
1 parent a10f6c1 commit 1baa896

File tree

4 files changed

+185
-0
lines changed

4 files changed

+185
-0
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1278,6 +1278,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
12781278
if (Opcode == AMDGPU::S_WAITCNT) {
12791279
unsigned IEnc = II.getOperand(0).getImm();
12801280
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1281+
1282+
// These pseudo waitcnt instructions are only needed to synchronize DS
1283+
// operations with direct LDS loads that use vmcnt. We can safely relax
1284+
// them when no outstanding direct LDS loads exist, even if other vmcnt
1285+
// events are pending.
1286+
if (II.getOpcode() == AMDGPU::S_WAITCNT_VMCNT_LDS_DMA_soft) {
1287+
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1288+
AMDGPU::Waitcnt LDSDMAWait;
1289+
ScoreBrackets.determineWait(LOAD_CNT, RegNo, LDSDMAWait);
1290+
if (LDSDMAWait.LoadCnt == ~0u)
1291+
OldWait.LoadCnt = ~0u;
1292+
}
1293+
12811294
if (TrySimplify)
12821295
ScoreBrackets.simplifyWaitcnt(OldWait);
12831296
Wait = Wait.combined(OldWait);

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
10101010
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
10111011
switch (Opcode) {
10121012
case AMDGPU::S_WAITCNT_soft:
1013+
case AMDGPU::S_WAITCNT_VMCNT_LDS_DMA_soft:
10131014
return AMDGPU::S_WAITCNT;
10141015
case AMDGPU::S_WAITCNT_VSCNT_soft:
10151016
return AMDGPU::S_WAITCNT_VSCNT;

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,6 +1608,7 @@ let OtherPredicates = [HasImageInsts] in {
16081608
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
16091609
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16101610
}
1611+
// Soft waitcnt pseudo used to synchronize DS operations with direct LDS loads
// (LDS DMA) that are tracked by vmcnt. SIInstrInfo::getNonSoftWaitcntOpcode maps
// it to S_WAITCNT, and SIInsertWaitcnts drops its vmcnt requirement when no
// direct LDS load is outstanding.
def S_WAITCNT_VMCNT_LDS_DMA_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
16111612

16121613
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
16131614
[(int_amdgcn_s_sethalt timm:$simm16)]>;

llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,3 +117,173 @@ body: |
117117
S_ENDPGM 0
118118
119119
...
120+
121+
# Soft waitcnt should be honored here.
122+
# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait
123+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
124+
# GCN-NEXT: S_WAITCNT 3952
125+
# vmcnt(0)
126+
# GCN-NEXT: S_BARRIER
127+
---
128+
name: buffer_load_dword_lds_ds_read_soft_wait
129+
body: |
130+
bb.0:
131+
$m0 = S_MOV_B32 0
132+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
133+
S_WAITCNT_VMCNT_LDS_DMA_soft 3952
134+
S_BARRIER
135+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
136+
S_ENDPGM 0
137+
138+
...
139+
140+
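# No need for waitcnt: BUFFER_STORE_LDS_DWORD loads from LDS and stores to global memory, so no direct LDS load is outstanding.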
141+
# GCN-LABEL: name: buffer_store_lds_dword_ds_read_soft_wait
142+
# GCN: BUFFER_STORE_LDS_DWORD
143+
# GCN-NEXT: S_BARRIER
144+
---
145+
name: buffer_store_lds_dword_ds_read_soft_wait
146+
body: |
147+
bb.0:
148+
$m0 = S_MOV_B32 0
149+
BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4)
150+
S_WAITCNT_VMCNT_LDS_DMA_soft 3952
151+
S_BARRIER
152+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
153+
S_ENDPGM 0
154+
155+
...
156+
157+
# Soft waitcnt should mean vmcnt(1) before the barrier and vmcnt(0) after.
158+
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait
159+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
160+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
161+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
162+
# GCN-NEXT: S_WAITCNT 3953
163+
# vmcnt(1)
164+
# GCN-NEXT: S_BARRIER
165+
# GCN-NEXT: S_WAITCNT 3952
166+
# vmcnt(0)
167+
# GCN-NEXT: DS_READ_B32_gfx9
168+
---
169+
name: series_of_buffer_load_dword_lds_ds_read_soft_wait
170+
body: |
171+
bb.0:
172+
$m0 = S_MOV_B32 0
173+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
174+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
175+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
176+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
177+
S_BARRIER
178+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
179+
S_ENDPGM 0
180+
181+
...
182+
183+
# No waitcnt before the barrier: with a single outstanding load, the requested vmcnt(1) is already satisfied.
184+
# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait_redundant
185+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
186+
# GCN-NEXT: S_BARRIER
187+
# GCN-NEXT: S_WAITCNT 3952
188+
# vmcnt(0)
189+
# GCN-NEXT: DS_READ_B32_gfx9
190+
---
191+
name: buffer_load_dword_lds_ds_read_soft_wait_redundant
192+
body: |
193+
bb.0:
194+
$m0 = S_MOV_B32 0
195+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
196+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
197+
S_BARRIER
198+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
199+
S_ENDPGM 0
200+
201+
...
202+
203+
# Combine waitcnt.
204+
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat
205+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
206+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
207+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
208+
# GCN-NEXT: S_WAITCNT 3953
209+
# vmcnt(1)
210+
# GCN-NEXT: S_BARRIER
211+
# GCN-NEXT: S_WAITCNT 3952
212+
# vmcnt(0)
213+
# GCN-NEXT: DS_READ_B32_gfx9
214+
---
215+
name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat
216+
body: |
217+
bb.0:
218+
$m0 = S_MOV_B32 0
219+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
220+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
221+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
222+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
223+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
224+
S_BARRIER
225+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
226+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
227+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
228+
S_ENDPGM 0
229+
230+
...
231+
232+
# Merge waitcnt.
233+
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge
234+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
235+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
236+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
237+
# GCN-NEXT: S_WAITCNT 3953
238+
# vmcnt(1)
239+
# GCN-NEXT: S_BARRIER
240+
# GCN-NEXT: S_WAITCNT 3952
241+
# vmcnt(0)
242+
# GCN-NEXT: DS_READ_B32_gfx9
243+
---
244+
name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge
245+
body: |
246+
bb.0:
247+
$m0 = S_MOV_B32 0
248+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
249+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
250+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
251+
S_WAITCNT_VMCNT_LDS_DMA_soft 3954
252+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
253+
S_BARRIER
254+
S_WAITCNT_VMCNT_LDS_DMA_soft 3952
255+
S_WAITCNT_VMCNT_LDS_DMA_soft 3952
256+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
257+
S_ENDPGM 0
258+
259+
...
260+
261+
262+
# Handle the preexisting waitcnt.
263+
# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting
264+
# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN
265+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
266+
# GCN-NEXT: S_WAITCNT 0
267+
# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
268+
# GCN-NEXT: S_BARRIER
269+
# GCN-NEXT: S_WAITCNT 3952
270+
# vmcnt(0)
271+
# GCN-NEXT: DS_READ_B32_gfx9
272+
---
273+
name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting
274+
body: |
275+
bb.0:
276+
$m0 = S_MOV_B32 0
277+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
278+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
279+
S_WAITCNT 0
280+
BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8)
281+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
282+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
283+
S_BARRIER
284+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
285+
S_WAITCNT_VMCNT_LDS_DMA_soft 3953
286+
$vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
287+
S_ENDPGM 0
288+
289+
...

0 commit comments

Comments
 (0)