@@ -5,11 +5,22 @@ target triple = "amdgcn-amd-amdhsa"

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:

- ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
- ; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
- ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
+ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+ ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+ ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+ ; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+ ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+ ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; HSA-DAG: ds_write_b32 [[PTR]], [[K]]
+ ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+
+ ; GFX9: s_cmp_lg_u32 [[PTR]], -1
+ ; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+ ; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+ ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]

; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
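
Taken together, the new GFX9 kernel-path checks describe a purely scalar lowering: load the 32-bit group pointer, materialize the high half of the shared aperture from src_shared_base, and select null via SCC-based cselects. A sketch of such a sequence, with register numbers and the final SGPR-to-VGPR copies illustrative rather than taken from real compiler output:

s_load_dword s2, s[4:5], 0x0       ; 32-bit group pointer from the kernel args
s_mov_b64 s[0:1], src_shared_base  ; s1 = high half of the shared aperture
s_cmp_lg_u32 s2, -1                ; compare against the group null sentinel
s_cselect_b32 s0, s1, 0            ; high half: aperture if non-null, else 0
s_cselect_b32 s1, s2, 0            ; low half: pointer if non-null, else 0
v_mov_b32_e32 v0, s1               ; flat_store needs the address in VGPRs
v_mov_b32_e32 v1, s0
v_mov_b32_e32 v2, 7
flat_store_dword v[0:1], v2
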
@@ -28,8 +39,22 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr

; Test handling inside a non-kernel
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
+ ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
+ ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+ ; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+ ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+ ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
+
+ ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; HSA-DAG: ds_write_b32 v0, [[K]]
+
+ ; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]]
+ ; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+ ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
+ ; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc
+
+ ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
  store volatile i32 7, ptr %stof
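
In a non-kernel function the incoming pointer lives in v0 and may be divergent, so the null test becomes a per-lane VALU compare and the select a v_cndmask pair, exactly as the checks spell out. A GFX9-flavored sketch (register numbers illustrative):

s_mov_b64 s[4:5], src_shared_base  ; aperture high half is still uniform
v_mov_b32_e32 v2, s5               ; copy to a VGPR for the lane-wise select
v_cmp_ne_u32_e32 vcc, -1, v0       ; per-lane test against the null sentinel
v_cndmask_b32_e32 v1, 0, v2, vcc   ; high half: aperture, or 0 on null lanes
v_cndmask_b32_e32 v0, 0, v0, vcc   ; low half: incoming pointer, or 0
v_mov_b32_e32 v2, 7
flat_store_dword v[0:1], v2
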
@@ -38,16 +63,23 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:

- ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
- ; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
- ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
- ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
- ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
- ; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
- ; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
- ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
- ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+ ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+ ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+ ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+ ; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+ ; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+ ; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+ ; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+ ; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
+
+ ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+ ; GFX9: s_cmp_lg_u32 [[PTR]], -1
+ ; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+ ; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+ ; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]

; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
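
The private-pointer variant follows the same shape as the group case above; what changes is the aperture source. A two-line sketch of the GFX9 difference (register numbers illustrative):

s_load_dword s2, s[4:5], 0x0        ; 32-bit private pointer from kernel args
s_mov_b64 s[0:1], src_private_base  ; high half of the private aperture

On CI the s_load_dword offset moves from 0x10 to 0x11, i.e. the next dword of the queue descriptor, which on our reading holds the private rather than the group aperture high half.
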
@@ -65,12 +97,10 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
- ; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
- ; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+ ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+ ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
- ; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
- ; GFX9-DAG: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0
- ; GFX9: global_store_dword [[ADDR]], [[K]], s[[[PTRLO]]:[[PTRHI]]]
+ ; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]

; HSA: .amdhsa_user_sgpr_queue_ptr 0
define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #0 {
@@ -82,7 +112,9 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
- ; HSA-DAG: s_load_dword s0, s[[[PTRLO]]:[[PTRHI]]], 0x0
+ ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+ ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+ ; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #0 {
  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
  %ld = load volatile i32, ptr %stof
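
Casts from the global and constant address spaces are address-preserving, so no aperture read or null check is involved; the updated checks in both of the hunks above just expect the 64-bit pointer in a VGPR pair followed by a flat access. For the constant case, roughly (offsets and register numbers illustrative):

s_load_dwordx2 s[0:1], s[6:7], 0x0  ; 64-bit pointer kernel argument
v_mov_b32_e32 v0, s0                ; flat instructions take a VGPR address
v_mov_b32_e32 v1, s1
flat_load_dword v0, v[0:1]          ; the volatile load through the flat pointer
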
@@ -183,9 +215,11 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 {
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+ ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA: ds_write_b32 v[[LO]], v[[K]]
+ ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast ptr addrspace(3) null to ptr
  store volatile i32 7, ptr %cast
@@ -203,9 +237,10 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+ ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1
- ; HSA: ds_write_b32 v[[LO]], v[[K]]
+ ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+ ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast ptr addrspace(3) inttoptr (i32 -1 to ptr addrspace(3)) to ptr
  store volatile i32 7, ptr %cast
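
Both constant-input casts collapse to the flat null pointer: an IR null folds directly, and all-ones is precisely the sentinel the s_cmp_lg_u32 [[PTR]], -1 pattern above tests, so applying the same select rule to it yields zero in both halves. The same reasoning carries over to the private null casts in the hunks below. What the checks now expect, in full:

v_mov_b32_e32 v0, 0   ; low half of the flat null pointer
v_mov_b32_e32 v1, 0   ; high half
v_mov_b32_e32 v2, 7
flat_store_dword v[0:1], v2
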
@@ -224,13 +259,10 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
- ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
- ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
- ; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
- ; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
- ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+ ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+ ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
+ ; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast ptr addrspace(5) null to ptr
  store volatile i32 7, ptr %cast
@@ -249,14 +281,10 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {

; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:

- ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
- ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
- ; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
- ; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
- ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
- ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+ ; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
- ; HSA: buffer_store_dword v[[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+ ; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+ ; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]

; CI: .amdhsa_user_sgpr_queue_ptr 1
; GFX9: .amdhsa_user_sgpr_queue_ptr 0
@@ -306,18 +334,16 @@ end:

; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
+ ; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
- ; HSA: buffer_store_dword
- ; HSA: s_barrier
- ; HSA: buffer_load_dword [[K:v[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen glc
- ; HSA-DAG: s_load_dwordx2
- ; CI-DAG: s_mov_b32 flat_scratch_lo, s9
- ; CI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4
- ; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5
- ; GFX9-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0
- ; CI: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
- ; GFX9: global_store_dword [[PTR]], [[K]]
+
+ ; GFX9: s_add_u32 flat_scratch_lo, s6, s9
+ ; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
+
+ ; HSA: {{flat|global}}_store_dword
+ ; HSA: s_barrier
+ ; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
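
The reworked prologue checks keep the two flat_scratch setup schemes apart. On CI, flat_scratch_hi is a shifted sum (on our reading, the register holds the scratch base in 256-byte units, hence the shift by 8); GFX9 instead forms a full 64-bit byte address with an add/carry pair. A sketch assembled from the CHECK lines (the CI add destination is illustrative):

; CI
s_mov_b32 flat_scratch_lo, s9      ; scratch wave offset
s_add_i32 s8, s8, s11              ; aperture base + wave offset
s_lshr_b32 flat_scratch_hi, s8, 8  ; stored in 256-byte units

; GFX9
s_add_u32 flat_scratch_lo, s6, s9  ; base + wave offset, low half
s_addc_u32 flat_scratch_hi, s7, 0  ; carry into the high half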