Skip to content

Commit 72fc9b4 (parent: 304c7a8)

[AMDGPU] Don't run InferAddressSpacesPass in code generation pipeline

11 files changed: +186 additions, -465 deletions

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
810810
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
811811
#include "llvm/Passes/TargetPassRegistry.inc"
812812

813+
PB.registerScalarOptimizerLateEPCallback(
814+
[](FunctionPassManager &FPM, OptimizationLevel Level) {
815+
if (Level == OptimizationLevel::O0)
816+
return;
817+
818+
FPM.addPass(InferAddressSpacesPass());
819+
});
820+
813821
PB.registerPipelineEarlySimplificationEPCallback(
814822
[](ModulePassManager &PM, OptimizationLevel Level,
815823
ThinOrFullLTOPhase Phase) {
@@ -907,6 +915,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
907915
if (EnableLowerModuleLDS)
908916
PM.addPass(AMDGPULowerModuleLDSPass(*this));
909917
if (Level != OptimizationLevel::O0) {
918+
// We only want to run this with O2 or higher since inliner and SROA
919+
// don't run in O1.
920+
if (Level != OptimizationLevel::O1) {
921+
PM.addPass(
922+
createModuleToFunctionPassAdaptor(InferAddressSpacesPass()));
923+
}
910924
// Do we really need internalization in LTO?
911925
if (InternalizeSymbols) {
912926
PM.addPass(InternalizePass(mustPreserveGV));
@@ -1263,9 +1277,6 @@ void AMDGPUPassConfig::addIRPasses() {
12631277
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
12641278
}
12651279

1266-
if (TM.getOptLevel() > CodeGenOptLevel::None)
1267-
addPass(createInferAddressSpacesPass());
1268-
12691280
// Run atomic optimizer before Atomic Expand
12701281
if ((TM.getTargetTriple().isAMDGCN()) &&
12711282
(TM.getOptLevel() >= CodeGenOptLevel::Less) &&
@@ -2004,9 +2015,6 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
20042015
if (EnableLowerModuleLDS)
20052016
addPass(AMDGPULowerModuleLDSPass(TM));
20062017

2007-
if (TM.getOptLevel() > CodeGenOptLevel::None)
2008-
addPass(InferAddressSpacesPass());
2009-
20102018
// Run atomic optimizer before Atomic Expand
20112019
if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
20122020
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -136,30 +136,35 @@ define void @constrained_if_register_class() {
136136
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
137137
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
138138
; CHECK-NEXT: s_cbranch_scc0 .LBB4_2
139-
; CHECK-NEXT: .LBB4_1: ; %bb12
139+
; CHECK-NEXT: ; %bb.1: ; %bb12
140140
; CHECK-NEXT: s_setpc_b64 s[30:31]
141141
; CHECK-NEXT: .LBB4_2: ; %bb2
142142
; CHECK-NEXT: s_getpc_b64 s[4:5]
143143
; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4
144144
; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+12
145145
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
146-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
146+
; CHECK-NEXT: s_mov_b32 s6, -1
147147
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
148148
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
149149
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
150-
; CHECK-NEXT: global_load_dword v0, v0, s[4:5]
151-
; CHECK-NEXT: s_mov_b32 s4, -1
152-
; CHECK-NEXT: s_waitcnt vmcnt(0)
153-
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
154-
; CHECK-NEXT: s_cbranch_vccnz .LBB4_4
150+
; CHECK-NEXT: v_mov_b32_e32 v0, s4
151+
; CHECK-NEXT: v_mov_b32_e32 v1, s5
152+
; CHECK-NEXT: flat_load_dword v0, v[0:1]
153+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
154+
; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0
155+
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
155156
; CHECK-NEXT: ; %bb.3: ; %bb7
156-
; CHECK-NEXT: s_mov_b32 s4, 0
157-
; CHECK-NEXT: .LBB4_4: ; %bb8
158-
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
159-
; CHECK-NEXT: s_cbranch_scc1 .LBB4_1
157+
; CHECK-NEXT: s_mov_b32 s6, 0
158+
; CHECK-NEXT: ; %bb.4: ; %bb8
159+
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
160+
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s6, 0
161+
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
162+
; CHECK-NEXT: s_cbranch_execz .LBB4_6
160163
; CHECK-NEXT: ; %bb.5: ; %bb11
161164
; CHECK-NEXT: v_mov_b32_e32 v0, 4.0
162165
; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
166+
; CHECK-NEXT: .LBB4_6: ; %Flow
167+
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
163168
; CHECK-NEXT: s_waitcnt vmcnt(0)
164169
; CHECK-NEXT: s_setpc_b64 s[30:31]
165170
bb:

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
361361
; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000)
362362
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
363363
; GFX90A-NEXT: {{ $}}
364-
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
364+
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23)
365365
; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
366366
; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0
367367
; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1
@@ -407,7 +407,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
407407
; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000)
408408
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67
409409
; GFX90A-NEXT: {{ $}}
410-
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
410+
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30)
411411
; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
412412
; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1
413413
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr36_sgpr37
@@ -460,7 +460,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
460460
; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000)
461461
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67
462462
; GFX90A-NEXT: {{ $}}
463-
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
463+
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37)
464464
; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
465465
; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1
466466
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37
@@ -512,7 +512,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
512512
; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
513513
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
514514
; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
515-
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1)
515+
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44)
516516
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0
517517
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1
518518
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37
@@ -610,7 +610,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
610610
; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc
611611
; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
612612
; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
613-
; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1)
613+
; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51)
614614
; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0
615615
; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1
616616
; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
@@ -726,7 +726,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
726726
; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000)
727727
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
728728
; GFX90A-NEXT: {{ $}}
729-
; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
729+
; GFX90A-NEXT: renamable $vgpr6 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
730730
; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
731731
; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0
732732
; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1

llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,49 @@
44
define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, ptr addrspace(1) %b, double %c) {
55
; CHECK-LABEL: IllegalGEPConst:
66
; CHECK: ; %bb.0: ; %entry
7+
; CHECK-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8+
; CHECK-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
79
; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24
810
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
9-
; CHECK-NEXT: v_mov_b32_e32 v2, 0
11+
; CHECK-NEXT: s_mov_b32 s14, -1
12+
; CHECK-NEXT: s_mov_b32 s15, 0xe00000
13+
; CHECK-NEXT: s_add_u32 s12, s12, s11
14+
; CHECK-NEXT: s_addc_u32 s13, s13, 0
1015
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
1116
; CHECK-NEXT: s_ashr_i32 s7, s6, 31
12-
; CHECK-NEXT: v_mov_b32_e32 v0, s2
13-
; CHECK-NEXT: v_mov_b32_e32 v1, s3
14-
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
15-
; CHECK-NEXT: s_add_u32 s0, s0, s2
16-
; CHECK-NEXT: s_addc_u32 s1, s1, s3
17-
; CHECK-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8
18-
; CHECK-NEXT: s_waitcnt vmcnt(0)
17+
; CHECK-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
18+
; CHECK-NEXT: s_add_u32 s0, s0, s6
19+
; CHECK-NEXT: s_addc_u32 s1, s1, s7
20+
; CHECK-NEXT: s_add_u32 s0, s0, -8
21+
; CHECK-NEXT: s_mov_b64 s[4:5], src_private_base
22+
; CHECK-NEXT: s_addc_u32 s1, s1, -1
23+
; CHECK-NEXT: s_cmp_eq_u32 s1, s5
24+
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
25+
; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5]
26+
; CHECK-NEXT: s_mov_b64 s[4:5], -1
27+
; CHECK-NEXT: s_cbranch_vccnz .LBB0_3
28+
; CHECK-NEXT: ; %bb.1: ; %Flow
29+
; CHECK-NEXT: s_andn2_b64 vcc, exec, s[4:5]
30+
; CHECK-NEXT: s_cbranch_vccz .LBB0_4
31+
; CHECK-NEXT: .LBB0_2: ; %atomicrmw.phi
32+
; CHECK-NEXT: s_endpgm
33+
; CHECK-NEXT: .LBB0_3: ; %atomicrmw.global
34+
; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
35+
; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
36+
; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
37+
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1938
; CHECK-NEXT: buffer_wbinvl1_vol
39+
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
40+
; CHECK-NEXT: .LBB0_4: ; %atomicrmw.private
41+
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
42+
; CHECK-NEXT: s_cselect_b32 s0, s0, -1
43+
; CHECK-NEXT: v_mov_b32_e32 v2, s0
44+
; CHECK-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen
45+
; CHECK-NEXT: buffer_load_dword v1, v2, s[12:15], 0 offen offset:4
46+
; CHECK-NEXT: s_waitcnt vmcnt(0)
47+
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3]
48+
; CHECK-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen
49+
; CHECK-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen offset:4
2050
; CHECK-NEXT: s_endpgm
2151
entry:
2252
%i = add nsw i32 %a, -1
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=infer-address-spaces < %s | FileCheck %s
3+
4+
define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
5+
; CHECK-LABEL: define amdgpu_kernel void @infer_as_before_atomic(
6+
; CHECK-SAME: ptr addrspace(4) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr addrspace(4) [[ARG]], align 8
8+
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[LOAD]] to ptr addrspace(1)
9+
; CHECK-NEXT: [[V:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP1]], float 1.000000e+00 syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]]
10+
; CHECK-NEXT: ret void
11+
;
12+
%load = load ptr, ptr addrspace(4) %arg
13+
%v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
14+
ret void
15+
}
16+
17+
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
18+
19+
!0 = !{}
20+
21+
;.
22+
; CHECK: [[META0]] = !{}
23+
;.

0 commit comments

Comments (0)