Commit 109cd11

[Attributor] Skip AS specialization for volatile memory instructions (#107250)
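
Previously, AAAddressSpace rewrote the pointer operand of every load and store it could reason about, including volatile accesses. With this change, a volatile load or store is only specialized when TargetTransformInfo reports that the target has a volatile variant of the instruction in the new address space (TTI::hasVolatileVariant); otherwise the use is left in its original address space. The per-use rewrite lambda is also hoisted into a templated makeChange helper so loads and stores share the pointer-operand check.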
1 parent 01eb071 commit 109cd11

File tree

6 files changed: +223 -94 lines changed

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

Lines changed: 37 additions & 19 deletions
@@ -12492,6 +12492,33 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
 
 /// ------------------------ Address Space ------------------------------------
 namespace {
+
+template <typename InstType>
+static bool makeChange(Attributor &A, InstType *MemInst, const Use &U,
+                       Value *OriginalValue, PointerType *NewPtrTy,
+                       bool UseOriginalValue) {
+  if (U.getOperandNo() != InstType::getPointerOperandIndex())
+    return false;
+
+  if (MemInst->isVolatile()) {
+    auto *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+        *MemInst->getFunction());
+    unsigned NewAS = NewPtrTy->getPointerAddressSpace();
+    if (!TTI || !TTI->hasVolatileVariant(MemInst, NewAS))
+      return false;
+  }
+
+  if (UseOriginalValue) {
+    A.changeUseAfterManifest(const_cast<Use &>(U), *OriginalValue);
+    return true;
+  }
+
+  Instruction *CastInst = new AddrSpaceCastInst(OriginalValue, NewPtrTy);
+  CastInst->insertBefore(MemInst);
+  A.changeUseAfterManifest(const_cast<Use &>(U), *CastInst);
+  return true;
+}
+
 struct AAAddressSpaceImpl : public AAAddressSpace {
   AAAddressSpaceImpl(const IRPosition &IRP, Attributor &A)
       : AAAddressSpace(IRP, A) {}
@@ -12535,25 +12562,15 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
             getAssociatedType()->getPointerAddressSpace())
       return ChangeStatus::UNCHANGED;
 
-    Type *NewPtrTy = PointerType::get(getAssociatedType()->getContext(),
-                                      static_cast<uint32_t>(getAddressSpace()));
+    PointerType *NewPtrTy =
+        PointerType::get(getAssociatedType()->getContext(),
+                         static_cast<uint32_t>(getAddressSpace()));
     bool UseOriginalValue =
         OriginalValue->getType()->getPointerAddressSpace() ==
         static_cast<uint32_t>(getAddressSpace());
 
     bool Changed = false;
 
-    auto MakeChange = [&](Instruction *I, Use &U) {
-      Changed = true;
-      if (UseOriginalValue) {
-        A.changeUseAfterManifest(U, *OriginalValue);
-        return;
-      }
-      Instruction *CastInst = new AddrSpaceCastInst(OriginalValue, NewPtrTy);
-      CastInst->insertBefore(cast<Instruction>(I));
-      A.changeUseAfterManifest(U, *CastInst);
-    };
-
     auto Pred = [&](const Use &U, bool &) {
       if (U.get() != AssociatedValue)
         return true;
@@ -12564,12 +12581,13 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
       // CGSCC if the AA is run on CGSCC instead of the entire module.
      if (!A.isRunOn(Inst->getFunction()))
        return true;
-      if (isa<LoadInst>(Inst))
-        MakeChange(Inst, const_cast<Use &>(U));
-      if (isa<StoreInst>(Inst)) {
-        // We only make changes if the use is the pointer operand.
-        if (U.getOperandNo() == 1)
-          MakeChange(Inst, const_cast<Use &>(U));
+      if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+        Changed |=
+            makeChange(A, LI, U, OriginalValue, NewPtrTy, UseOriginalValue);
+      }
+      if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+        Changed |=
+            makeChange(A, SI, U, OriginalValue, NewPtrTy, UseOriginalValue);
       }
       return true;
     };
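
For context, the gate above hinges on a single TargetTransformInfo hook. The sketch below isolates that query; it is a minimal, hypothetical helper (the name volatileVariantExists is illustrative and not part of this commit), not the patch itself.

// Minimal sketch, assuming LLVM's TargetTransformInfo headers are available.
// volatileVariantExists is a hypothetical name, not part of this commit.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool volatileVariantExists(const TargetTransformInfo &TTI,
                                  Instruction *MemInst, unsigned NewAS) {
  // Non-volatile accesses are always eligible for address-space
  // specialization; only volatile ones need the target's approval.
  bool IsVolatile = false;
  if (auto *LI = dyn_cast<LoadInst>(MemInst))
    IsVolatile = LI->isVolatile();
  else if (auto *SI = dyn_cast<StoreInst>(MemInst))
    IsVolatile = SI->isVolatile();
  if (!IsVolatile)
    return true;
  // Ask the target whether a volatile variant of this instruction exists
  // in the new address space; a conservative target answers false, which
  // makes the Attributor keep the original pointer.
  return TTI.hasVolatileVariant(MemInst, NewAS);
}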

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll

Lines changed: 3 additions & 4 deletions
@@ -7,11 +7,10 @@ target triple = "amdgcn-amd-amdhsa"
 ; Make sure flat_scratch_init is set
 
 ; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
-; RW-FLAT: s_add_u32 s0, s0, s7
-; RW-FLAT: s_addc_u32 s1, s1, 0
+; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
+; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
 ; RO-FLAT-NOT: flat_scratch
-; RW-FLAT: buffer_store_dword
-; RO-FLAT: scratch_store_dword
+; GCN: flat_store_dword
 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
 ; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
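
The updated checks reflect the new behavior: the store through the cast stack pointer is no longer specialized to a scratch/buffer access, so both the RW-FLAT and RO-FLAT configurations now expect flat_store_dword, and the RW-FLAT prologue initializes flat_scratch directly.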
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -S %s -o - | FileCheck %s
+
+@g1 = protected addrspace(1) externally_initialized global i32 0, align 4
+
+define internal void @volatile_load_store_as_0(ptr %p) {
+; CHECK-LABEL: define internal void @volatile_load_store_as_0(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[VAL_0:%.*]] = load i32, ptr addrspace(1) @g1, align 4
+; CHECK-NEXT:    [[VAL_1:%.*]] = load volatile i32, ptr [[P]], align 4
+; CHECK-NEXT:    store i32 [[VAL_1]], ptr addrspace(1) @g1, align 4
+; CHECK-NEXT:    store volatile i32 [[VAL_0]], ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  %val.0 = load i32, ptr addrspace(1) @g1, align 4
+  %val.1 = load volatile i32, ptr %p, align 4
+  store i32 %val.1, ptr addrspace(1) @g1, align 4
+  store volatile i32 %val.0, ptr %p, align 4
+  ret void
+}
+
+define void @call_volatile_load_store_as_0(ptr %p1, ptr %p2) {
+; CHECK-LABEL: define void @call_volatile_load_store_as_0(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @volatile_load_store_as_0(ptr [[P1]])
+; CHECK-NEXT:    call void @volatile_load_store_as_0(ptr [[P2]])
+; CHECK-NEXT:    ret void
+;
+  call void @volatile_load_store_as_0(ptr %p1)
+  call void @volatile_load_store_as_0(ptr %p2)
+  ret void
+}
+
+define internal void @volatile_load_store_as_1(ptr %p) {
+; CHECK-LABEL: define internal void @volatile_load_store_as_1(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[VAL_0:%.*]] = load i32, ptr addrspace(1) @g1, align 4
+; CHECK-NEXT:    [[VAL_1:%.*]] = load volatile i32, ptr [[P]], align 4
+; CHECK-NEXT:    store i32 [[VAL_1]], ptr addrspace(1) @g1, align 4
+; CHECK-NEXT:    store volatile i32 [[VAL_0]], ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  %val.0 = load i32, ptr addrspace(1) @g1, align 4
+  %val.1 = load volatile i32, ptr %p, align 4
+  store i32 %val.1, ptr addrspace(1) @g1, align 4
+  store volatile i32 %val.0, ptr %p, align 4
+  ret void
+}
+
+define void @call_volatile_load_store_as_1(ptr addrspace(1) %p1, ptr addrspace(1) %p2) {
+; CHECK-LABEL: define void @call_volatile_load_store_as_1(
+; CHECK-SAME: ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[P1_CAST:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
+; CHECK-NEXT:    [[P2_CAST:%.*]] = addrspacecast ptr addrspace(1) [[P2]] to ptr
+; CHECK-NEXT:    call void @volatile_load_store_as_1(ptr [[P1_CAST]])
+; CHECK-NEXT:    call void @volatile_load_store_as_1(ptr [[P2_CAST]])
+; CHECK-NEXT:    ret void
+;
+  %p1.cast = addrspacecast ptr addrspace(1) %p1 to ptr
+  %p2.cast = addrspacecast ptr addrspace(1) %p2 to ptr
+  call void @volatile_load_store_as_1(ptr %p1.cast)
+  call void @volatile_load_store_as_1(ptr %p2.cast)
+  ret void
+}
+
+define internal void @volatile_load_store_as_4(ptr %p) {
+  %val.0 = load i32, ptr addrspace(1) @g1, align 4
+  %val.1 = load volatile i32, ptr %p, align 4
+  store i32 %val.1, ptr addrspace(1) @g1, align 4
+  store volatile i32 %val.0, ptr %p, align 4
+  ret void
+}
+
+define void @call_volatile_load_store_as_4(ptr addrspace(4) %p1, ptr addrspace(4) %p2) {
+; CHECK-LABEL: define void @call_volatile_load_store_as_4(
+; CHECK-SAME: ptr addrspace(4) [[P1:%.*]], ptr addrspace(4) [[P2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[P1_CAST:%.*]] = addrspacecast ptr addrspace(4) [[P1]] to ptr
+; CHECK-NEXT:    [[P2_CAST:%.*]] = addrspacecast ptr addrspace(4) [[P2]] to ptr
+; CHECK-NEXT:    call void @volatile_load_store_as_1(ptr [[P1_CAST]])
+; CHECK-NEXT:    call void @volatile_load_store_as_1(ptr [[P2_CAST]])
+; CHECK-NEXT:    ret void
+;
+  %p1.cast = addrspacecast ptr addrspace(4) %p1 to ptr
+  %p2.cast = addrspacecast ptr addrspace(4) %p2 to ptr
+  call void @volatile_load_store_as_1(ptr %p1.cast)
+  call void @volatile_load_store_as_1(ptr %p2.cast)
+  ret void
+}
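
In each case the volatile load and store through %p keep the generic ptr type, and for the addrspace(1) and addrspace(4) callers the addrspacecasts stay at the call sites rather than the argument being specialized, while the ordinary accesses to @g1 remain in addrspace(1).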

llvm/test/CodeGen/AMDGPU/addrspacecast.ll

Lines changed: 73 additions & 47 deletions
@@ -5,11 +5,22 @@ target triple = "amdgcn-amd-amdhsa"
 
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
 
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA-DAG: ds_write_b32 [[PTR]], [[K]]
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
 ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -28,8 +39,22 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
 
 ; Test handling inside a non-kernel
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA-DAG: ds_write_b32 v0, [[K]]
+
+; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]]
+; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
+; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 7, ptr %stof
@@ -38,16 +63,23 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 
 ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
 
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
 
 ; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
 ; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -65,12 +97,10 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
 
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
-; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
-; GFX9-DAG: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0
-; GFX9: global_store_dword [[ADDR]], [[K]], s[[[PTRLO]]:[[PTRHI]]]
+; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
 
 ; HSA: .amdhsa_user_sgpr_queue_ptr 0
 define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #0 {
@@ -82,7 +112,9 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 ; no-op
 ; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
 ; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
-; HSA-DAG: s_load_dword s0, s[[[PTRLO]]:[[PTRHI]]], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #0 {
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   %ld = load volatile i32, ptr %stof
@@ -183,9 +215,11 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 {
 }
 
 ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: ds_write_b32 v[[LO]], v[[K]]
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast ptr addrspace(3) null to ptr
   store volatile i32 7, ptr %cast
@@ -203,9 +237,10 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
 }
 
 ; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1
-; HSA: ds_write_b32 v[[LO]], v[[K]]
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast ptr addrspace(3) inttoptr (i32 -1 to ptr addrspace(3)) to ptr
   store volatile i32 7, ptr %cast
@@ -224,13 +259,10 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 
 ; FIXME: Shouldn't need to enable queue ptr
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast ptr addrspace(5) null to ptr
   store volatile i32 7, ptr %cast
@@ -249,14 +281,10 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
 
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword v[[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
 
 ; CI: .amdhsa_user_sgpr_queue_ptr 1
 ; GFX9: .amdhsa_user_sgpr_queue_ptr 0
@@ -306,18 +334,16 @@ end:
 
 ; Check for prologue initializing special SGPRs pointing to scratch.
 ; HSA-LABEL: {{^}}store_flat_scratch:
+; CI-DAG: s_mov_b32 flat_scratch_lo, s9
 ; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
 ; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
-; HSA: buffer_store_dword
-; HSA: s_barrier
-; HSA: buffer_load_dword [[K:v[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen glc
-; HSA-DAG: s_load_dwordx2
-; CI-DAG: s_mov_b32 flat_scratch_lo, s9
-; CI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GFX9-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0
-; CI: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
-; GFX9: global_store_dword [[PTR]], [[K]]
+
+; GFX9: s_add_u32 flat_scratch_lo, s6, s9
+; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
+
+; HSA: {{flat|global}}_store_dword
+; HSA: s_barrier
+; HSA: {{flat|global}}_load_dword
 define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4, addrspace(5)
   %x = call i32 @llvm.amdgcn.workitem.id.x() #2
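
These check updates all follow the same pattern: the accesses in this file are volatile (store volatile i32 7 / load volatile i32 through the cast pointers), so the Attributor no longer specializes them to group, private, global, or constant accesses, and the expectations move from ds_write/buffer_store/global_store forms to flat (or flat|global) forms.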
