-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[Attributor] Skip AS specialization for volatile memory instructions #107250
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Shilei Tian (shiltian) ChangesFull diff: https://github.com/llvm/llvm-project/pull/107250.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 69d29b6c042349..59c551196f6559 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12509,6 +12509,34 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
/// ------------------------ Address Space ------------------------------------
namespace {
+
+template <typename InstType>
+static bool makeChange(Attributor &A, InstType *MemInst, const Use &U,
+ Value *OriginalValue, PointerType *NewPtrTy,
+ bool UseOriginalValue) {
+ if (U.getOperandNo() != InstType::getPointerOperandIndex())
+ return false;
+
+ auto *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+ *MemInst->getFunction());
+ if (!TTI)
+ return false;
+
+ unsigned OldAS = MemInst->getPointerAddressSpace();
+ if (MemInst->isVolatile() && !TTI->hasVolatileVariant(MemInst, OldAS))
+ return false;
+
+ if (UseOriginalValue) {
+ A.changeUseAfterManifest(const_cast<Use &>(U), *OriginalValue);
+ return true;
+ }
+
+ Instruction *CastInst = new AddrSpaceCastInst(OriginalValue, NewPtrTy);
+ CastInst->insertBefore(MemInst);
+ A.changeUseAfterManifest(const_cast<Use &>(U), *CastInst);
+ return true;
+}
+
struct AAAddressSpaceImpl : public AAAddressSpace {
AAAddressSpaceImpl(const IRPosition &IRP, Attributor &A)
: AAAddressSpace(IRP, A) {}
@@ -12552,25 +12580,15 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
getAssociatedType()->getPointerAddressSpace())
return ChangeStatus::UNCHANGED;
- Type *NewPtrTy = PointerType::get(getAssociatedType()->getContext(),
- static_cast<uint32_t>(getAddressSpace()));
+ PointerType *NewPtrTy =
+ PointerType::get(getAssociatedType()->getContext(),
+ static_cast<uint32_t>(getAddressSpace()));
bool UseOriginalValue =
OriginalValue->getType()->getPointerAddressSpace() ==
static_cast<uint32_t>(getAddressSpace());
bool Changed = false;
- auto MakeChange = [&](Instruction *I, Use &U) {
- Changed = true;
- if (UseOriginalValue) {
- A.changeUseAfterManifest(U, *OriginalValue);
- return;
- }
- Instruction *CastInst = new AddrSpaceCastInst(OriginalValue, NewPtrTy);
- CastInst->insertBefore(cast<Instruction>(I));
- A.changeUseAfterManifest(U, *CastInst);
- };
-
auto Pred = [&](const Use &U, bool &) {
if (U.get() != AssociatedValue)
return true;
@@ -12581,12 +12599,13 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
// CGSCC if the AA is run on CGSCC instead of the entire module.
if (!A.isRunOn(Inst->getFunction()))
return true;
- if (isa<LoadInst>(Inst))
- MakeChange(Inst, const_cast<Use &>(U));
- if (isa<StoreInst>(Inst)) {
- // We only make changes if the use is the pointer operand.
- if (U.getOperandNo() == 1)
- MakeChange(Inst, const_cast<Use &>(U));
+ if (auto *LI = dyn_cast<LoadInst>(Inst)) {
+ Changed |=
+ makeChange(A, LI, U, OriginalValue, NewPtrTy, UseOriginalValue);
+ }
+ if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+ Changed |=
+ makeChange(A, SI, U, OriginalValue, NewPtrTy, UseOriginalValue);
}
return true;
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 66b88236bbb4c1..9b9249b62b0bca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -7,11 +7,10 @@ target triple = "amdgcn-amd-amdhsa"
; Make sure flat_scratch_init is set
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
-; RW-FLAT: s_add_u32 s0, s0, s7
-; RW-FLAT: s_addc_u32 s1, s1, 0
+; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
+; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
; RO-FLAT-NOT: flat_scratch
-; RW-FLAT: buffer_store_dword
-; RO-FLAT: scratch_store_dword
+; GCN: flat_store_dword
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 4b1484e9bd958e..7336543b41cbc8 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -5,11 +5,22 @@ target triple = "amdgcn-amd-amdhsa"
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA-DAG: ds_write_b32 [[PTR]], [[K]]
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9-DAG: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -28,8 +39,22 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
; Test handling inside a non-kernel
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
+; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
+
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_shared_base
+
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA-DAG: ds_write_b32 v0, [[K]]
+
+; GFX9-DAG: v_mov_b32_e32 v[[VREG_HIBASE:[0-9]+]], s[[HIBASE]]
+; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
+; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
+; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, v[[VREG_HIBASE]], vcc
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
%stof = addrspacecast ptr addrspace(3) %ptr to ptr
store volatile i32 7, ptr %stof
@@ -38,16 +63,23 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
-; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x0{{$}}
-; GFX9-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x0{{$}}
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], [[APERTURE]]
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+
+; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b32 s[[HI:[0-9]+]], [[APERTURE]], 0
+; CI-DAG: s_cselect_b32 s[[LO:[0-9]+]], [[PTR]], 0
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_mov_b64 s[{{[0-9]+}}:[[HIBASE:[0-9]+]]], src_private_base
+
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b32 s[[LO:[0-9]+]], s[[HIBASE]], 0
+; GFX9: s_cselect_b32 s[[HI:[0-9]+]], [[PTR]], 0
+
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
; HSA: .amdhsa_user_sgpr_private_segment_buffer 1
; HSA: .amdhsa_user_sgpr_dispatch_ptr 0
@@ -65,12 +97,10 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
-; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
-; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
-; GFX9-DAG: v_mov_b32_e32 [[ADDR:v[0-9]+]], 0
-; GFX9: global_store_dword [[ADDR]], [[K]], s[[[PTRLO]]:[[PTRHI]]]
+; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
; HSA: .amdhsa_user_sgpr_queue_ptr 0
define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #0 {
@@ -82,7 +112,9 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
-; HSA-DAG: s_load_dword s0, s[[[PTRLO]]:[[PTRHI]]], 0x0
+; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
+; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
+; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #0 {
%stof = addrspacecast ptr addrspace(4) %ptr to ptr
%ld = load volatile i32, ptr %stof
@@ -183,9 +215,11 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #0 {
}
; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
+
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: ds_write_b32 v[[LO]], v[[K]]
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast ptr addrspace(3) null to ptr
store volatile i32 7, ptr %cast
@@ -203,9 +237,10 @@ define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
}
; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1
-; HSA: ds_write_b32 v[[LO]], v[[K]]
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast ptr addrspace(3) inttoptr (i32 -1 to ptr addrspace(3)) to ptr
store volatile i32 7, ptr %cast
@@ -224,13 +259,10 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
+; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
+; HSA: flat_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
%cast = addrspacecast ptr addrspace(5) null to ptr
store volatile i32 7, ptr %cast
@@ -249,14 +281,10 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
-; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
-; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
-; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
-; HSA: buffer_store_dword v[[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
+; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
; CI: .amdhsa_user_sgpr_queue_ptr 1
; GFX9: .amdhsa_user_sgpr_queue_ptr 0
@@ -306,18 +334,16 @@ end:
; Check for prologue initializing special SGPRs pointing to scratch.
; HSA-LABEL: {{^}}store_flat_scratch:
+; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
-; HSA: buffer_store_dword
-; HSA: s_barrier
-; HSA: buffer_load_dword [[K:v[0-9]+]], v{{[0-9]+}}, s[0:3], 0 offen glc
-; HSA-DAG: s_load_dwordx2
-; CI-DAG: s_mov_b32 flat_scratch_lo, s9
-; CI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s4
-; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s5
-; GFX9-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0
-; CI: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
-; GFX9: global_store_dword [[PTR]], [[K]]
+
+; GFX9: s_add_u32 flat_scratch_lo, s6, s9
+; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
+
+; HSA: {{flat|global}}_store_dword
+; HSA: s_barrier
+; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(ptr addrspace(1) noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4, addrspace(5)
%x = call i32 @llvm.amdgcn.workitem.id.x() #2
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 879bceaef97c00..43cdf85ed3818c 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -425,7 +425,8 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
-; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(3) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
+; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(3) %ptr to ptr
@@ -442,7 +443,8 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] {
-; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(5) [[PTR]], align 4
+; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
+; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
; ATTRIBUTOR_HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -476,16 +478,11 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
; No-op addrspacecast should not use queue ptr
define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
-; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) [[PTR]], align 4
-; ATTRIBUTOR_HSA-NEXT: ret void
+; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
+; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
+; HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
+; HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(1) %ptr to ptr
store volatile i32 0, ptr %stof
@@ -493,16 +490,11 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
}
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
-; AKF_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
-; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
-; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4
-; ATTRIBUTOR_HSA-NEXT: ret void
+; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
+; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
+; HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
+; HSA-NEXT: ret void
;
%stof = addrspacecast ptr addrspace(4) %ptr to ptr
%ld = load volatile i32, ptr %stof
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..4634eb4abf3815 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -38,9 +38,15 @@ define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
}
; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0
-; GCN-DAG: ds_write_b32 v[[LO]], v[[LO]] offset:16
+; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[4:5], 0x0
+ ; CIVI: v_mov_b32_e32 v[[LO:[0-9]+]], 16
+ ; CIVI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
+ ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base
+ ; GFX9-DAG: v_mov_b32_e32 v[[VGPR_HI:[0-9]+]], s[[HI]]
+ ; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[VGPR_HI]]]
+
+ ; CIVI: {{flat|global}}_store_dword v[[[LO]]:[[HI]]]
define hidden void @use_queue_ptr_addrspacecast() #1 {
%asc = addrspacecast ptr addrspace(3) inttoptr (i32 16 to ptr addrspace(3)) to ptr
store volatile i32 0, ptr %asc
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing attributor IR test?
43192a0
to
cd6992b
Compare
Done. |
ping |
@@ -5,11 +5,22 @@ target triple = "amdgcn-amd-amdhsa" | |||
|
|||
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know why this test is running amdgpu-attributor but it probably shouldn't be? This ought to just be testing the codegen of addrspacecast, optimizing out cases just makes it more confusing
Anyway this is unbreaking the test since the cast now remains
; GFX9-DAG: v_mov_b32_e32 v[[VGPR_HI:[0-9]+]], s[[HI]] | ||
; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[VGPR_HI]]] | ||
|
||
; CIVI: {{flat|global}}_store_dword v[[[LO]]:[[HI]]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is un-breaking this test
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] { | ||
; ATTRIBUTOR_HSA-NEXT: [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4 | ||
; ATTRIBUTOR_HSA-NEXT: ret void | ||
; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is unbreaking this test
cd6992b
to
bc72df1
Compare
No description provided.