-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU] Still set up the two SGPRs for queue ptr even if it is COV5 #112403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
e5bdbf6
to
8e6bdab
Compare
28df934
to
e35360a
Compare
8e6bdab
to
de01250
Compare
e35360a
to
700bdb6
Compare
de01250
to
8c9fc97
Compare
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian) Changes Patch is 26.04 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112403.diff 538 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 351e9f25e29cfc..3ff3cc26153964 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -472,9 +472,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- const Module *M = MF.getFunction().getParent();
- if (UserSGPRInfo.hasQueuePtr() &&
- AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
+ if (UserSGPRInfo.hasQueuePtr()) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8c197f23149612..91778223bc79f0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2393,9 +2393,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (UserSGPRInfo.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- const Module *M = MF.getFunction().getParent();
- if (UserSGPRInfo.hasQueuePtr() &&
- AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
+ if (UserSGPRInfo.hasQueuePtr())
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
@@ -2446,9 +2444,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- const Module *M = MF.getFunction().getParent();
- if (UserSGPRInfo.hasQueuePtr() &&
- AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
+ if (UserSGPRInfo.hasQueuePtr()) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index 359c1e53de99e3..4345fa96da8c88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -6,15 +6,15 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_add_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s2, s2, s4
+; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -22,14 +22,14 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12-LABEL: s_add_u64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -58,15 +58,15 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_sub_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s2, s2, s4
+; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -74,14 +74,14 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12-LABEL: s_sub_u64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 43266554c2d8a6..382415f5653e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1494,7 +1494,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1504,7 +1504,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v2, s16
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX940-NEXT: s_mov_b64 s[4:5], 0
@@ -1531,7 +1531,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1542,13 +1542,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s18
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s6, s16
-; GFX10-NEXT: s_mov_b32 s7, s17
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1557,14 +1553,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_mov_b32 s4, s6
-; GFX90A-NEXT: s_mov_b32 s5, s7
-; GFX90A-NEXT: s_mov_b32 s6, s16
-; GFX90A-NEXT: s_mov_b32 s7, s17
-; GFX90A-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-NEXT: v_mov_b32_e32 v2, s20
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1573,28 +1565,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_mov_b32 s4, s6
-; GFX908-NEXT: s_mov_b32 s5, s7
-; GFX908-NEXT: s_mov_b32 s6, s16
-; GFX908-NEXT: s_mov_b32 s7, s17
-; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_mov_b32_e32 v2, s20
; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v3, v1, v1
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1604,28 +1592,24 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX908-NEXT: v_max_f32_e32 v4, v0, v3
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, s6
-; GFX8-NEXT: s_mov_b32 s5, s7
-; GFX8-NEXT: s_mov_b32 s6, s16
-; GFX8-NEXT: s_mov_b32 s7, s17
-; GFX8-NEXT: v_mov_b32_e32 v2, s18
+; GFX8-NEXT: v_mov_b32_e32 v2, s20
; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: buffer_load_dword v0, v2, s[4:7], 0 offen
-; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1635,26 +1619,22 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX8-NEXT: v_max_f32_e32 v4, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s7
-; GFX7-NEXT: s_mov_b32 s6, s16
-; GFX7-NEXT: s_mov_b32 s7, s17
-; GFX7-NEXT: v_mov_b32_e32 v1, s18
-; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v1, s20
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -1670,7 +1650,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1680,7 +1660,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v2, s16
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_max_f32_e32 v3, v0, v0
@@ -1706,7 +1686,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s6
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1717,13 +1697,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s18
-; GFX10-NEXT: s_mov_b32 s4, s6
-; GFX10-NEXT: s_mov_b32 s5, s7
-; GFX10-NEXT: s_mov_b32 s6, s16
-; GFX10-NEXT: s_mov_b32 s7, s17
+; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1732,13 +1708,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_mov_b32 s4, s6
-; GFX90A-NEXT: s_mov_b32 s5, s7
-; GFX90A-NEXT: s_mov_b32 s6, s16
-; GFX90A-NEXT: s_mov_b32 s7, s17
-; GFX90A-NEXT: v_mov_b32_e32 v2, s18
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, s20
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1746,28 +1718,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_mov_b32 s4, s6
-; GFX908-NEXT: s_mov_b32 s5, s7
-; GFX908-NEXT: s_mov_b32 s6, s16
-; GFX908-NEXT: s_mov_b32 s7, s17
-; GFX908-NEXT: v_mov_b32_e32 v2, s18
-; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v2, s20
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1776,28 +1744,24 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G...
[truncated]
|
700bdb6
to
0876ac6
Compare
8c9fc97
to
958a04e
Compare
0876ac6
to
c403ecc
Compare
958a04e
to
1abf3f2
Compare
c403ecc
to
76a1bb2
Compare
1abf3f2
to
128a266
Compare
76a1bb2
to
4bd5d5d
Compare
128a266
to
c9ff8ab
Compare
The only test failure is
|
Something is wrong with how inreg is interacting with special arguments at callsites. The special arguments shouldn't be changing the number of registers available for user arguments. |
It is actually not for user arguments; it is for exec instead, when emitting the function prologue. All non-callee-saved SGPRs are already allocated at this point. The fix might be to spill one SGPR or to reserve one ahead of time when allocating SGPRs for arguments. |
We should probably just always reserve a register for these spill situations. The SGPR argument part is incidental. |
41ec3fb
to
7abe2a6
Compare
We’re facing an issue (#113782) that is currently blocking #112403. However, since #112403 involves extensive test changes, I’d prefer to land it as soon as possible. This PR reorganizes the tests by moving test cases expected to fail into a separate file. Additionally, it changes the `[15 x i32]` arguments to `[13 x i32]` to bypass the issue.
9b97b93
to
b289eef
Compare
We’re facing an issue (#113782) that is currently blocking #112403. However, since #112403 involves extensive test changes, I’d prefer to land it as soon as possible. This PR reorganizes the tests by moving test cases expected to fail into a separate file. Additionally, it changes the `[15 x i32]` arguments to `[13 x i32]` to bypass the issue.
7abe2a6
to
d972769
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/9877 Here is the relevant piece of the build log for the reference
|
Hey @shiltian, I think this patch also broke the HIP bot: https://lab.llvm.org/buildbot/#/builders/123/builds/9031 and other OpenMP Offload bots. |
That's one of the flaky tests that randomly fails |
I thought the one that was failing randomly for some time now has been disabled and the HIP bot is still red. All OpenMP bots are also red. |
This change is likely to expose an issue in the register allocation, as we now lose 2 SGPRs by default. |
… COV5 (llvm#112403)" This reverts commit ca33649. Change-Id: Icb47ca972ee762362bb4bc0d1c04e2592e03932f
@shiltian This seems to unintentionally add the binary file llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.o to tests. |
Yes, indeed. Thanks! I fixed it downstream and I thought it was added there. |
We’re facing an issue (llvm#113782) that is currently blocking llvm#112403. However, since llvm#112403 involves extensive test changes, I’d prefer to land it as soon as possible. This PR reorganizes the tests by moving test cases expected to fail into a separate file. Additionally, it changes the `[15 x i32]` arguments to `[13 x i32]` to bypass the issue.
…COV5 (llvm#112403)" This reverts commit e215a1e as it broke both hip and openmp buildbots.
… COV5 (llvm#112403)" This reverts commit ca33649.
…ass (llvm#102913) Converts AMDGPUResourceUsageAnalysis pass from Module to MachineFunction pass. Moves function resource info propagation to the MC layer (through helpers in AMDGPUMCResourceInfo) by generating MCExprs for every function resource which the emitters have been prepped for. Fixes llvm#64863 [AMDGPU] Fix stack size metadata for functions with direct and indirect calls (llvm#110828) When a function has an external call, it should still use the stack sizes of direct, known, calls to calculate its own stack size [AMDGPU] Fix resource usage information for unnamed functions (llvm#115320) Resource usage information would try to overwrite unnamed functions if there are multiple within the same compilation unit. This aims to either use the `MCSymbol` assigned to the unnamed function (i.e., `CurrentFnSym`), or, rematerialize the `MCSymbol` for the unnamed function. Reapply [AMDGPU] Avoid resource propagation for recursion through multiple functions (llvm#112251) I was wrong last patch. I viewed the `Visited` set purely as a possible recursion deterrent where functions calling a callee multiple times are handled elsewhere. This doesn't consider cases where a function is called multiple times by different callers still part of the same call graph. New test shows the aforementioned case. Reapplies llvm#111004, fixes llvm#115562. [AMDGPU] Newly added test modified for recent SGPR use change (llvm#116427) Mistimed rebase for llvm#112251 which added new tests which did not consider the changes introduced in llvm#112403 yet Change-Id: I4dfe6a1b679137e080a6d2b44016347ea704b014
No description provided.