-
Notifications
You must be signed in to change notification settings - Fork 14k
AMDGPU: Start selecting buffer fat pointer atomicrmw fmin/fmax #95593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Start selecting buffer fat pointer atomicrmw fmin/fmax #95593
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesPatch is 148.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95593.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 78557e006170a..e03f262831eae 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16049,7 +16049,8 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::None;
if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
return AtomicExpansionKind::None;
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
+ AS == AMDGPUAS::BUFFER_FAT_POINTER) {
if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
return AtomicExpansionKind::None;
if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index fb068e35fc597..4b5cdd2ed32ef 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -21,32 +21,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_addk_co_i32 s4, 0x400
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v5, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
-; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB0_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
@@ -81,64 +60,23 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x400
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB0_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
-; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_cbranch_execnz .LBB0_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
@@ -230,60 +168,19 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7)
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s10, s8, 0x400
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s10
-; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB0_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v1, v0
-; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX6-NEXT: s_add_i32 s10, s8, 0x400
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, s10
-; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GFX6-NEXT: v_max_f32_e32 v4, v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v0, v4
-; GFX6-NEXT: v_mov_b32_e32 v1, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB0_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -299,31 +196,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_max_num_f32 v2, v0, v0
-; GFX12-NEXT: s_addk_co_i32 s4, 0x400
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
-; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB1_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
@@ -357,62 +234,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_max_f32 v2, v0, v0
-; GFX11-NEXT: s_addk_i32 s4, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB1_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
-; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_max_f32_e32 v0, v1, v1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_cbranch_execnz .LBB1_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
@@ -502,57 +340,18 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
-; GFX7-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s10, s8, 0x400
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, s10
-; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB1_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s8
-; GFX6-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
-; GFX6-NEXT: s_add_i32 s10, s8, 0x400
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, s10
-; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v1
-; GFX6-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB1_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
@@ -568,8 +367,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -581,54 +380,17 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: ; implicit-def: $vgpr4
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB2_1
; GFX12-NEXT: ; %bb.2:
; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f32_e32 v8, v5, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB2_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB2_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_max_num_f32_e32 v4, v6, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_max_num_f32_e32 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB2_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB2_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
@@ -695,9 +457,8 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr...
[truncated]
|
7396f87
to
5e389ee
Compare
f47b66a
to
694f273
Compare
5e389ee
to
f661aa1
Compare
694f273
to
12f8d7f
Compare
f661aa1
to
b85a15a
Compare
12f8d7f
to
c2c6093
Compare
b85a15a
to
230ddd2
Compare
c2c6093
to
3eba246
Compare
230ddd2
to
5853be5
Compare
3eba246
to
26bdea6
Compare
5853be5
to
58b72f3
Compare
26bdea6
to
26d95dc
Compare
58b72f3
to
d61d35a
Compare
26d95dc
to
1d2a548
Compare
d61d35a
to
ff38a3c
Compare
1d2a548
to
380b019
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/109/builds/274 Here is the relevant piece of the build log for the reference:
|
Not it |
No description provided.