diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 2a8385df3f934..e416c91ba52d2 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1831,6 +1831,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasScratchBaseForwardingHazard() const { return GFX1250Insts && getGeneration() == GFX12; } + + /// \returns true if the subtarget requires a wait for xcnt before atomic + /// flat/global stores & rmw. + bool requiresWaitXCntBeforeAtomicStores() const { return GFX1250Insts; } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index f7c7bb509c9ef..bcf8a86effe5a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1051,6 +1051,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return AMDGPU::S_WAIT_DSCNT; case AMDGPU::S_WAIT_KMCNT_soft: return AMDGPU::S_WAIT_KMCNT; + case AMDGPU::S_WAIT_XCNT_soft: + return AMDGPU::S_WAIT_XCNT; default: return Opcode; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 53f554eccb1fb..27e2ac5372796 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -587,7 +587,11 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const; public: - SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) { + // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases + // the behavior is the same if assuming GFX12.0 in CU mode. + assert(ST.hasGFX1250Insts() ? 
ST.isCuModeEnabled() : true); + } bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -2340,12 +2344,16 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, STORECnt |= true; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore need to wait for operations to complete to ensure - // they are visible to waves in the other CU as the L0 is per CU. - // Otherwise in CU mode and all waves of a work-group are on the same CU - // which shares the same L0. - if (!ST.isCuModeEnabled()) { + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU + // of the WGP. Therefore need to wait for operations to complete to + // ensure they are visible to waves in the other CU as the L0 is per CU. + // Otherwise in CU mode and all waves of a work-group are on the same CU + // which shares the same L0. + // + // GFX12.5: + // TODO DOCS + if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; if ((Op & SIMemOp::STORE) != SIMemOp::NONE) @@ -2397,7 +2405,7 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // // This also applies to fences. Fences cannot pair with an instruction // tracked with bvh/samplecnt as we don't have any atomics that do that. - if (Order != AtomicOrdering::Acquire) { + if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0); BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0); } @@ -2449,10 +2457,14 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, ScopeImm = AMDGPU::CPol::SCOPE_DEV; break; case SIAtomicScope::WORKGROUP: - // In WGP mode the waves of a work-group can be executing on either CU of - // the WGP. Therefore we need to invalidate the L0 which is per CU. 
- // Otherwise in CU mode all waves of a work-group are on the same CU, and so - // the L0 does not need to be invalidated. + // GFX12.0: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to invalidate the L0 which is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, and + // so the L0 does not need to be invalidated. + // + // GFX12.5: + // TODO DOCS if (ST.isCuModeEnabled()) return false; @@ -2497,7 +2509,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - // global_wb is only necessary at system scope for gfx120x targets. + // global_wb is only necessary at system scope for GFX12.0; + // it is also necessary at device scope for GFX12.5. // // Emitting it for lower scopes is a slow no-op, so we omit it // for performance. @@ -2507,6 +2520,12 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, .addImm(AMDGPU::CPol::SCOPE_SYS); break; case SIAtomicScope::AGENT: + // TODO DOCS + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + } + break; case SIAtomicScope::WORKGROUP: // No WB necessary, but we still have to wait. 
break; @@ -2569,17 +2588,31 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( } bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); - if (!CPol) - return false; + assert(MI.mayStore() && "Not a Store inst"); + const bool IsRMW = (MI.mayLoad() && MI.mayStore()); + bool Changed = false; + + // GFX12.5 only: xcnt wait is needed before flat and global atomic stores/rmw. + if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) { + MachineBasicBlock &MBB = *MI.getParent(); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0); + Changed = true; + } + + // Remaining fixes do not apply to RMWs. + if (IsRMW) + return Changed; + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); + if (!CPol) // Some vmem operations do not have a scope and are not affected. + return Changed; const unsigned Scope = CPol->getImm() & CPol::SCOPE; // GFX12.0 only: Extra waits needed before system scope stores. 
if (!ST.hasGFX1250Insts()) { if (!Atomic && Scope == CPol::SCOPE_SYS) return insertWaitsBeforeSystemScopeStore(MI); - return false; + return Changed; } // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address @@ -2589,7 +2622,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { (!ST.hasCUStores() || TII->mayAccessScratchThroughFlat(MI))) return setScope(MI, CPol::SCOPE_SE); - return false; + return Changed; } bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, @@ -2778,6 +2811,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; + MachineInstr &RMWMI = *MI; if (MOI.isAtomic()) { const AtomicOrdering Order = MOI.getOrdering(); @@ -2812,6 +2846,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, Position::AFTER); } + Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index a003a46191a87..8012e9e6bc9bc 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1656,6 +1656,11 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } + +let SubtargetPredicate = HasWaitXcnt in { + def S_WAIT_XCNT_soft : SOPP_Pseudo<"s_soft_wait_xcnt", (ins s16imm:$simm16), "$simm16">; +} + // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 481a2540eacb7..e886ea4fc6ac6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1501,6 +1501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1571,6 +1572,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1645,6 +1649,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1715,6 +1720,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1792,6 +1800,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1902,6 +1911,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 4.0, v[0:1] +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1947,6 +1959,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1987,6 +2000,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2031,6 +2047,7 @@ define amdgpu_kernel void 
@flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -2107,6 +2124,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -2190,6 +2208,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2418,6 +2439,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index 5fc9f4a0f8038..4bb2a13d02cc7 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -364,6 +364,7 @@ define i16 @global_one_as_atomic_min_i16(ptr addrspace(1) %ptr, i16 %val) { ; 
GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -406,6 +407,7 @@ define i16 @global_one_as_atomic_umin_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -448,6 +450,7 @@ define i16 @global_one_as_atomic_max_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -490,6 +493,7 @@ define i16 @global_one_as_atomic_umax_i16(ptr addrspace(1) %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1344,6 +1348,7 @@ define i16 @flat_one_as_atomic_min_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: 
flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1386,6 +1391,7 @@ define i16 @flat_one_as_atomic_umin_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1428,6 +1434,7 @@ define i16 @flat_one_as_atomic_max_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 @@ -1470,6 +1477,7 @@ define i16 @flat_one_as_atomic_umax_i16(ptr %ptr, i16 %val) { ; GFX1250-NEXT: v_lshlrev_b32_e32 v5, v3, v5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_or_b32 v6, v7, v4, v5 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[6:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 004d3c0c1cf53..9e3348bbfdef6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -7,6 +7,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -21,6 +23,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -36,6 +40,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -50,6 +56,8 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -65,6 +73,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -81,6 +91,8 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -109,6 +121,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -123,6 +137,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -145,6 +161,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-SDAG-NEXT: s_wait_dscnt 
0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -159,6 +177,8 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -182,6 +202,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -195,6 +217,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -215,6 +239,8 
@@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -228,6 +254,8 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -271,6 +299,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB10_5 ; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -317,6 +347,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -371,6 +403,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB11_5 ; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -420,6 +454,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -469,6 +505,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -506,6 +544,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], 
s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -551,6 +591,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -591,6 +633,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -621,6 +665,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -636,6 +682,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: 
flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -652,6 +700,8 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -665,6 +715,8 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -699,6 +751,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB18_5 ; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -745,6 +799,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: 
s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -799,6 +855,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB19_5 ; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -848,6 +906,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -897,6 +957,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -937,6 +999,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global +; 
GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -985,6 +1049,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1028,6 +1094,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1061,6 +1129,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1076,6 +1146,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: 
flat_sub_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1092,6 +1164,8 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1105,6 +1179,8 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1139,6 +1215,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB26_5 ; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1185,6 +1263,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1239,6 +1319,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB27_5 ; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1288,6 +1370,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1337,6 +1421,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1377,6 +1463,8 @@ define amdgpu_ps void 
@flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1425,6 +1513,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1468,6 +1558,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1501,6 +1593,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1516,6 +1610,8 @@ define 
amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1532,6 +1628,8 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1545,6 +1643,8 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1579,6 +1679,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB34_5 ; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1626,6 +1728,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1681,6 +1785,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB35_5 ; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1731,6 +1837,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1781,6 +1889,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 
v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1822,6 +1932,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1871,6 +1983,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -1915,6 +2029,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -1949,6 +2065,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 
v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1964,6 +2082,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1980,6 +2100,8 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -1993,6 +2115,8 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2027,6 +2151,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB42_5 ; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2074,6 +2200,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2129,6 +2257,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB43_5 ; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2179,6 +2309,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2229,6 +2361,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB44_3: ; 
%atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2270,6 +2404,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2319,6 +2455,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2363,6 +2501,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2397,6 +2537,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: 
flat_xor_saddr_i32_rtn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2412,6 +2554,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2428,6 +2572,8 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2441,6 +2587,8 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV @@ -2475,6 +2623,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-SDAG-NEXT: s_branch .LBB50_5 ; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2522,6 +2672,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2577,6 +2729,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB51_5 ; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2627,6 +2781,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2677,6 +2833,8 @@ define amdgpu_ps 
void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2718,6 +2876,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2767,6 +2927,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV @@ -2811,6 +2973,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV @@ -2873,7 +3037,7 @@ define amdgpu_ps void 
@flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_max_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -2885,7 +3049,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -2914,20 +3078,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -2961,20 
+3122,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -3016,20 +3174,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; 
%atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -3066,20 +3221,17 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -3120,9 +3272,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; 
GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 @@ -3159,9 +3311,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 @@ -3206,9 +3358,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 @@ -3248,9 +3400,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 @@ -3307,7 +3459,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_min_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: 
flat_atomic_min_i32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -3319,7 +3471,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -3348,20 +3500,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -3395,20 +3544,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; 
GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -3450,20 +3596,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -3500,20 +3643,17 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -3554,9 +3694,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 @@ -3593,9 +3733,9 @@ define amdgpu_ps void 
@flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 @@ -3640,9 +3780,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 @@ -3682,9 +3822,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 @@ -3741,7 +3881,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_umax_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -3753,7 +3893,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -3782,20 +3922,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -3829,20 +3966,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 
0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -3884,20 +4018,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -3934,20 +4065,17 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -3988,9 +4116,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 @@ -4027,9 +4155,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; 
GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 @@ -4074,9 +4202,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 @@ -4116,9 +4244,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 @@ -4175,7 +4303,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-LABEL: flat_umin_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -4187,7 +4315,7 @@ define amdgpu_ps void 
@flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 -; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -4216,20 +4344,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -4263,20 +4388,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], 
v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -4318,20 +4440,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo @@ -4368,20 +4487,17 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg 
%sbase, i ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo @@ -4422,9 +4538,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 @@ -4461,9 +4577,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; 
GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 @@ -4508,9 +4624,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 @@ -4550,9 +4666,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 0cb2b0b7df3d2..c68871e7c84fa 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1473,6 +1473,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1513,6 +1514,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1557,6 +1561,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1597,6 +1602,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1673,6 +1681,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -1765,6 +1774,9 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: 
v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1809,6 +1821,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1849,6 +1862,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -1893,6 +1909,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 { ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 @@ -1969,6 +1986,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 
@@ -2063,6 +2081,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) { ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f64 v2, v[0:1], s[0:1] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 @@ -2136,6 +2157,7 @@ define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2275,6 +2297,7 @@ define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2307,6 +2330,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, doub ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2339,6 +2363,7 @@ define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; 
GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 80445f793934b..260ad842fee60 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -78,6 +79,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-LABEL: workgroup_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -145,6 +152,12 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -217,6 +230,12 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -289,6 +308,12 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -359,6 +384,12 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -426,6 +457,12 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -498,6 +535,12 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -570,6 +613,12 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -662,6 +711,13 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -744,6 +800,13 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -842,6 +905,14 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -940,6 +1011,14 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1032,6 +1111,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1114,6 +1200,13 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1212,6 +1305,14 @@ define amdgpu_kernel 
void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1310,6 +1411,14 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1404,6 +1513,13 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1490,6 +1606,13 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1594,6 +1717,14 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1698,6 +1829,14 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1792,6 +1931,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1878,6 +2024,13 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS 
+; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1982,6 +2135,14 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2086,6 +2247,14 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 7a419a5031ba9..767dbc1432242 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck 
--check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -76,6 +77,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -142,6 +148,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -208,6 +218,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -274,6 +288,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -331,6 +349,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -388,6 +410,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -445,6 +471,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -502,6 +532,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -570,6 +604,11 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -636,6 +675,10 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -702,6 +745,10 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -768,6 +815,10 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -825,6 +876,10 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-LABEL: agent_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -882,6 +937,10 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -939,6 +998,10 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -996,6 +1059,10 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1064,6 +1131,11 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1130,6 +1202,10 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1196,6 +1272,10 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1262,6 +1342,10 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret 
void @@ -1319,6 +1403,10 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-LABEL: system_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1376,6 +1464,10 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1433,6 +1525,10 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1490,6 +1586,10 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f1243..af3b7535c990c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: 
llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -65,6 +66,10 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX12-CU-LABEL: singlethread_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") acquire ret void @@ -122,6 +127,10 @@ define amdgpu_kernel void @singlethread_release_fence() { ; GFX12-CU-LABEL: singlethread_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") release ret void @@ -179,6 +188,10 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX12-CU-LABEL: singlethread_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") acq_rel ret void @@ -236,6 +249,10 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX12-CU-LABEL: singlethread_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread") seq_cst ret void @@ -293,6 +310,10 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX12-CU-LABEL: singlethread_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acquire ret void @@ -350,6 +371,10 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX12-CU-LABEL: singlethread_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") release ret void @@ -407,6 +432,10 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: singlethread_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -464,6 +493,10 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: singlethread_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: singlethread_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -521,6 +554,10 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX12-CU-LABEL: wavefront_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") acquire ret void @@ -578,6 +615,10 @@ define amdgpu_kernel void @wavefront_release_fence() { ; GFX12-CU-LABEL: wavefront_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence 
syncscope("wavefront") release ret void @@ -635,6 +676,10 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX12-CU-LABEL: wavefront_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") acq_rel ret void @@ -692,6 +737,10 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX12-CU-LABEL: wavefront_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront") seq_cst ret void @@ -749,6 +798,10 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX12-CU-LABEL: wavefront_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acquire ret void @@ -806,6 +859,10 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX12-CU-LABEL: wavefront_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") release ret void @@ -863,6 +920,10 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: wavefront_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -920,6 +981,10 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: wavefront_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: wavefront_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -998,6 +1063,12 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -1073,6 +1144,12 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -1153,6 +1230,12 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -1233,6 +1316,12 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -1303,6 +1392,12 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; 
GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -1370,6 +1465,12 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -1442,6 +1543,12 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -1514,6 +1621,12 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -1606,6 +1719,13 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acquire ret void @@ -1688,6 +1808,13 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -1786,6 +1913,14 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -1884,6 +2019,14 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -1976,6 +2119,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire ret void @@ -2058,6 +2208,13 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -2156,6 +2313,14 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -2254,6 +2419,14 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -2348,6 +2521,13 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acquire ret void @@ -2434,6 +2614,13 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence release ret void @@ -2538,6 +2725,14 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence acq_rel ret void @@ -2642,6 +2837,14 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence seq_cst ret void @@ -2736,6 +2939,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acquire_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; 
GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acquire ret void @@ -2822,6 +3032,13 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_release_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -2926,6 +3143,14 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_acq_rel_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -3030,6 +3255,14 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: system_one_as_seq_cst_fence: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 07ad8cb0c4a3d..6c70f25a10783 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck 
--check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4 @@ -566,6 +589,18 @@ define amdgpu_kernel void 
@flat_agent_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") acquire, align 4 @@ -789,6 +824,20 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 @@ -939,6 +988,17 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 @@ -1088,6 +1148,17 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 @@ -1261,6 +1332,20 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") release, align 4 @@ -1434,6 +1519,20 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; 
GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 @@ -1583,6 +1682,17 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic @@ -1763,6 +1873,19 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -1936,6 +2059,20 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release @@ -2140,6 +2277,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -2344,6 +2497,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -2552,6 +2721,20 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm 
ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2789,6 +2972,23 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -3026,6 +3226,23 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -3264,6 +3481,21 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3533,6 +3765,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3795,6 +4044,24 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4088,6 +4355,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4381,6 +4668,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4650,6 +4957,23 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
flat_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4919,6 +5243,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5212,6 +5553,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 
0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5505,6 +5866,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5798,6 +6179,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6091,6 +6492,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6384,6 +6805,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6677,6 +7118,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6970,6 +7431,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7263,6 +7744,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7545,6 +8046,23 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7844,6 +8362,24 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8152,6 +8688,26 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8479,6 +9035,27 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm 
ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8806,6 +9383,27 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9109,6 +9707,24 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9408,6 +10024,24 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9735,6 +10369,27 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10062,6 +10717,27 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV 
+; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10389,6 +11065,27 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10716,6 +11413,27 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11039,6 +11757,27 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11366,6 
+12105,27 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11693,6 +12453,27 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12020,6 +12801,27 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12204,6 +13006,17 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4 @@ -12386,6 +13199,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4 @@ -12593,6 +13417,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4 @@ -12826,6 +13663,21 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4 @@ -12976,6 +13828,17 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4 @@ -13125,6 +13988,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4 @@ -13298,6 +14172,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4 @@ -13471,6 +14359,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4 @@ -13620,6 +14522,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic @@ -13796,6 +14709,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") 
acquire @@ -13969,6 +14895,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release @@ -14169,6 +15109,22 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14369,6 +15325,22 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 
; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14587,6 +15559,21 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -14834,6 +15821,24 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15081,6 +16086,24 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") 
seq_cst @@ -15319,6 +16342,21 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15584,6 +16622,23 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr 
%out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15846,6 +16901,24 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16135,6 +17208,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16424,6 +17517,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16689,6 +17802,23 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16954,6 +18084,23 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17243,6 +18390,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17532,6 +18699,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17821,6 +19008,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18110,6 +19317,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18399,6 +19626,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18688,6 +19935,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20244,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19266,6 +20553,26 @@ define 
amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19548,6 +20855,23 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19857,6 +21181,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20165,6 +21508,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20502,6 +21865,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, 
i32 4 @@ -20839,6 +22224,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21152,6 +22559,25 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21461,6 +22887,25 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21798,6 +23243,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22135,6 +23602,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22472,6 +23961,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22809,6 +24320,28 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23142,6 +24675,28 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23479,6 +25034,28 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23816,6 +25393,28 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24153,6 +25752,28 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index a00af8e5b6582..3bd4d39dbb7a5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: @@ -16,6 +17,17 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -55,6 +67,21 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid @@ -80,6 +107,17 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_and_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -100,6 +138,17 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_last_use_and_nontemporal_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 store i32 %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 3c24c36ec547d..abc75816f9485 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4, !nontemporal !0 @@ -555,6 +567,21 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -739,6 +766,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -1095,6 +1133,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1293,6 +1345,17 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b88a10ab24a98..4cc3046e83bf1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -187,6 
+188,17 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4 @@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, 
v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4 @@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4 @@ -883,6 +928,17 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 @@ -1032,6 +1088,17 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_store: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 @@ -1181,6 +1248,17 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 @@ -1330,6 +1408,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 @@ -1479,6 +1568,17 @@ define amdgpu_kernel void 
@flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic @@ -1628,6 +1728,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -1777,6 +1888,17 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release @@ -1926,6 +2048,17 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2075,6 +2208,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2268,6 +2412,19 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -2462,6 +2619,19 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2656,6 +2826,19 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2894,6 +3077,21 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3132,6 +3330,21 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3370,6 +3583,21 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3608,6 +3836,21 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3846,6 +4089,21 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4084,6 +4342,21 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 
0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4322,6 +4595,21 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4560,6 +4848,21 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4798,6 +5101,21 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5036,6 +5354,21 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5274,6 +5607,21 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5512,6 +5860,21 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
%out, i32 4 @@ -5750,6 +6113,21 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5988,6 +6366,21 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6226,6 +6619,21 @@ define amdgpu_kernel void 
@flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6508,6 +6916,23 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr %out, i32 4 @@ -6792,6 +7217,23 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7076,6 +7518,23 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7360,6 +7819,23 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7644,6 +8120,23 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7928,6 +8421,23 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8212,6 +8722,23 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8496,6 +9023,23 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8780,6 +9324,23 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9064,6 +9625,23 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9348,6 +9926,23 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9632,6 +10227,23 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: 
s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9916,6 +10528,23 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10200,6 +10829,23 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10484,6 +11130,23 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10668,6 +11331,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 
v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4 @@ -10850,6 +11524,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4 @@ -11032,6 +11717,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4 @@ -11214,6 +11910,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_load: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -11364,6 +12071,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4 @@ -11513,6 +12231,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out 
syncscope("singlethread-one-as") monotonic, align 4 @@ -11662,6 +12391,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 @@ -11811,6 +12551,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11960,6 +12711,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12109,6 +12871,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12258,6 +13031,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release @@ -12407,6 +13191,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12556,6 +13351,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12749,6 +13555,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12943,6 +13762,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13137,6 +13969,19 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: 
s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13375,6 +14220,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13613,6 +14473,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
+; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13851,6 +14726,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14089,6 +14979,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: 
s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14327,6 +15232,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14565,6 +15485,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, 
i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14803,6 +15738,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15041,6 +15991,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr %out, i32 4 @@ -15279,6 +16244,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15517,6 +16497,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
%out, i32 4 @@ -15755,6 +16750,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15993,6 +17003,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16231,6 
+17256,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16469,6 +17509,21 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16707,6 +17762,21 @@ define 
amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16989,6 +18059,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17273,6 +18360,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17557,6 +18661,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17841,6 +18962,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18125,6 +19263,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18409,6 +19564,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18693,6 +19865,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20166,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19261,6 +20467,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19545,6 +20768,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19829,6 +21069,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20113,6 +21370,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20397,6 +21671,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20681,6 +21972,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20965,6 +22273,23 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 919fc3e8f4e4f..b1d95bf6864e5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 
-mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in monotonic, align 4 @@ -568,6 +591,18 @@ define amdgpu_kernel void 
@flat_system_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in acquire, align 4 @@ -793,6 +828,20 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -943,6 +992,17 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out unordered, align 4 @@ -1092,6 +1152,17 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out monotonic, align 4 @@ -1269,6 +1340,20 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out release, align 4 @@ -1446,6 +1531,20 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 
scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -1595,6 +1694,17 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic @@ -1777,6 +1887,19 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 
v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -1954,6 +2077,20 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in release @@ -2164,6 +2301,22 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: 
%val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -2374,6 +2527,22 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -2584,6 +2753,20 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -2827,6 +3010,23 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -3070,6 +3270,23 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -3308,6 
+3525,21 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3579,6 +3811,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr %out, i32 4 @@ -3845,6 +4094,24 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4144,6 +4411,26 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4443,6 +4730,26 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4714,6 +5021,23 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4985,6 +5309,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5284,6 +5625,26 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5583,6 +5944,26 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr %out, i32 4 @@ -5882,6 +6263,26 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6181,6 +6582,26 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6480,6 +6901,26 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6779,6 +7220,26 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7078,6 +7539,26 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7377,6 +7858,26 @@ define amdgpu_kernel 
void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7659,6 +8160,23 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7960,6 +8478,24 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8272,6 +8808,26 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8605,6 +9161,27 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8938,6 +9515,27 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9243,6 +9841,24 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9544,6 +10160,24 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9877,6 +10511,27 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10210,6 +10865,27 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr %out, i32 4 @@ -10543,6 +11219,27 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10876,6 +11573,27 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11205,6 +11923,27 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11538,6 +12277,27 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11871,6 +12631,27 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12204,6 +12985,27 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12388,6 +13190,17 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4 @@ -12570,6 +13383,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4 @@ -12779,6 +13603,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { 
entry: %val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4 @@ -13014,6 +13851,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4 @@ -13164,6 +14016,17 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4 @@ -13313,6 +14176,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_store: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4 @@ -13490,6 +14364,20 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") release, align 4 @@ -13667,6 +14555,20 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4 @@ -13816,6 +14718,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic @@ -13994,6 +14907,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -14171,6 +15097,20 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: 
flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release @@ -14377,6 +15317,22 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -14583,6 +15539,22 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -14803,6 +15775,21 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -15056,6 +16043,24 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -15309,6 +16314,24 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -15547,6 +16570,21 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15814,6 +16852,23 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16080,6 +17135,24 @@ define amdgpu_kernel void 
@flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16375,6 +17448,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16670,6 +17763,26 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16937,6 +18050,23 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17204,6 +18334,23 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17499,6 +18646,26 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 
0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17794,6 +18961,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18089,6 +19276,26 @@ define amdgpu_kernel void 
@flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18384,6 +19591,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18679,6 +19906,26 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18974,6 +20221,26 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19269,6 +20536,26 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19564,6 +20851,26 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; 
GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19846,6 +21153,23 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20157,6 +21481,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20469,6 +21812,26 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20812,6 +22175,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21155,6 +22540,28 @@ define amdgpu_kernel void 
@flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21470,6 +22877,25 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 
+; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21781,6 +23207,25 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22124,6 +23569,28 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22467,6 +23934,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22810,6 +24299,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23153,6 +24664,28 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23492,6 +25025,28 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23835,6 +25390,28 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24178,6 +25755,28 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24521,6 +26120,28 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index a88e0e217fdb4..e2739debea36a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -143,6 +144,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4 @@ -415,6 +427,21 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -563,6 +590,18 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -831,6 +870,21 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -971,6 +1025,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_volatile_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -1090,6 +1155,19 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 
7c637a20ab47b..cd12f48468777 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4 @@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4 @@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4 @@ -883,6 +928,17 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4 @@ -1032,6 +1088,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4 @@ -1181,6 +1248,17 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4 @@ -1330,6 +1408,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 
v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4 @@ -1479,6 +1568,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic @@ -1628,6 +1728,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: 
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -1777,6 +1888,17 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release @@ -1926,6 +2048,17 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2075,6 +2208,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2268,6 +2412,19 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -2462,6 +2619,19 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 
%in syncscope("wavefront") acq_rel @@ -2656,6 +2826,19 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2894,6 +3077,21 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3132,6 +3330,21 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3370,6 +3583,21 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3608,6 +3836,21 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3846,6 +4089,21 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4084,6 +4342,21 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4322,6 +4595,21 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4560,6 +4848,21 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4798,6 +5101,21 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5036,6 +5354,21 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5274,6 +5607,21 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5512,6 +5860,21 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5750,6 +6113,21 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5988,6 +6366,21 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6226,6 +6619,21 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6508,6 +6916,23 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, 
s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6792,6 +7217,23 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7076,6 +7518,23 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7360,6 +7819,23 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7644,6 +8120,23 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm 
+; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7928,6 +8421,23 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8212,6 +8722,23 @@ 
define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8496,6 +9023,23 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8780,6 +9324,23 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9064,6 +9625,23 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9348,6 +9926,23 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9632,6 +10227,23 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9916,6 +10528,23 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10200,6 +10829,23 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 
+; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10484,6 +11130,23 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10668,6 +11331,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4 @@ -10850,6 +11524,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4 @@ -11032,6 +11717,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr 
%in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4 @@ -11214,6 +11910,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -11364,6 +12071,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4 @@ -11513,6 +12231,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4 @@ -11662,6 +12391,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4 @@ -11811,6 +12551,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11960,6 +12711,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12109,6 +12871,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12258,6 +13031,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release @@ -12407,6 +13191,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12556,6 +13351,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12749,6 +13555,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 
v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12943,6 +13762,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13137,6 +13969,19 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13375,6 +14220,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13613,6 +14473,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; 
kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13851,6 +14726,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14089,6 +14979,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14327,6 +15232,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14565,6 +15485,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14803,6 +15738,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15041,6 +15991,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15279,6 +16244,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15517,6 +16497,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15755,6 +16750,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15993,6 +17003,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16231,6 +17256,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16469,6 +17509,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], 
s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16707,6 +17762,21 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16989,6 +18059,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17273,6 +18360,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17557,6 +18661,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17841,6 +18962,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18125,6 +19263,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 
+; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18409,6 +19564,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18693,6 +19865,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20166,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19261,6 +20467,23 @@ define 
amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19545,6 +20768,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19829,6 +21069,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20113,6 +21370,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20397,6 +21671,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20681,6 +21972,23 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93f..4d22b6abadad7 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4 @@ -563,6 +586,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -776,6 +810,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 @@ -926,6 +973,17 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 @@ -1075,6 +1133,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 @@ -1241,6 +1310,19 
@@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 @@ -1407,6 +1489,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 @@ -1556,6 +1651,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic @@ -1724,6 +1830,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -1890,6 +2008,19 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release @@ -2075,6 +2206,20 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; 
GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2260,6 +2405,20 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2465,6 +2624,19 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2690,6 +2862,21 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2915,6 +3102,21 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 
0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3153,6 +3355,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3410,6 +3627,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3665,6 +3898,23 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3939,6 +4189,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4213,6 +4481,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4470,6 +4756,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 
v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4727,6 +5029,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5001,6 +5319,24 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5275,6 +5611,24 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5549,6 +5903,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5823,6 +6195,24 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr %out, i32 4 @@ -6105,6 +6495,23 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6401,6 +6808,23 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6702,6 +7126,25 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7017,6 +7460,25 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 
+; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7332,6 +7794,25 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7630,6 +8111,23 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7926,6 +8424,23 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8241,6 +8756,25 @@ define amdgpu_kernel void 
@flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8556,6 +9090,25 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8871,6 +9424,25 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9186,6 +9758,25 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9499,6 +10090,25 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9814,6 +10424,25 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10129,6 +10758,25 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10444,6 +11092,25 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10628,6 +11295,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr 
%out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4 @@ -10810,6 +11488,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11000,6 +11689,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4 @@ -11202,6 +11902,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -11352,6 +12065,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4 @@ -11501,6 +12225,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4 @@ -11660,6 +12395,19 
@@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4 @@ -11819,6 +12567,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -11968,6 +12729,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12127,6 +12899,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12286,6 +13070,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in 
syncscope("workgroup-one-as") release @@ -12455,6 +13252,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12624,6 +13435,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12825,6 +13650,19 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; 
+; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13039,6 +13877,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13253,6 +14106,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13491,6 +14359,21 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13739,6 +14622,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13987,6 +14886,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14245,6 +15161,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14503,6 +15437,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14751,6 +15703,22 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14999,6 +15967,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ 
-15257,6 +16241,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15515,6 +16517,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15773,6 +16793,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16031,6 +17069,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16289,6 +17345,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16547,6 +17621,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16805,6 +17897,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17063,6 +18173,24 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17345,6 +18473,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17637,6 +18782,23 @@ define amdgpu_kernel 
void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17931,6 +19093,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18235,6 +19416,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18539,6 +19739,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18833,6 +20052,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19125,6 +20361,23 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19429,6 +20682,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19733,6 +21005,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20037,6 +21328,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20341,6 +21651,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20643,6 +21972,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20947,6 +22295,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21251,6 +22618,25 @@ define 
amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21555,6 +22941,25 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 74a72e04fa4ae..7b8c55aeea80e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void 
@global_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4 @@ -574,6 +597,18 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4 @@ -793,6 +828,20 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -950,6 +999,17 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4 @@ -1106,6 +1166,17 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 @@ -1287,6 +1358,20 @@ 
define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4 @@ -1468,6 +1553,20 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 @@ -1622,6 +1721,17 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: global_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic @@ -1805,6 +1915,19 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -1984,6 +2107,20 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release @@ -2192,6 +2329,22 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -2400,6 +2553,22 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -2598,6 +2767,20 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -2826,6 +3009,23 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, 
s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -3054,6 +3254,23 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3273,6 +3490,21 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3521,6 +3753,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3765,6 +4014,24 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4038,6 +4305,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4311,6 +4598,26 @@ define amdgpu_kernel void 
@global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4559,6 +4866,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4807,6 +5131,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5080,6 +5421,26 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5353,6 +5714,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5626,6 +6007,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; 
%entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5899,6 +6300,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6172,6 +6593,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6445,6 +6886,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6718,6 +7179,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6991,6 +7472,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7240,6 +7741,23 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7507,6 +8025,24 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7783,6 +8319,26 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb 
scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8079,6 +8635,27 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8375,6 +8952,27 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8646,6 +9244,24 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8913,6 +9529,24 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9209,6 +9843,27 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9505,6 +10160,27 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -9801,6 +10477,27 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10097,6 +10794,27 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10389,6 +11107,27 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10685,6 +11424,27 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10981,6 +11741,27 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11277,6 +12058,27 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11463,6 +12265,17 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm 
+; +; GFX1250-CU-LABEL: global_agent_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4 @@ -11647,6 +12460,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4 @@ -11847,6 +12671,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4 @@ -12066,6 +12902,20 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 @@ -12223,6 +13073,17 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4 @@ -12379,6 +13240,17 @@ define amdgpu_kernel void 
@global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4 @@ -12560,6 +13432,20 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4 @@ -12741,6 +13627,20 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4 @@ -12895,6 +13795,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic @@ -13078,6 +13989,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV 
+; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -13257,6 +14181,20 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release @@ -13465,6 +14403,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -13673,6 +14627,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -13871,6 +14841,20 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -14099,6 +15083,23 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14327,6 +15328,23 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, 
v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14546,6 +15564,21 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14794,6 +15827,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15038,6 +16088,24 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15311,6 +16379,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15584,6 +16672,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15832,6 +16940,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16080,6 +17205,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16353,6 +17495,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16626,6 +17788,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16899,6 +18081,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17172,6 +18374,26 @@ define amdgpu_kernel void 
@global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17445,6 +18667,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17718,6 +18960,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17991,6 +19253,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18264,6 +19546,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 
@@ -18513,6 +19815,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18780,6 +20099,24 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19076,6 +20413,27 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19372,6 +20730,27 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19643,6 +21022,24 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19910,6 +21307,24 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20206,6 +21621,27 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20502,6 +21938,27 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20798,6 +22255,27 @@ define amdgpu_kernel void 
@global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21094,6 +22572,27 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21386,6 +22885,27 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21682,6 +23202,27 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: 
global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21978,6 +23519,27 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22274,6 +23836,27 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 5f952b98041f3..48f9a35b1a8b7 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: @@ -14,6 +15,18 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -37,6 +50,21 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff 
+; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid @@ -58,6 +86,17 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_and_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -81,6 +120,21 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_last_use_and_nontemporal_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 16e55058e4fc8..c9125a5b1bb2a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: @@ -189,6 +190,18 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0 @@ -448,6 +461,21 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -633,6 +661,18 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -866,6 +906,20 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1056,6 +1110,17 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 8042d38716107..859069c91aef3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define 
amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4 @@ -558,6 +581,17 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4 @@ -742,6 +776,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4 @@ -899,6 +944,17 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4 @@ -1055,6 +1111,17 
@@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4 @@ -1211,6 +1278,17 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4 @@ -1367,6 +1445,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4 @@ -1521,6 +1610,17 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic @@ -1675,6 +1775,17 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -1829,6 +1940,17 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release @@ -1983,6 +2105,17 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2137,6 +2270,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2319,6 +2463,19 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -2502,6 +2659,19 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2685,6 +2855,19 @@ define amdgpu_kernel 
void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2904,6 +3087,21 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3123,6 +3321,21 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, 
v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3342,6 +3555,21 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3561,6 +3789,21 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3780,6 +4023,21 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3999,6 +4257,21 @@ define amdgpu_kernel void 
@global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4218,6 +4491,21 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4437,6 +4725,21 @@ define 
amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4656,6 +4959,21 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4875,6 
+5193,21 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5094,6 +5427,21 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -5313,6 +5661,21 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5532,6 +5895,21 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr addrspace(1) %out, i32 4 @@ -5751,6 +6129,21 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5970,6 +6363,21 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6219,6 +6627,23 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6470,6 +6895,23 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, 
v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6721,6 +7163,23 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6972,6 +7431,23 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7223,6 +7699,23 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7474,6 +7967,23 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7725,6 +8235,23 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7976,6 +8503,23 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8227,6 +8771,23 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr addrspace(1) %out, i32 4 @@ -8478,6 +9039,23 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8729,6 +9307,23 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 
th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8980,6 +9575,23 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9231,6 +9843,23 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9482,6 +10111,23 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9733,6 +10379,23 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9919,6 +10582,17 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4 @@ -10103,6 +10777,17 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -10287,6 +10972,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4 @@ -10471,6 +11167,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -10628,6 +11335,17 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( 
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4 @@ -10784,6 +11502,17 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -10940,6 +11669,17 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4 @@ -11096,6 +11836,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11250,6 +12001,17 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -11404,6 +12166,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -11558,6 +12331,17 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release @@ -11712,6 +12496,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -11866,6 +12661,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12048,6 +12854,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12231,6 
+13050,19 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12414,6 +13246,19 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12633,6 +13478,21 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12852,6 +13712,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13071,6 +13946,21 @@ define amdgpu_kernel void 
@global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13290,6 +14180,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, 
i32 4 @@ -13509,6 +14414,21 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13728,6 +14648,21 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,6 +14882,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14166,6 +15116,21 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14385,6 +15350,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14604,6 +15584,21 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 
v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,6 +15818,21 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15042,6 +16052,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 
0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15261,6 +16286,21 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15480,6 +16520,21 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15699,6 +16754,21 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15948,6 +17018,23 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 
killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16199,6 +17286,23 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16450,6 +17554,23 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16701,6 +17822,23 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16952,6 +18090,23 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17203,6 +18358,23 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr addrspace(1) %out, i32 4 @@ -17454,6 +18626,23 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17705,6 +18894,23 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, 
v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17956,6 +19162,23 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18207,6 +19430,23 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18458,6 +19698,23 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18709,6 +19966,23 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18960,6 +20234,23 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19211,6 +20502,23 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; 
GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19462,6 +20770,23 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index be148464c156e..28d179b159974 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_load: +; GFX1250-CU: 
; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in monotonic, align 4 @@ -576,6 +599,18 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in acquire, align 4 @@ -797,6 +832,20 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4 @@ -954,6 +1003,17 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out unordered, align 4 @@ -1110,6 +1170,17 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4 @@ -1295,6 +1366,20 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out release, align 4 @@ -1480,6 +1565,20 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -1634,6 +1733,17 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic @@ -1819,6 +1929,19 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2002,6 +2125,20 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release @@ -2216,6 +2353,22 @@ 
define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -2430,6 +2583,22 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -2630,6 +2799,20 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; 
GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2864,6 +3047,23 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -3098,6 +3298,23 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; 
GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -3317,6 +3534,21 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, 
i32 4 @@ -3567,6 +3799,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3815,6 +4064,24 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4094,6 +4361,26 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4373,6 +4660,26 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4623,6 +4930,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4873,6 +5197,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_acquire_cmpxchg: 
+; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5152,6 +5493,26 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -5431,6 +5792,26 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5710,6 +6091,26 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5989,6 +6390,26 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6238,6 +6659,23 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6507,6 +6945,24 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6809,6 +7265,27 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; 
GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7111,6 +7588,27 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7384,6 +7882,24 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7653,6 +8169,24 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7955,6 +8489,27 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8257,6 +8812,27 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8559,6 +9135,27 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8861,6 +9458,27 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9159,6 +9777,27 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9461,6 +10100,27 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 
killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9763,6 +10423,27 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10065,6 +10746,27 @@ define amdgpu_kernel void 
@global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10251,6 +10953,17 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in 
syncscope("one-as") unordered, align 4 @@ -10435,6 +11148,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4 @@ -10637,6 +11361,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4 @@ -10858,6 +11594,20 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4 @@ -11015,6 +11765,17 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4 @@ -11171,6 +11932,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] 
scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4 @@ -11356,6 +12128,20 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4 @@ -11541,6 +12327,20 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4 @@ -11695,6 
+12495,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic @@ -11880,6 +12691,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12063,6 +12887,20 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release @@ -12277,6 +13115,22 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -12491,6 +13345,22 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -12691,6 +13561,20 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12925,6 +13809,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -13159,6 +14060,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -13378,6 +14296,21 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13628,6 +14561,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13876,6 +14826,24 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14155,6 +15123,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14434,6 +15422,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14684,6 +15692,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14934,6 +15959,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15213,6 +16255,26 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15492,6 +16554,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15771,6 +16853,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16050,6 +17152,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16329,6 +17451,26 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16608,6 +17750,26 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
global_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16887,6 +18049,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17166,6 +18348,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17415,6 +18617,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, 
s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17684,6 +18903,24 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17964,6 +19201,26 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18266,6 +19523,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18568,6 +19846,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18841,6 +20140,24 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19110,6 +20427,24 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -19412,6 +20747,27 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19714,6 +21070,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20016,6 +21393,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20318,6 +21716,27 @@ define amdgpu_kernel void 
@global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20616,6 +22035,27 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20918,6 +22358,27 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21220,6 +22681,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: 
global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21522,6 +23004,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: global_wb scope:SCOPE_SYS +; 
GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 8a5c5dda9f79c..e16b40b8606cd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: @@ -146,6 +147,17 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4 @@ -345,6 +357,21 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -501,6 +528,19 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -693,6 +733,21 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: 
s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -838,6 +893,17 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -969,6 +1035,19 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 151ba07a0b531..8dd0f53d7c203 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic 
i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4 @@ -558,6 +581,17 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4 @@ -742,6 +776,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4 @@ -899,6 +944,17 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4 @@ -1055,6 +1111,17 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 @@ -1211,6 +1278,17 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4 @@ -1367,6 +1445,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4 @@ -1521,6 +1610,17 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: 
s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic @@ -1675,6 +1775,17 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -1829,6 +1940,17 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release @@ -1983,6 +2105,17 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2137,6 +2270,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2319,6 +2463,19 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -2502,6 +2659,19 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2685,6 +2855,19 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2904,6 +3087,21 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3123,6 +3321,21 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3342,6 +3555,21 @@ define amdgpu_kernel void 
@global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3561,6 +3789,21 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3780,6 +4023,21 @@ define 
amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3999,6 +4257,21 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4218,6 +4491,21 
@@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4437,6 +4725,21 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4656,6 
+4959,21 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4875,6 +5193,21 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ 
-5094,6 +5427,21 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5313,6 +5661,21 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, 
i32 4 @@ -5532,6 +5895,21 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5751,6 +6129,21 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) 
%out, i32 4 @@ -5970,6 +6363,21 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6219,6 +6627,23 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6470,6 +6895,23 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6721,6 +7163,23 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6972,6 +7431,23 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7223,6 +7699,23 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7474,6 +7967,23 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7725,6 +8235,23 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 
+; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7976,6 +8503,23 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8227,6 +8771,23 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8478,6 +9039,23 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep 
= getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8729,6 +9307,23 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8980,6 +9575,23 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] 
offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9231,6 +9843,23 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9482,6 +10111,23 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: 
def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9733,6 +10379,23 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9919,6 +10582,17 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4 @@ -10103,6 +10777,17 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -10287,6 +10972,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4 @@ -10471,6 +11167,17 @@ define amdgpu_kernel void 
@global_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -10628,6 +11335,17 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4 @@ -10784,6 +11502,17 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -10940,6 +11669,17 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4 @@ -11096,6 +11836,17 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11250,6 +12001,17 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, 
v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -11404,6 +12166,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -11558,6 +12331,17 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; 
GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release @@ -11712,6 +12496,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -11866,6 +12661,17 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12048,6 +12854,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12231,6 +13050,19 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12414,6 +13246,19 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12633,6 +13478,21 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12852,6 +13712,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 
s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13071,6 +13946,21 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13290,6 +14180,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13509,6 +14414,21 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13728,6 +14648,21 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,6 +14882,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14166,6 +15116,21 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
global_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14385,6 +15350,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14604,6 +15584,21 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,6 +15818,21 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15042,6 +16052,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15261,6 +16286,21 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15480,6 +16520,21 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: 
v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15699,6 +16754,21 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15948,6 +17018,23 @@ define amdgpu_kernel void 
@global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16199,6 +17286,23 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 
+; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16450,6 +17554,23 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16701,6 +17822,23 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def 
$vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16952,6 +18090,23 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17203,6 +18358,23 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 
s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17454,6 +18626,23 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17705,6 +18894,23 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm 
+; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17956,6 +19162,23 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(1) %out, i32 4 @@ -18207,6 +19430,23 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18458,6 +19698,23 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], 
s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18709,6 +19966,23 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18960,6 +20234,23 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19211,6 +20502,23 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19462,6 +20770,23 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0e..36732b509c702 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4 @@ -563,6 +586,17 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -764,6 +798,19 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4 @@ -921,6 +968,17 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4 @@ -1077,6 +1135,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4 @@ -1251,6 +1320,19 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 @@ -1425,6 +1507,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4 @@ -1579,6 +1674,17 @@ define amdgpu_kernel void 
@global_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic @@ -1743,6 +1849,18 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -1915,6 +2033,19 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release @@ -2097,6 +2228,20 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2279,6 +2424,20 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; 
GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2466,6 +2625,19 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2674,6 +2846,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr 
addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2882,6 +3069,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3101,6 +3303,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3330,6 +3547,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3567,6 +3800,23 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3814,6 +4064,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4061,6 +4329,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 
def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4290,6 +4576,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4519,6 +4821,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc 
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4766,6 +5084,24 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5013,6 +5349,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5260,6 +5614,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5507,6 +5879,24 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; 
GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5754,6 +6144,24 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6001,6 +6409,24 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6248,6 +6674,24 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6495,6 +6939,24 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6744,6 +7206,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7000,6 +7479,23 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7269,6 +7765,25 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
global_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7545,6 +8060,25 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7821,6 +8355,25 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8079,6 +8632,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed 
$vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8335,6 +8905,23 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8611,6 +9198,25 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8887,6 +9493,25 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9163,6 +9788,25 @@ define amdgpu_kernel void 
@global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9439,6 +10083,25 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9713,6 +10376,25 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9989,6 +10671,25 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, 
s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10265,6 +10966,25 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10541,6 +11261,25 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; 
GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10727,6 +11466,17 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4 @@ -10911,6 +11661,17 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11100,6 +11861,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4 @@ -11297,6 +12069,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 
+; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -11454,6 +12239,17 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4 @@ -11610,6 +12406,17 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -11776,6 +12583,19 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4 @@ -11942,6 +12762,19 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12096,6 +12929,17 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12260,6 +13104,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12424,6 +13280,19 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { 
entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release @@ -12598,6 +13467,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12772,6 +13655,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12959,6 +13856,19 @@ define amdgpu_kernel void 
@global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13159,6 +14069,21 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13359,6 +14284,21 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13578,6 +14518,21 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13807,6 +14762,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, 
v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14036,6 +15007,23 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(1) %out, i32 4 @@ -14275,6 +15263,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14514,6 +15520,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14743,6 +15767,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14972,6 +16012,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15211,6 +16267,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15450,6 +16524,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15689,6 +16781,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15928,6 +17038,24 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16167,6 +17295,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16406,6 +17552,24 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16645,6 +17809,24 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16884,6 +18066,24 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17133,6 +18333,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, 
s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17389,6 +18606,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17650,6 +18884,25 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm 
+; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17918,6 +19171,25 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18186,6 +19458,25 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18444,6 +19735,23 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18700,6 +20008,23 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18968,6 +20293,25 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19236,6 +20580,25 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(1) %out, i32 4 @@ -19504,6 +20867,25 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19772,6 +21154,25 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20038,6 +21439,25 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20306,6 +21726,25 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20574,6 +22013,25 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 
4 @@ -20842,6 +22300,25 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-CU-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-CU-NEXT: s_wait_loadcnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0be..6dde2d824ecaf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void 
@local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") monotonic, align 4 @@ -524,6 +549,18 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") acquire, align 4 @@ -718,6 +755,20 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4 @@ -859,6 +910,16 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") unordered, align 4 @@ -999,6 +1060,16 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4 @@ -1157,6 +1228,18 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4 @@ -1315,6 +1398,18 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, 
align 4 @@ -1455,6 +1550,16 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") monotonic @@ -1611,6 +1716,17 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -1769,6 +1885,18 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; 
GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release @@ -1943,6 +2071,19 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2117,6 +2258,19 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2304,6 +2458,19 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -2510,6 +2677,21 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2716,6 +2898,21 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2883,6 +3080,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3066,6 +3275,19 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3251,6 +3473,20 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3452,6 +3688,21 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3653,6 +3904,21 @@ 
define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3836,6 +4102,19 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4019,6 +4298,19 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4220,6 +4512,21 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4421,6 +4728,21 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4622,6 +4944,21 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4823,6 +5160,21 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5024,6 +5376,21 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5225,6 +5592,21 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
ptr addrspace(3) %out, i32 4 @@ -5426,6 +5808,21 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5627,6 +6024,21 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5836,6 +6248,21 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: 
ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6052,6 +6479,21 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6281,6 +6723,23 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6515,6 +6974,23 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6749,6 +7225,23 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6965,6 +7458,21 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7181,6 +7689,21 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7415,6 +7938,23 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7649,6 +8189,23 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7883,6 +8440,23 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8117,6 +8691,23 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8351,6 +8942,23 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8585,6 +9193,23 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8819,6 +9444,23 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9053,6 +9695,23 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9226,6 +9885,18 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") unordered, align 4 @@ -9397,6 +10068,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load 
atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") monotonic, align 4 @@ -9568,6 +10251,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") acquire, align 4 @@ -9739,6 +10434,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") seq_cst, align 4 @@ -9880,6 +10587,16 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") unordered, align 4 @@ -10020,6 +10737,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") monotonic, align 4 @@ -10160,6 +10887,16 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") release, align 4 @@ -10300,6 +11037,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") seq_cst, align 4 @@ -10440,6 +11187,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") monotonic @@ -10580,6 +11337,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -10720,6 +11487,16 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: 
ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") release @@ -10860,6 +11637,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11000,6 +11787,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, 
i32 %in syncscope("agent-one-as") seq_cst @@ -11182,6 +11979,19 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -11365,6 +12175,19 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11548,6 +12371,19 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -11715,6 +12551,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11882,6 +12730,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, 
s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12049,6 +12909,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12216,6 +13088,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12383,6 +13267,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12550,6 +13446,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12717,6 +13625,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12884,6 +13804,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13051,6 +13983,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13218,6 +14162,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 
v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13385,6 +14341,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13552,6 +14520,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13719,6 +14699,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13886,6 +14878,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14053,6 +15057,18 @@ define amdgpu_kernel void 
@local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14262,6 +15278,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14473,6 +15504,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14684,6 +15730,21 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14895,6 +15956,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15106,6 +16182,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15317,6 +16408,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15528,6 +16634,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15739,6 +16860,21 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15950,6 +17086,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16161,6 +17312,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(3) %out, i32 4 @@ -16372,6 +17538,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16583,6 +17764,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16794,6 +17990,21 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: 
s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17005,6 +18216,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17216,6 +18442,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 78209ee34cad4..689932469d78d 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: @@ -193,6 +194,18 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0 @@ -428,6 +441,22 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-CU-NEXT: s_mov_b32 s2, 2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -597,6 +626,18 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -802,6 +843,22 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; 
GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-CU-NEXT: s_mov_b32 s1, 2 +; GFX1250-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -991,6 +1048,18 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_nontemporal_volatile_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index f84d451f8ecb0..97c80ece2b053 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 
-mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in 
syncscope("singlethread") monotonic, align 4 @@ -519,6 +544,18 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") acquire, align 4 @@ -690,6 +727,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") seq_cst, align 4 @@ -831,6 +880,16 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") unordered, align 4 @@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") monotonic, align 4 @@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") release, align 4 @@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") seq_cst, align 4 @@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") monotonic @@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: 
s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") release @@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2133,6 
+2272,19 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, 
i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7323,6 +7831,21 @@ define amdgpu_kernel void 
@local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") unordered, align 4 @@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") acquire, align 4 @@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_singlethread_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") unordered, align 4 @@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") release, align 4 @@ -9414,6 +10075,16 @@ define amdgpu_kernel void 
@local_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") release @@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11831,6 +12663,18 @@ 
define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; 
GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 
+; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) 
%out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14853,6 +15898,21 @@ define 
amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: 
ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d851..2a09fce2474d2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in monotonic, align 4 @@ -524,6 +549,18 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_system_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in acquire, align 4 @@ -718,6 +755,20 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4 @@ -859,6 +910,16 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out unordered, 
align 4 @@ -999,6 +1060,16 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out monotonic, align 4 @@ -1157,6 +1228,18 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out release, align 4 @@ -1315,6 +1398,18 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; 
GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4 @@ -1455,6 +1550,16 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in monotonic @@ -1611,6 +1716,17 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -1769,6 +1885,18 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release @@ -1943,6 +2071,19 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2117,6 +2258,19 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2304,6 +2458,19 @@ define amdgpu_kernel void 
@local_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -2510,6 +2677,21 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2716,6 +2898,21 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 
s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2883,6 +3080,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3066,6 +3275,19 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; 
GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3251,6 +3473,20 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3452,6 +3688,21 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3653,6 +3904,21 @@ define 
amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3836,6 +4102,19 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4019,6 +4298,19 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX1250-CU: ; 
%bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4220,6 +4512,21 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4421,6 +4728,21 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4622,6 +4944,21 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4823,6 +5160,21 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5024,6 +5376,21 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5225,6 +5592,21 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, ptr addrspace(3) %out, i32 4 @@ -5426,6 +5808,21 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5627,6 +6024,21 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5836,6 +6248,21 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; 
GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6052,6 +6479,21 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6281,6 +6723,23 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6515,6 +6974,23 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6749,6 +7225,23 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; 
GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6965,6 +7458,21 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7181,6 +7689,21 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 
0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7415,6 +7938,23 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7649,6 +8189,23 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7883,6 +8440,23 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8117,6 +8691,23 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8351,6 +8942,23 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8585,6 +9193,23 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8819,6 +9444,23 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9053,6 +9695,23 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9226,6 +9885,18 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") unordered, align 4 @@ -9397,6 +10068,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load 
atomic i32, ptr addrspace(3) %in syncscope("one-as") monotonic, align 4 @@ -9568,6 +10251,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") acquire, align 4 @@ -9739,6 +10434,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") seq_cst, align 4 @@ -9880,6 +10587,16 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") unordered, align 4 @@ -10020,6 +10737,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") monotonic, align 4 @@ -10160,6 +10887,16 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") release, align 4 @@ -10300,6 +11037,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") seq_cst, align 4 @@ -10440,6 +11187,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") monotonic @@ -10580,6 +11337,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -10720,6 +11487,16 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, 
v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") release @@ -10860,6 +11637,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11000,6 +11787,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst 
@@ -11182,6 +11979,19 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -11365,6 +12175,19 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11548,6 +12371,19 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, 
s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -11715,6 +12551,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11882,6 +12730,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12049,6 +12909,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12216,6 +13088,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12383,6 +13267,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12550,6 +13446,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12717,6 +13625,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12884,6 +13804,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13051,6 +13983,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13218,6 +14162,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13385,6 +14341,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13552,6 +14520,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13719,6 +14699,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13886,6 +14878,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14053,6 +15057,18 @@ define amdgpu_kernel void 
@local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14262,6 +15278,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14473,6 +15504,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry 
+; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14684,6 +15730,21 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14895,6 +15956,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; 
GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15106,6 +16182,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15317,6 +16408,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15528,6 +16634,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15739,6 +16860,21 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15950,6 +17086,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16161,6 +17312,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16372,6 +17538,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16583,6 +17764,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16794,6 +17990,21 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( 
; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17005,6 +18216,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17216,6 +18442,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 5e5e3bf83d610..4dc4c51dd2419 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: @@ -141,6 +142,18 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_load_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4 @@ -308,6 +321,22 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_load_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-CU-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-CU-NEXT: s_mov_b32 s2, 2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-CU-NEXT: ds_load_b32 v1, v1 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -429,6 +458,18 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_store_0: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -570,6 +611,22 @@ define amdgpu_kernel void 
@local_volatile_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_store_1: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-CU-NEXT: s_wait_xcnt 0x0 +; GFX1250-CU-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-CU-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-CU-NEXT: s_mov_b32 s1, 2 +; GFX1250-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -698,6 +755,18 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_workgroup_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4 @@ -813,6 +882,18 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_volatile_workgroup_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b24622a48a16b..b8ad75049aff8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, 
ptr addrspace(3) %in syncscope("wavefront") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") monotonic, align 4 @@ -519,6 +544,18 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") acquire, align 4 @@ -690,6 +727,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") seq_cst, align 4 @@ -831,6 +880,16 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") unordered, align 4 @@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4 @@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: 
s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") release, align 4 @@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") seq_cst, align 4 @@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") monotonic @@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-CU-LABEL: local_wavefront_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") release @@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -1951,6 +2080,16 @@ define amdgpu_kernel void 
@local_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -2833,6 +3023,18 @@ define 
amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, 
i32 4 @@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, 
s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr 
addrspace(3) %out, i32 4 @@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: 
s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: 
ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5846,6 
+6249,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; 
GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 
0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_unordered_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") unordered, align 4 @@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_wavefront_one_as_monotonic_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") acquire, align 4 @@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: ds_load_b32 v1, v0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr 
addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_unordered_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") unordered, align 4 @@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 
v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") release, align 4 @@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry 
+; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") release @@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10114,6 +10825,16 @@ define amdgpu_kernel void 
@local_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: 
local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 
; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 
+; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13000,6 +13916,18 @@ define amdgpu_kernel void 
@local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; 
GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; 
GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ 
-15275,6 +16350,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; 
GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; 
GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250-CU: ; %bb.0: ; %entry +; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: ds_store_b32 v0, v1 +; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 94f5aab1eb67d..97ece0f94ccc0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -762,7 +762,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX1250-CU-NEXT: s_load_b32 
s0, s[4:5], 0x4 ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_load_b32 v1, v0 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -1235,7 +1236,8 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_store_b32 v0, v1 ; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1404,7 +1406,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_store_b32 v0, v1 ; GFX1250-CU-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { @@ -1890,7 +1893,8 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { @@ -2075,7 +2079,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-CU-NEXT: 
s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -2261,7 +2266,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -2679,7 +2685,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2899,7 +2906,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3475,7 +3483,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { @@ -3689,7 +3698,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -3904,7 +3914,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -4511,7 +4522,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -4726,7 +4738,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -4941,7 +4954,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -5156,7 +5170,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -5371,7 +5386,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -5586,7 +5602,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -5801,7 +5818,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -6016,7 +6034,8 @@ define amdgpu_kernel 
void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: s_endpgm @@ -6714,7 +6733,8 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 @@ -6964,7 +6984,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7214,7 +7235,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7926,7 +7948,8 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: 
v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8176,7 +8199,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8426,7 +8450,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8676,7 +8701,8 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8926,7 +8952,8 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 
0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9176,7 +9203,8 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9426,7 +9454,8 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -9676,7 +9705,8 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 +; GFX1250-CU-NEXT: s_wait_storecnt 0x0 +; GFX1250-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX1250-CU-NEXT: s_wait_dscnt 0x0 ; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0