Skip to content

Conversation

@Pierre-vh
Copy link
Contributor

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Sep 9, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Patch is 698.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157640.diff

29 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp (-5)
  • (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+12-12)
  • (modified) llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll (+14-14)
  • (modified) llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll (+116-116)
  • (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll (+97-97)
  • (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+12-12)
  • (modified) llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll (+24-24)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll (+46-46)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+5-5)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll (+52-52)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll (+46-46)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+4-4)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll (+51-51)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll (+52-52)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+45-45)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+52-52)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+45-45)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+52-52)
  • (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+52-52)
  • (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+2-2)
  • (modified) llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll (+5-5)
  • (modified) llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll (+1857-1857)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f61c0d8f84b29..0be6a9d09379f 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -2655,11 +2655,6 @@ bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
     return Changed;
   }
 
-  // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
-  // space.
-  if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU)
-    return setScope(MI, CPol::SCOPE_SE);
-
   return Changed;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index e886ea4fc6ac6..7e297f46a780e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -73,7 +73,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -192,7 +192,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -311,7 +311,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -429,7 +429,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -547,7 +547,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -666,7 +666,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -785,7 +785,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -903,7 +903,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -1021,7 +1021,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -1140,7 +1140,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -1259,7 +1259,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
@@ -1377,7 +1377,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
 ; GFX1250:       ; %bb.0: ; %main_body
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
 ; GFX1250-NEXT:    s_endpgm
 main_body:
   %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
index 4bb2a13d02cc7..ef52694910da3 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll
@@ -572,7 +572,7 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB34_5: ; %Flow1
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s1
@@ -634,7 +634,7 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB35_5: ; %Flow1
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s1
@@ -714,7 +714,7 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) {
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
 ; GFX1250-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB38_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -758,7 +758,7 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) {
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
 ; GFX1250-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB39_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -826,7 +826,7 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) {
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
 ; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB42_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -870,7 +870,7 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) {
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
 ; GFX1250-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB43_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1009,7 +1009,7 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_min_i64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB52_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1052,7 +1052,7 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_min_i64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB53_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1095,7 +1095,7 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_i64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB54_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1138,7 +1138,7 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_i64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB55_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1181,7 +1181,7 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_min_u64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB56_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1224,7 +1224,7 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_min_u64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB57_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1267,7 +1267,7 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_u64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB58_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -1310,7 +1310,7 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) {
 ; GFX1250-NEXT:    scratch_load_b64 v[4:5], v6, off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_max_u64 v[0:1], v[4:5], v[2:3]
-; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
+; GFX1250-NEXT:    scratch_store_b64 v6, v[0:1], off
 ; GFX1250-NEXT:  .LBB59_4: ; %atomicrmw.phi
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_or_b32 exec_lo, exec_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 02ead572145f9..752a87ac3cb73 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %a.cvt = fptrunc float %a to bfloat
@@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0
-; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %a.abs = call float @llvm.fabs.f32(float %a)
@@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
 ; GFX1250:       ; %bb.0: ; %entry
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0
-; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %a.neg = fneg float %a
@@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
 ; GFX1250-NEXT:    s_or_b32 vcc_lo, vcc_lo, s0
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %a.cvt = fptrunc double %a to bfloat
@@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %a.neg = fneg double %a
@@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
 ; GFX1250-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT:    flat_store_b16 v[2:3], v0 scope:SCOPE_SE
+; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
 ; GFX1250-NEXT:    s_endpgm
 entry:
   %a.abs = call double @llvm.fabs.f64(double %a)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 1bf37d512f845..f8c2ddf0d7d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -577,7 +577,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
-; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
+; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
@@ -625,7 +625,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
-; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
+; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[4:5], off
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
@@ -761,7 +761,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v4, -1, v0, vcc_lo
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    scratch_load_b64 v[0:1], v4, off
-; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE
+; GFX1250-SDAG-NEXT:    scratch_store_b64 v4, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt 0x0
@@ -812,7 +812,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ; GFX1250-GISEL-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc_lo
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    scratch_load_b64 v[0:1], v2, off
-; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE
+; GFX1250-GISEL-NEXT:    scratch_store_b64 v2, v[4:5], off
 ; GFX1250-GISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
@@ -948,7 +948,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ; GFX1250-SDAG-NEXT:    v_subrev_nc_u32_e32 v4, s0, v0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_cndmask_b32_e32 v0, -1, v4, vcc_lo
-; GFX1250-SDAG-NEXT:    scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE
+; GFX1250-SDAG-NEXT:    scratch_store_b64 v0, v[2:3], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn:
@@ -987,7 +9...
[truncated]

Copy link
Contributor Author

Pierre-vh commented Sep 10, 2025

Merge activity

  • Sep 10, 8:16 AM UTC: A user started a stack merge that includes this pull request via Graphite.
  • Sep 10, 8:23 AM UTC: Graphite rebased this pull request as part of a merge.
  • Sep 10, 9:03 AM UTC: @Pierre-vh merged this pull request with Graphite.

Base automatically changed from users/pierre-vh/rm-cu-stores to main September 10, 2025 08:21
@Pierre-vh Pierre-vh force-pushed the users/pierre-vh/rm-se-scratch-store branch from f1e9273 to 2a7f810 Compare September 10, 2025 08:23
@Pierre-vh Pierre-vh merged commit d6d0f4f into main Sep 10, 2025
9 checks passed
@Pierre-vh Pierre-vh deleted the users/pierre-vh/rm-se-scratch-store branch September 10, 2025 09:04
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants