Skip to content

release/20.x: AMDGPU: Handle gfx950 XDL-write-VGPR-VALU-Mem-Exp wait state change (#126727) #126776

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 11, 2025

Conversation

llvmbot
Copy link
Member

@llvmbot llvmbot commented Feb 11, 2025

Backport a2263eb

Requested by: @arsenm

@llvmbot llvmbot added this to the LLVM 20.X Release milestone Feb 11, 2025
@llvmbot
Copy link
Member Author

llvmbot commented Feb 11, 2025

@arsenm What do you think about merging this PR to the release branch?

@llvmbot
Copy link
Member Author

llvmbot commented Feb 11, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (llvmbot)

Changes

Backport a2263eb

Requested by: @arsenm


Patch is 159.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126776.diff

7 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+10-7)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+6-6)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+36-36)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+95-95)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+117-117)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+101-101)
  • (modified) llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir (+24-12)
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 873d18e30a430..b40958073a092 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2611,12 +2611,14 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
   return NumPasses + 3;
 }
 
-static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
-  // 2 pass -> 5
-  // 4 pass -> 7
-  // 8 pass -> 11
-  // 16 pass -> 19
-  return NumPasses + 3;
+static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
+                                                              bool IsGFX950) {
+  // xdl def cycles | gfx940 | gfx950
+  // 2 pass         |  5        5
+  // 4 pass         |  7        8
+  // 8 pass         |  11       12
+  // 16 pass        |  19       20
+  return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
 }
 
 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
@@ -2767,7 +2769,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
       } else if (ST.hasGFX940Insts()) {
         NeedWaitStates =
             isXDL(ST, *MFMA)
-                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
+                ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
+                      NumPasses, ST.hasGFX950Insts())
                 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
                       NumPasses);
       } else {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 8d380516df8b5..452033f332659 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
 ; GCN-NEXT:    v_mov_b32_e32 v9, s17
 ; GCN-NEXT:    v_mov_b32_e32 v10, s18
 ; GCN-NEXT:    v_mov_b32_e32 v11, s19
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 4
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
 ; GCN-NEXT:    v_mov_b32_e32 v9, s17
 ; GCN-NEXT:    v_mov_b32_e32 v10, s18
 ; GCN-NEXT:    v_mov_b32_e32 v11, s19
-; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    s_nop 4
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -417,7 +417,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -459,7 +459,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GCN-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 44cb4e803ffad..4628a9c15391b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -19,7 +19,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -39,7 +39,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -67,7 +67,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
@@ -114,7 +114,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -135,7 +135,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
@@ -186,7 +186,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 4
 ; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -253,7 +253,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -316,7 +316,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    s_nop 4
 ; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -383,7 +383,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -430,7 +430,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -475,7 +475,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -776,7 +776,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -813,7 +813,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
 ; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -855,7 +855,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -892,7 +892,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 ; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -919,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -939,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -971,7 +971,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -992,7 +992,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v8, a[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1043,7 +1043,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GISEL-NEXT:    s_nop 5
+; GISEL-NEXT:    s_nop 6
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1)
@@ -1097,7 +1097,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
@@ -1169,7 +1169,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -1233,7 +1233,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
@@ -1305,7 +1305,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    s_nop 4
 ; GISEL-NEXT:    global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1
@@ -1352,7 +1352,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1397,7 +1397,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    s_nop 3
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1717,7 +1717,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -1754,7 +1754,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -1801,7 +1801,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -1838,7 +1838,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
@@ -1865,7 +1865,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 6
+; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
@@ -1913,7 +1913,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; GCN-NEXT:    v_accvgpr_write_b...
[truncated]

@tstellar tstellar merged commit f874fac into llvm:release/20.x Feb 11, 2025
7 of 10 checks passed
Copy link

@arsenm (or anyone else): if you would like to add a note about this fix to the release notes (completely optional), please reply to this comment with a one- or two-sentence description of the fix. When you are done, please add the release:note label to this PR.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
Development

Successfully merging this pull request may close these issues.

3 participants