[AMDGPU] Classify FLAT instructions as VMEM #137148

ro-i · 2025-04-24T10:05:33Z

Also adapt hazard and wait handling.

llvmbot · 2025-04-24T10:07:50Z

@llvm/pr-subscribers-backend-amdgpu

Author: Robert Imschweiler (ro-i)

Changes

Also adapt hazard and wait handling.

Full diff: https://github.com/llvm/llvm-project/pull/137148.diff

10 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp (+6-6)
(modified) llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp (+2-1)
(modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+20-29)
(modified) llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp (+2-1)
(modified) llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp (+1-1)
(modified) llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp (+2-2)
(modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+7-7)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+3-1)
(modified) llvm/test/CodeGen/AMDGPU/hard-clauses.mir (+15-6)
(modified) llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir (+5-2)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 87c1d2586cce5..c0731f6bbdd32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2420,29 +2420,29 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           TII->isVMEM(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
            MI.mayLoad() &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           TII->isVMEM(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
            MI.mayStore() &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           TII->isVMEM(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
-           TII->isDS(MI))
+           (TII->isDS(MI) || TII->isLDSDMA(MI)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
-           MI.mayLoad() && TII->isDS(MI))
+           MI.mayLoad() && (TII->isDS(MI) || TII->isLDSDMA(MI)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
-           MI.mayStore() && TII->isDS(MI))
+           MI.mayStore() && (TII->isDS(MI) || TII->isLDSDMA(MI)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
index bfdd8cf1bc2b1..a5e5f2912cbdd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -232,7 +232,8 @@ class AMDGPUWaitSGPRHazards {
         State.ActiveFlat = true;
 
       // SMEM or VMEM clears hazards
-      if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
+      // FIXME: adapt to add FLAT without VALU (so !isLDSDMA())?
+      if ((SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI)) || SIInstrInfo::isSMRD(*MI)) {
         State.VCCHazard = HazardState::None;
         State.SALUHazards.reset();
         State.VALUHazards.reset();
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index aaefe27b1324f..50d518e45acf0 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -183,10 +183,7 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   if (ST.hasNoDataDepHazard())
     return NoHazard;
 
-  // FIXME: Should flat be considered vmem?
-  if ((SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI))
-      && checkVMEMHazards(MI) > 0)
+  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
     return HazardType;
 
   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
@@ -202,8 +199,8 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
     return HazardType;
 
   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
-       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
+      checkMAIVALUHazards(MI) > 0)
     return HazardType;
 
   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
@@ -230,7 +227,6 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
     return HazardType;
 
   if ((SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI) ||
        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
     return HazardType;
 
@@ -324,7 +320,7 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   if (ST.hasNoDataDepHazard())
     return WaitStates;
 
-  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
+  if (SIInstrInfo::isVMEM(*MI))
     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
 
   if (SIInstrInfo::isVALU(*MI))
@@ -340,8 +336,8 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
 
   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
-       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
-       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
+       SIInstrInfo::isDS(*MI) || SIInstrInfo::isEXP(*MI)) &&
+      checkMAIVALUHazards(MI) > 0)
     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
 
   if (MI->isInlineAsm())
@@ -370,7 +366,6 @@ unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
     return std::max(WaitStates, checkMAIHazards(MI));
 
   if (SIInstrInfo::isVMEM(*MI) ||
-      SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI))
     return std::max(WaitStates, checkMAILdStHazards(MI));
 
@@ -598,7 +593,7 @@ static bool breaksSMEMSoftClause(MachineInstr *MI) {
 }
 
 static bool breaksVMEMSoftClause(MachineInstr *MI) {
-  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
+  return !SIInstrInfo::isVMEM(*MI);
 }
 
 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
@@ -1250,8 +1245,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
-    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
-        !SIInstrInfo::isFLAT(I))
+    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
       return false;
 
     for (const MachineOperand &Def : MI->defs()) {
@@ -1424,9 +1418,8 @@ static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
   bool HasVmem = false;
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
-      HasLds |= SIInstrInfo::isDS(MI);
-      HasVmem |=
-          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
+      HasLds |= SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI);
+      HasVmem |= SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isLDSDMA(MI);
       if (HasLds && HasVmem)
         return true;
     }
@@ -1448,9 +1441,9 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
   assert(!ST.hasExtendedWaitCounts());
 
   auto IsHazardInst = [](const MachineInstr &MI) {
-    if (SIInstrInfo::isDS(MI))
+    if (SIInstrInfo::isDS(MI) || SIInstrInfo::isLDSDMA(MI))
       return 1;
-    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
+    if (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isLDSDMA(MI))
       return 2;
     return 0;
   };
@@ -1517,8 +1510,8 @@ bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
     if (WaitStates >= NoHazardWaitStates)
       return true;
     // Instructions which cause va_vdst==0 expire hazard
-    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
-           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
+    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
+           SIInstrInfo::isEXP(I);
   };
   auto GetWaitStatesFn = [](const MachineInstr &MI) {
     return SIInstrInfo::isVALU(MI) ? 1 : 0;
@@ -1549,8 +1542,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
   const Register VDSTReg = VDST->getReg();
 
   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
-    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
-        !SIInstrInfo::isDS(I))
+    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
       return false;
     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
   };
@@ -1635,8 +1627,8 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
       return HazardExpired;
 
     // Instructions which cause va_vdst==0 expire hazard
-    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
-        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
+        SIInstrInfo::isEXP(I) ||
         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
       return HazardExpired;
@@ -1772,8 +1764,8 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
       return HazardExpired;
 
     // Instructions which cause va_vdst==0 expire hazard
-    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
-        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
+    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) ||
+        SIInstrInfo::isEXP(I) ||
         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
          I.getOperand(0).getImm() == 0x0fff))
       return HazardExpired;
@@ -2003,7 +1995,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
     return 0;
 
   auto IsHazardFn = [](const MachineInstr &I) {
-    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
+    if (!SIInstrInfo::isVMEM(I))
       return false;
     return SIInstrInfo::isFPAtomic(I);
   };
@@ -2626,7 +2618,6 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
   int WaitStatesNeeded = 0;
 
   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
-               SIInstrInfo::isFLAT(*MI) ||
                SIInstrInfo::isDS(*MI);
   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
   bool IsVALU = SIInstrInfo::isVALU(*MI);
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 4802ed4bb53df..f00c95f71f467 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -303,7 +303,8 @@ void AMDGPUCustomBehaviour::generateWaitCntInfo() {
 bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
   return MCID.TSFlags & SIInstrFlags::MUBUF ||
          MCID.TSFlags & SIInstrFlags::MTBUF ||
-         MCID.TSFlags & SIInstrFlags::MIMG;
+         MCID.TSFlags & SIInstrFlags::MIMG ||
+         MCID.TSFlags & SIInstrFlags::FLAT;
 }
 
 // taken from SIInstrInfo::hasModifiersSet()
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index bbc0280aed42e..7524747833468 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -100,7 +100,7 @@ FunctionPass *llvm::createSIFormMemoryClausesLegacyPass() {
 }
 
 static bool isVMEMClauseInst(const MachineInstr &MI) {
-  return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
+  return SIInstrInfo::isVMEM(MI);
 }
 
 static bool isSMEMClauseInst(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 88ff04d55629c..fd8023b3455c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -97,7 +97,7 @@ class SIInsertHardClauses {
   HardClauseType getHardClauseType(const MachineInstr &MI) {
     if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) {
       if (ST->getGeneration() == AMDGPUSubtarget::GFX10) {
-        if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+        if (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isLDSDMA(MI)) {
           if (ST->hasNSAClauseBug()) {
             const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
             if (Info && Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA)
@@ -121,7 +121,7 @@ class SIInsertHardClauses {
                                               : HARDCLAUSE_MIMG_LOAD
                               : HARDCLAUSE_MIMG_STORE;
         }
-        if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI)) {
+        if (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isLDSDMA(MI)) {
           return MI.mayLoad() ? MI.mayStore() ? HARDCLAUSE_VMEM_ATOMIC
                                               : HARDCLAUSE_VMEM_LOAD
                               : HARDCLAUSE_VMEM_STORE;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8848eebdeb6b3..7ccab028d0f1a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -168,8 +168,8 @@ static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
     AMDGPU::S_WAIT_KMCNT};
 
 static bool updateVMCntOnly(const MachineInstr &Inst) {
-  return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
-         SIInstrInfo::isFLATScratch(Inst);
+  return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
+         SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
 }
 
 #ifndef NDEBUG
@@ -695,14 +695,14 @@ class SIInsertWaitcnts {
 #endif // NDEBUG
   }
 
-  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
-  // FLAT instruction.
+  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
+  // instruction.
   WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
     // Maps VMEM access types to their corresponding WaitEventType.
     static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
         VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
 
-    assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
+    assert(SIInstrInfo::isVMEM(Inst));
     // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
     // these should use VM_CNT.
     if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
@@ -2454,8 +2454,8 @@ bool SIInsertWaitcnts::isPreheaderToFlush(
 }
 
 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
-  return SIInstrInfo::isVMEM(MI) ||
-         (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
+  return (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+         SIInstrInfo::isVMEM(MI);
 }
 
 // Return true if it is better to flush the vmcnt counter in the preheader of
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a3a54659d299a..30de92ae83532 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -449,7 +449,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   static bool isVMEM(const MachineInstr &MI) {
-    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI);
+    if (isFLAT(MI))
+      assert(usesVM_CNT(MI) && "oh no");
+    return isMUBUF(MI) || isMTBUF(MI) || isImage(MI) || isFLAT(MI);
   }
 
   bool isVMEM(uint16_t Opcode) const {
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
index 44b988a7121c7..bdb496f378548 100644
--- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir
@@ -630,20 +630,29 @@ body: |
     ; CHECK-LABEL: name: flat_global_load
     ; CHECK: liveins: $vgpr0_vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; CHECK-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+    ; CHECK-NEXT:   S_CLAUSE 1
+    ; CHECK-NEXT:   $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT:   $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: }
     ;
     ; GFX11-LABEL: name: flat_global_load
     ; GFX11: liveins: $vgpr0_vgpr1
     ; GFX11-NEXT: {{  $}}
-    ; GFX11-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; GFX11-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+    ; GFX11-NEXT:   S_CLAUSE 1
+    ; GFX11-NEXT:   $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT:   $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; GFX11-NEXT: }
     ;
     ; GFX12-LABEL: name: flat_global_load
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
-    ; GFX12-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
-    ; GFX12-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; GFX12-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+    ; GFX12-NEXT:   S_CLAUSE 1
+    ; GFX12-NEXT:   $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GFX12-NEXT:   $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; GFX12-NEXT: }
     $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
     $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
index 86e657093b5b2..245abf03811d0 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
@@ -269,11 +269,14 @@ body:            |
     S_ENDPGM 0
 ...
 
-# GCN-LABEL: name: no_hazard_lds_branch_flat
+# FLAT_* instructions are "based on per-thread address (VGPR), can load/store:
+# global memory, LDS or scratch memory" (RDNA4 ISA)
+# GCN-LABEL: name: hazard_lds_branch_flat
 # GCN:      bb.1:
+# GFX10-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
 # GCN-NEXT: FLAT_LOAD_DWORD
 ---
-name:            no_hazard_lds_branch_flat
+name:            hazard_lds_branch_flat
 body:            |
   bb.0:
     successors: %bb.1

llvm/lib/Target/AMDGPU/SIInstrInfo.h

arsenm · 2025-04-24T10:13:31Z

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+  return (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
+         SIInstrInfo::isVMEM(MI);


This amounts to just isVMEM. I'm not sure why this is bothering to specially handle the case where a flat instruction is statically known to only access LDS. I think the only way that would happen is if we had a volatile flat access to an LDS variable, which is mildly useful.

This should probably be something like `if (isFLAT()) return mayAccessVMEMThroughFlat(); else // other non-flat cases`

done. But are you sure it's a good idea to list the elements of isVMEM separately?

github-actions · 2025-04-24T10:13:54Z

✅ With the latest revision this PR passed the C/C++ code formatter.

jayfoad · 2025-04-24T10:14:33Z

If we're going to do this then it should be done as an NFC change, so there should be no changes to lit tests.

If you discover anything that's currently broken due to isVMEM not including FLAT, then that should be fixed first in a separate patch.

jayfoad · 2025-04-24T10:21:07Z

llvm/test/CodeGen/AMDGPU/hard-clauses.mir

+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+    ; CHECK-NEXT:   S_CLAUSE 1
+    ; CHECK-NEXT:   $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT:   $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 4, 0, implicit $exec, implicit $flat_scr
+    ; CHECK-NEXT: }


This is bad. FLAT and GLOBAL instructions can't be mixed in a clause (at least by GFX10 rules).

Ah ok, I thought that it's no problem here since they both access the same address and thus the FLAT_LOAD is actually a GLOBAL_LOAD, too. (Or am I mistaken?)

It is a problem. There are rules for what types of instruction can be claused together, and this pass has to respect the rules. That is why I think this patch should be NFC. Any behavioral changes can be discussed separately, in separate PRs, to see if they are OK or not.

Yup, I already changed this PR to NFC. Just wanted to verify my understanding of this test

arsenm · 2025-04-24T14:13:13Z

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

+      HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
+                 SIInstrInfo::isSegmentSpecificFLAT(MI);


This looks wrong, it's ignoring FLAT_ instructions

This is addressed by #137170 because otherwise this PR wouldn't be NFC anymore

jayfoad · 2025-05-06T10:26:36Z

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

@@ -2420,29 +2420,27 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
-           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
+           TII->isVMEM(MI))


Am I right in thinking that the isDS test was redundant, since no instructions are both FLAT and DS? @kerbowa

Otherwise, it should check for !isLDSDMA if we specifically want to avoid DS accesses, I think

jayfoad · 2025-05-06T10:27:52Z

llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
-           TII->isDS(MI))
+           (TII->isDS(MI) || TII->isLDSDMA(MI)))


How is this change related to changing the definition of isVMEM??

Hm, it's not. I think it was just bothering me as I was going through the instruction detection. I should probably move this to another PR or just let it be.

jayfoad · 2025-05-06T10:34:12Z

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+  return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
+         SIInstrInfo::isImage(MI);


Simpler:

Suggested change (replace the following):

return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
       SIInstrInfo::isImage(MI);

with:

return SIInstrInfo::isVMEM(MI);

I think Matt wanted me to list them separately: #137148 (comment)

That's not how I interpreted Matt's comment, but it's not important anyway.

Ah ok, maybe I was mistaken. I'd like to get it right - @arsenm what did you mean by "This should probably be something like if (isFLAT()) return mayAccessVMEMThroughFlat(); else // other non-flat cases"?

I mean the special case handling where it's looking at the memory operands. mayAccessVMEMThroughFlat can only improve a very niche case. To handle that, treat isFLAT as the special case and then the rest doesn't matter, it's the vmem except flat

Alright, thanks for the clarification, fixed it :)

jayfoad

LGTM

jayfoad · 2025-05-06T13:06:59Z

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

+  return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
+         SIInstrInfo::isImage(MI);


That's not how I interpreted Matt's comment, but it's not important anyway.

ro-i · 2025-05-07T07:21:18Z

merged. thanks for your reviews!

Also adapt hazard and wait handling.

ro-i requested a review from arsenm April 24, 2025 10:05

[AMDGPU] Classify FLAT instructions as VMEM

15c1d84

Also adapt hazard and wait handling.

ro-i force-pushed the flat-isvmem branch from 017f7bf to 15c1d84 Compare April 24, 2025 10:06

llvmbot added the backend:AMDGPU label Apr 24, 2025

arsenm reviewed Apr 24, 2025

View reviewed changes

jayfoad reviewed Apr 24, 2025

View reviewed changes

implement feedback; convert to NFC; fix formatting

5f3f7d2

This was referenced Apr 24, 2025

[AMDGPU] Consider FLAT instructions for VMEM hazard detection #137170

Open

[AMDGPU] IGLP: Fixes for VMEM load detection and unsigned int handling #135090

Open

arsenm reviewed Apr 24, 2025

View reviewed changes

arsenm approved these changes May 5, 2025

View reviewed changes

jayfoad reviewed May 6, 2025

View reviewed changes

fallback to previous DS detection for IGLP

5beaf42

jayfoad approved these changes May 6, 2025

View reviewed changes

fix misunderstanding

4a5a041

jayfoad approved these changes May 6, 2025

View reviewed changes

ro-i merged commit e55172f into llvm:main May 7, 2025
11 checks passed

GeorgeARM pushed a commit to GeorgeARM/llvm-project that referenced this pull request May 7, 2025

[AMDGPU] Classify FLAT instructions as VMEM (llvm#137148)

a3fddd9

Also adapt hazard and wait handling.

jayfoad mentioned this pull request Jun 25, 2025

[AMDGPU][GFX1250] Insert S_WAIT_XCNT for SMEM and VMEM load-stores #145566

Merged

		return (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
		SIInstrInfo::isVMEM(MI);

		HasVmem |= (SIInstrInfo::isVMEM(MI) && !SIInstrInfo::isFLAT(MI)) ||
		SIInstrInfo::isSegmentSpecificFLAT(MI);

		return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
		SIInstrInfo::isImage(MI);

	Suggested change (replace the following):
	return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
	SIInstrInfo::isImage(MI);
	with:
	return SIInstrInfo::isVMEM(MI);

[AMDGPU] Classify FLAT instructions as VMEM #137148

[AMDGPU] Classify FLAT instructions as VMEM #137148

Uh oh!

Conversation

ro-i commented Apr 24, 2025

Uh oh!

llvmbot commented Apr 24, 2025

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Apr 24, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jayfoad commented Apr 24, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jayfoad left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

ro-i commented May 7, 2025

Uh oh!

Uh oh!

github-actions bot commented Apr 24, 2025 •

edited

Loading