Skip to content

Revert "Reland "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108054)"" #108341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

Merged 1 commit into the target branch on Sep 12, 2024

Conversation

rovka
Copy link
Collaborator

@rovka rovka commented Sep 12, 2024

Reverts #108173

si-init-whole-wave.mir crashes on some buildbots (although it passed both locally with sanitizers enabled and in pre-merge tests). Investigating.

@llvmbot
Copy link
Member

llvmbot commented Sep 12, 2024

@llvm/pr-subscribers-llvm-ir

Author: Diana Picus (rovka)

Changes

Reverts llvm/llvm-project#108173

si-init-whole-wave.mir crashes on some buildbots (although it passed both locally with sanitizers enabled and in pre-merge tests). Investigating.


Patch is 84.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108341.diff

22 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (-10)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (-5)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (-10)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (-5)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (-3)
  • (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+4-8)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (-10)
  • (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (-3)
  • (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+1-29)
  • (removed) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (-1127)
  • (removed) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (-140)
  • (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (-29)
  • (removed) llvm/test/CodeGen/AMDGPU/si-init-whole-wave.mir (-133)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll (-2)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll (-1)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (-1)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (-1)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir (-4)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll (-4)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66d..e20c26eb837875 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -208,16 +208,6 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
    IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
 
-// Sets the function into whole-wave-mode and returns whether the lane was
-// active when entering the function. A branch depending on this return will
-// revert the EXEC mask to what it was when entering the function, thus
-// resulting in a no-op. This pattern is used to optimize branches when function
-// tails need to be run in whole-wave-mode. It may also have other consequences
-// (mostly related to WWM CSR handling) that differentiate it from using
-// a plain `amdgcn.init.exec -1`.
-def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
-    IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
-
 def int_amdgcn_wavefrontsize :
   ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 380dc7d3312f32..0daaf6b6576030 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2738,11 +2738,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     SelectDSBvhStackIntrinsic(N);
     return;
-  case Intrinsic::amdgcn_init_whole_wave:
-    CurDAG->getMachineFunction()
-        .getInfo<SIMachineFunctionInfo>()
-        ->setInitWholeWave();
-    break;
   }
 
   SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 53085d423cefb8..4dfd3f087c1ae4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1772,14 +1772,6 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }
 
-bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
-  MachineFunction *MF = MI.getParent()->getParent();
-  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
-
-  MFInfo->setInitWholeWave();
-  return selectImpl(MI, *CoverageInfo);
-}
-
 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2107,8 +2099,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, true);
   case Intrinsic::amdgcn_ds_consume:
     return selectDSAppendConsume(I, false);
-  case Intrinsic::amdgcn_init_whole_wave:
-    return selectInitWholeWave(I);
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce6..068db5c1c14496 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,7 +120,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
-  bool selectInitWholeWave(MachineInstr &MI) const;
   bool selectSBarrier(MachineInstr &MI) const;
   bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e48b8d34f..7efb7f825348e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -67,8 +67,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;
 
-  bool HasInitWholeWave = false;
-
 public:
   AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
 
@@ -111,9 +109,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return WaveLimiter;
   }
 
-  bool hasInitWholeWave() const { return HasInitWholeWave; }
-  void setInitWholeWave() { HasInitWholeWave = true; }
-
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
     return allocateLDSGlobal(DL, GV, DynLDSAlign);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f2c9619cb8276a..46d98cad963bc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4997,7 +4997,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
       break;
     }
-    case Intrinsic::amdgcn_init_whole_wave:
     case Intrinsic::amdgcn_live_mask: {
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
       break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2cd5fb2b94285c..95c4859674ecc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -329,7 +329,6 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
-def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
 
 foreach intr = AMDGPUMFMAIntrinsics908 in
 def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f860b139945122..55d0de59bc49a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1740,9 +1740,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                                            ? DenormalMode::IEEE
                                            : DenormalMode::PreserveSign;
 
-  if (YamlMFI.HasInitWholeWave)
-    MFI->setInitWholeWave();
-
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dfdc7ad32b00c7..8c951105101d96 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1343,14 +1343,10 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
   // Allocate spill slots for WWM reserved VGPRs.
   // For chain functions, we only need to do this if we have calls to
-  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
-  // chain functions do not return) and the function did not contain a call to
-  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
-  // when entering the function).
-  bool IsChainWithoutRestores =
-      FuncInfo->isChainFunction() &&
-      (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
-  if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
+  // llvm.amdgcn.cs.chain.
+  bool IsChainWithoutCalls =
+      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
+  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9afb29d95abd7d..284be72886ccef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -570,16 +570,6 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   let Defs = [EXEC];
 }
 
-// Sets EXEC to all lanes and returns the previous EXEC.
-def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
-  (outs SReg_1:$dst), (ins),
-  [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
-  let Defs = [EXEC];
-  let Uses = [EXEC];
-
-  let isConvergent = 1;
-}
-
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index aff0b34947d688..4cc60f50978996 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -295,8 +295,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   StringValue SGPRForEXECCopy;
   StringValue LongBranchReservedReg;
 
-  bool HasInitWholeWave = false;
-
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
                         const TargetRegisterInfo &TRI,
@@ -344,7 +342,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
-    YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
   }
 };
 
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ef6c92dfa9b9f2..8cedc34ca40de7 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -586,8 +586,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         KillInstrs.push_back(&MI);
         BBI.NeedsLowering = true;
       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
-                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
-                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
+                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
         InitExecInstrs.push_back(&MI);
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
@@ -1572,33 +1571,6 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
   MachineBasicBlock *MBB = MI.getParent();
   bool IsWave32 = ST->isWave32();
 
-  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
-    assert(MBB == &MBB->getParent()->front() &&
-           "init whole wave not in entry block");
-    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
-    MachineInstr *SaveExec =
-        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
-                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
-                                  : AMDGPU::S_OR_SAVEEXEC_B64),
-                EntryExec)
-            .addImm(-1);
-
-    // Replace all uses of MI's destination reg with EntryExec.
-    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
-
-    if (LIS) {
-      LIS->RemoveMachineInstrFromMaps(MI);
-    }
-
-    MI.eraseFromParent();
-
-    if (LIS) {
-      LIS->InsertMachineInstrInMaps(*SaveExec);
-      LIS->createAndComputeVirtRegInterval(EntryExec);
-    }
-    return;
-  }
-
   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
     // This should be before all vector instructions.
     MachineInstr *InitMI =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
deleted file mode 100644
index 353f4d90cad1f2..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ /dev/null
@@ -1,1127 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
-
-define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
-; GISEL12-LABEL: basic:
-; GISEL12:       ; %bb.0: ; %entry
-; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT:    s_wait_expcnt 0x0
-; GISEL12-NEXT:    s_wait_samplecnt 0x0
-; GISEL12-NEXT:    s_wait_bvhcnt 0x0
-; GISEL12-NEXT:    s_wait_kmcnt 0x0
-; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL12-NEXT:    s_mov_b32 s6, s3
-; GISEL12-NEXT:    s_mov_b32 s7, s4
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL12-NEXT:  ; %bb.1: ; %shader
-; GISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; GISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; GISEL12-NEXT:  ; %bb.2: ; %tail
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GISEL12-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL12-LABEL: basic:
-; DAGISEL12:       ; %bb.0: ; %entry
-; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT:    s_wait_expcnt 0x0
-; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
-; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
-; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; DAGISEL12-NEXT:    s_mov_b32 s7, s4
-; DAGISEL12-NEXT:    s_mov_b32 s6, s3
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; DAGISEL12-NEXT:  ; %bb.1: ; %shader
-; DAGISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; DAGISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; DAGISEL12-NEXT:  ; %bb.2: ; %tail
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; DAGISEL12-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; GISEL10-LABEL: basic:
-; GISEL10:       ; %bb.0: ; %entry
-; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL10-NEXT:    s_mov_b32 s6, s3
-; GISEL10-NEXT:    s_mov_b32 s7, s4
-; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL10-NEXT:  ; %bb.1: ; %shader
-; GISEL10-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; GISEL10-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; GISEL10-NEXT:  ; %bb.2: ; %tail
-; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL10-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL10-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL10-LABEL: basic:
-; DAGISEL10:       ; %bb.0: ; %entry
-; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
-; DAGISEL10-NEXT:    s_mov_b32 s7, s4
-; DAGISEL10-NEXT:    s_mov_b32 s6, s3
-; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
-; DAGISEL10-NEXT:  ; %bb.1: ; %shader
-; DAGISEL10-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; DAGISEL10-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; DAGISEL10-NEXT:  ; %bb.2: ; %tail
-; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL10-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
-; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
-entry:
-  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %entry_exec, label %shader, label %tail
-
-shader:
-  %newx = add i32 %x, 42
-  %oldval = extractvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, 0
-  %newval = add i32 %oldval, 5
-  %newvgpr = insertvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %newval, 0
-
-  br label %tail
-
-tail:
-  %full.x = phi i32 [%x, %entry], [%newx, %shader]
-  %full.vgpr = phi { i32, ptr addrspace(5), i32, i32 } [%vgpr, %entry], [%newvgpr, %shader]
-  %modified.x = add i32 %full.x, 32
-  %vgpr.args = insertvalue { i32, ptr addrspace(5), i32, i32 } %full.vgpr, i32 %modified.x, 3
-  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
-  unreachable
-}
-
-define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
-; GISEL12-LABEL: wwm_in_shader:
-; GISEL12:       ; %bb.0: ; %entry
-; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT:    s_wait_expcnt 0x0
-; GISEL12-NEXT:    s_wait_samplecnt 0x0
-; GISEL12-NEXT:    s_wait_bvhcnt 0x0
-; GISEL12-NEXT:    s_wait_kmcnt 0x0
-; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL12-NEXT:    v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v11, v13
-; GISEL12-NEXT:    s_mov_b32 s6, s3
-; GISEL12-NEXT:    s_mov_b32 s7, s4
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL12-NEXT:  ; %bb.1: ; %shader
-; GISEL12-NEXT:    s_or_saveexec_b32 s4, -1
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
-; GISEL12-NEXT:    v_mov_b32_e32 v0, s8
-; GISEL12-NEXT:    s_mov_b32 exec_lo, s4
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL12-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
-; GISEL12-NEXT:  ; %bb.2: ; %tail
-; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL12-LABEL: wwm_in_shader:
-; DAGISEL12:       ; %bb.0: ; %entry
-; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT:    s_wait_expcnt 0x0
-; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
-; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
-; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; DAGISEL12-NEXT:    v_dual_mov_b32 v11, v13 :: v_dual_mov_b32 v10, v12
-; DAGISEL12-NEXT:    s_mov_b32 s7, s4
-; DAGISEL12-NEXT:    s_mov_b32 s6, s3
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; DAGISEL12-NEXT:  ; %bb.1: ; %shader
-; DAGISEL12-NEXT:    s_or_saveexec_b32 s4, -1
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
-; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
-; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s4
-; DAGISEL12-NEXT:    v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
-; DAGISEL12-NEXT:  ; %bb.2: ; %tail
-; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; GISEL10-LABEL: wwm_in_shader:
-; GISEL10:       ; %bb.0: ; %entry
-; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL10-NEXT:    v_mov_b32_e32 v10, v12
-; GISEL10-NEXT:    v_mov_b32_e32 v11, v13
-; GISEL10-NEXT:    s_mov_b32 s6, s3
-; GISEL10-NEXT:    s_mov_b32 s7, s4
-; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL10-NEXT:  ; %bb.1: ; %shader
-; GISEL10-NEXT:    s_or_saveexec_b32 s4, -1
-; GISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
-; GISEL10-NEXT:    v_mov_b32_e32 v0, s8
-; GISEL10-NEXT:    s_mov_b32 exec_lo, s4
-; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v10
-; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL10-NEXT:  ; %bb.2: ; %tail
-; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL10-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL10-LABEL: wwm_in_shader:
-; DAGISEL10:       ; %bb.0: ; %entry
-; DAGISEL10-N...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Sep 12, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Diana Picus (rovka)

Changes

Reverts llvm/llvm-project#108173

si-init-whole-wave.mir crashes on some buildbots (although it passed both locally with sanitizers enabled and in pre-merge tests). Investigating.


Patch is 84.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108341.diff

22 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (-10)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (-5)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (-10)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h (-5)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (-1)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (-3)
  • (modified) llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (+4-8)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (-10)
  • (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (-3)
  • (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+1-29)
  • (removed) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (-1127)
  • (removed) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (-140)
  • (modified) llvm/test/CodeGen/AMDGPU/pei-amdgpu-cs-chain.mir (-29)
  • (removed) llvm/test/CodeGen/AMDGPU/si-init-whole-wave.mir (-133)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll (-2)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll (-1)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (-1)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (-1)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir (-4)
  • (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll (-4)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4cd32a0502c66d..e20c26eb837875 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -208,16 +208,6 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
    IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
 
-// Sets the function into whole-wave-mode and returns whether the lane was
-// active when entering the function. A branch depending on this return will
-// revert the EXEC mask to what it was when entering the function, thus
-// resulting in a no-op. This pattern is used to optimize branches when function
-// tails need to be run in whole-wave-mode. It may also have other consequences
-// (mostly related to WWM CSR handling) that differentiate it from using
-// a plain `amdgcn.init.exec -1`.
-def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
-    IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
-
 def int_amdgcn_wavefrontsize :
   ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 380dc7d3312f32..0daaf6b6576030 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2738,11 +2738,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     SelectDSBvhStackIntrinsic(N);
     return;
-  case Intrinsic::amdgcn_init_whole_wave:
-    CurDAG->getMachineFunction()
-        .getInfo<SIMachineFunctionInfo>()
-        ->setInitWholeWave();
-    break;
   }
 
   SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 53085d423cefb8..4dfd3f087c1ae4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1772,14 +1772,6 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }
 
-bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
-  MachineFunction *MF = MI.getParent()->getParent();
-  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
-
-  MFInfo->setInitWholeWave();
-  return selectImpl(MI, *CoverageInfo);
-}
-
 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2107,8 +2099,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, true);
   case Intrinsic::amdgcn_ds_consume:
     return selectDSAppendConsume(I, false);
-  case Intrinsic::amdgcn_init_whole_wave:
-    return selectInitWholeWave(I);
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce6..068db5c1c14496 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -120,7 +120,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
-  bool selectInitWholeWave(MachineInstr &MI) const;
   bool selectSBarrier(MachineInstr &MI) const;
   bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index b1022e48b8d34f..7efb7f825348e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -67,8 +67,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;
 
-  bool HasInitWholeWave = false;
-
 public:
   AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
 
@@ -111,9 +109,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return WaveLimiter;
   }
 
-  bool hasInitWholeWave() const { return HasInitWholeWave; }
-  void setInitWholeWave() { HasInitWholeWave = true; }
-
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
     return allocateLDSGlobal(DL, GV, DynLDSAlign);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f2c9619cb8276a..46d98cad963bc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4997,7 +4997,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
       break;
     }
-    case Intrinsic::amdgcn_init_whole_wave:
     case Intrinsic::amdgcn_live_mask: {
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
       break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2cd5fb2b94285c..95c4859674ecc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -329,7 +329,6 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
-def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
 
 foreach intr = AMDGPUMFMAIntrinsics908 in
 def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f860b139945122..55d0de59bc49a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1740,9 +1740,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                                            ? DenormalMode::IEEE
                                            : DenormalMode::PreserveSign;
 
-  if (YamlMFI.HasInitWholeWave)
-    MFI->setInitWholeWave();
-
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index dfdc7ad32b00c7..8c951105101d96 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1343,14 +1343,10 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
   // Allocate spill slots for WWM reserved VGPRs.
   // For chain functions, we only need to do this if we have calls to
-  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
-  // chain functions do not return) and the function did not contain a call to
-  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
-  // when entering the function).
-  bool IsChainWithoutRestores =
-      FuncInfo->isChainFunction() &&
-      (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
-  if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
+  // llvm.amdgcn.cs.chain.
+  bool IsChainWithoutCalls =
+      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
+  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9afb29d95abd7d..284be72886ccef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -570,16 +570,6 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   let Defs = [EXEC];
 }
 
-// Sets EXEC to all lanes and returns the previous EXEC.
-def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
-  (outs SReg_1:$dst), (ins),
-  [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
-  let Defs = [EXEC];
-  let Uses = [EXEC];
-
-  let isConvergent = 1;
-}
-
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index aff0b34947d688..4cc60f50978996 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -295,8 +295,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   StringValue SGPRForEXECCopy;
   StringValue LongBranchReservedReg;
 
-  bool HasInitWholeWave = false;
-
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
                         const TargetRegisterInfo &TRI,
@@ -344,7 +342,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
-    YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
   }
 };
 
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index ef6c92dfa9b9f2..8cedc34ca40de7 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -586,8 +586,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         KillInstrs.push_back(&MI);
         BBI.NeedsLowering = true;
       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
-                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
-                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
+                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
         InitExecInstrs.push_back(&MI);
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
@@ -1572,33 +1571,6 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
   MachineBasicBlock *MBB = MI.getParent();
   bool IsWave32 = ST->isWave32();
 
-  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
-    assert(MBB == &MBB->getParent()->front() &&
-           "init whole wave not in entry block");
-    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
-    MachineInstr *SaveExec =
-        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
-                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
-                                  : AMDGPU::S_OR_SAVEEXEC_B64),
-                EntryExec)
-            .addImm(-1);
-
-    // Replace all uses of MI's destination reg with EntryExec.
-    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
-
-    if (LIS) {
-      LIS->RemoveMachineInstrFromMaps(MI);
-    }
-
-    MI.eraseFromParent();
-
-    if (LIS) {
-      LIS->InsertMachineInstrInMaps(*SaveExec);
-      LIS->createAndComputeVirtRegInterval(EntryExec);
-    }
-    return;
-  }
-
   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
     // This should be before all vector instructions.
     MachineInstr *InitMI =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
deleted file mode 100644
index 353f4d90cad1f2..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ /dev/null
@@ -1,1127 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL12 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL12 %s
-; RUN: llc -global-isel=1 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL10 %s
-; RUN: llc -global-isel=0 -O2 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL10 %s
-
-define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
-; GISEL12-LABEL: basic:
-; GISEL12:       ; %bb.0: ; %entry
-; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT:    s_wait_expcnt 0x0
-; GISEL12-NEXT:    s_wait_samplecnt 0x0
-; GISEL12-NEXT:    s_wait_bvhcnt 0x0
-; GISEL12-NEXT:    s_wait_kmcnt 0x0
-; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL12-NEXT:    s_mov_b32 s6, s3
-; GISEL12-NEXT:    s_mov_b32 s7, s4
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL12-NEXT:  ; %bb.1: ; %shader
-; GISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; GISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; GISEL12-NEXT:  ; %bb.2: ; %tail
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GISEL12-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL12-LABEL: basic:
-; DAGISEL12:       ; %bb.0: ; %entry
-; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT:    s_wait_expcnt 0x0
-; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
-; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
-; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; DAGISEL12-NEXT:    s_mov_b32 s7, s4
-; DAGISEL12-NEXT:    s_mov_b32 s6, s3
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; DAGISEL12-NEXT:  ; %bb.1: ; %shader
-; DAGISEL12-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; DAGISEL12-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; DAGISEL12-NEXT:  ; %bb.2: ; %tail
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; DAGISEL12-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; GISEL10-LABEL: basic:
-; GISEL10:       ; %bb.0: ; %entry
-; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL10-NEXT:    s_mov_b32 s6, s3
-; GISEL10-NEXT:    s_mov_b32 s7, s4
-; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL10-NEXT:  ; %bb.1: ; %shader
-; GISEL10-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; GISEL10-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; GISEL10-NEXT:  ; %bb.2: ; %tail
-; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL10-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL10-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL10-LABEL: basic:
-; DAGISEL10:       ; %bb.0: ; %entry
-; DAGISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL10-NEXT:    s_or_saveexec_b32 s8, -1
-; DAGISEL10-NEXT:    s_mov_b32 s7, s4
-; DAGISEL10-NEXT:    s_mov_b32 s6, s3
-; DAGISEL10-NEXT:    s_and_saveexec_b32 s3, s8
-; DAGISEL10-NEXT:  ; %bb.1: ; %shader
-; DAGISEL10-NEXT:    v_add_nc_u32_e32 v12, 42, v12
-; DAGISEL10-NEXT:    v_add_nc_u32_e32 v8, 5, v8
-; DAGISEL10-NEXT:  ; %bb.2: ; %tail
-; DAGISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL10-NEXT:    v_add_nc_u32_e32 v11, 32, v12
-; DAGISEL10-NEXT:    s_mov_b32 exec_lo, s5
-; DAGISEL10-NEXT:    s_setpc_b64 s[6:7]
-entry:
-  %entry_exec = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %entry_exec, label %shader, label %tail
-
-shader:
-  %newx = add i32 %x, 42
-  %oldval = extractvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, 0
-  %newval = add i32 %oldval, 5
-  %newvgpr = insertvalue { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %newval, 0
-
-  br label %tail
-
-tail:
-  %full.x = phi i32 [%x, %entry], [%newx, %shader]
-  %full.vgpr = phi { i32, ptr addrspace(5), i32, i32 } [%vgpr, %entry], [%newvgpr, %shader]
-  %modified.x = add i32 %full.x, 32
-  %vgpr.args = insertvalue { i32, ptr addrspace(5), i32, i32 } %full.vgpr, i32 %modified.x, 3
-  call void(ptr, i32, <3 x i32>, { i32, ptr addrspace(5), i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain(ptr %callee, i32 %exec, <3 x i32> inreg %sgpr, { i32, ptr addrspace(5), i32, i32 } %vgpr.args, i32 0)
-  unreachable
-}
-
-define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %callee, i32 inreg %exec, { i32, ptr addrspace(5), i32, i32 } %vgpr, i32 %x, i32 %y) {
-; GISEL12-LABEL: wwm_in_shader:
-; GISEL12:       ; %bb.0: ; %entry
-; GISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT:    s_wait_expcnt 0x0
-; GISEL12-NEXT:    s_wait_samplecnt 0x0
-; GISEL12-NEXT:    s_wait_bvhcnt 0x0
-; GISEL12-NEXT:    s_wait_kmcnt 0x0
-; GISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL12-NEXT:    v_dual_mov_b32 v10, v12 :: v_dual_mov_b32 v11, v13
-; GISEL12-NEXT:    s_mov_b32 s6, s3
-; GISEL12-NEXT:    s_mov_b32 s7, s4
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL12-NEXT:  ; %bb.1: ; %shader
-; GISEL12-NEXT:    s_or_saveexec_b32 s4, -1
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
-; GISEL12-NEXT:    v_mov_b32_e32 v0, s8
-; GISEL12-NEXT:    s_mov_b32 exec_lo, s4
-; GISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GISEL12-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10
-; GISEL12-NEXT:  ; %bb.2: ; %tail
-; GISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL12-NEXT:    s_wait_alu 0xfffe
-; GISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL12-LABEL: wwm_in_shader:
-; DAGISEL12:       ; %bb.0: ; %entry
-; DAGISEL12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT:    s_wait_expcnt 0x0
-; DAGISEL12-NEXT:    s_wait_samplecnt 0x0
-; DAGISEL12-NEXT:    s_wait_bvhcnt 0x0
-; DAGISEL12-NEXT:    s_wait_kmcnt 0x0
-; DAGISEL12-NEXT:    s_or_saveexec_b32 s8, -1
-; DAGISEL12-NEXT:    v_dual_mov_b32 v11, v13 :: v_dual_mov_b32 v10, v12
-; DAGISEL12-NEXT:    s_mov_b32 s7, s4
-; DAGISEL12-NEXT:    s_mov_b32 s6, s3
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_and_saveexec_b32 s3, s8
-; DAGISEL12-NEXT:  ; %bb.1: ; %shader
-; DAGISEL12-NEXT:    s_or_saveexec_b32 s4, -1
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
-; DAGISEL12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; DAGISEL12-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
-; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s4
-; DAGISEL12-NEXT:    v_dual_mov_b32 v11, s8 :: v_dual_add_nc_u32 v10, 42, v10
-; DAGISEL12-NEXT:  ; %bb.2: ; %tail
-; DAGISEL12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; DAGISEL12-NEXT:    s_mov_b32 exec_lo, s5
-; DAGISEL12-NEXT:    s_wait_alu 0xfffe
-; DAGISEL12-NEXT:    s_setpc_b64 s[6:7]
-;
-; GISEL10-LABEL: wwm_in_shader:
-; GISEL10:       ; %bb.0: ; %entry
-; GISEL10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL10-NEXT:    s_or_saveexec_b32 s8, -1
-; GISEL10-NEXT:    v_mov_b32_e32 v10, v12
-; GISEL10-NEXT:    v_mov_b32_e32 v11, v13
-; GISEL10-NEXT:    s_mov_b32 s6, s3
-; GISEL10-NEXT:    s_mov_b32 s7, s4
-; GISEL10-NEXT:    s_and_saveexec_b32 s3, s8
-; GISEL10-NEXT:  ; %bb.1: ; %shader
-; GISEL10-NEXT:    s_or_saveexec_b32 s4, -1
-; GISEL10-NEXT:    v_cndmask_b32_e64 v0, 0x47, v10, s4
-; GISEL10-NEXT:    v_cmp_ne_u32_e64 s8, 0, v0
-; GISEL10-NEXT:    v_mov_b32_e32 v0, s8
-; GISEL10-NEXT:    s_mov_b32 exec_lo, s4
-; GISEL10-NEXT:    v_add_nc_u32_e32 v10, 42, v10
-; GISEL10-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL10-NEXT:  ; %bb.2: ; %tail
-; GISEL10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GISEL10-NEXT:    s_mov_b32 exec_lo, s5
-; GISEL10-NEXT:    s_setpc_b64 s[6:7]
-;
-; DAGISEL10-LABEL: wwm_in_shader:
-; DAGISEL10:       ; %bb.0: ; %entry
-; DAGISEL10-N...
[truncated]

@rovka rovka merged commit 7792b4a into main Sep 12, 2024
8 of 11 checks passed
@rovka rovka deleted the revert-108173-init-whole-wave-2 branch September 12, 2024 08:12
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants