llvm
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 1 addition & 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Lines changed: 0 additions & 133 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
Lines changed: 2 additions & 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Lines changed: 24 additions & 1 deletion
@@ -216,11 +216,10 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
 // (mostly related to WWM CSR handling) that differentiate it from using
 // a plain `amdgcn.init.exec -1`.
 //
-// Can only be used in functions with the `amdgpu_cs_chain` calling convention.
 // Using this intrinsic without immediately branching on its return value is an
 // error.
 def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
-    IntrHasSideEffects, IntrNoMem, IntrNoDuplicate, IntrConvergent]>;
+    IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
 
 def int_amdgcn_wavefrontsize :
   ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
 
@@ -15677,133 +15677,6 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
   }
 }
 
-static void removeInitWholeWaveBranch(MachineFunction &MF,
-                                      MachineRegisterInfo &MRI,
-                                      const SIInstrInfo *TII) {
-  // Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
-  // EXEC to -1 at SI_END_CF.
-  auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
-    return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
-  });
-  if (IWWIt == MF.begin()->instr_end())
-    return; // We've been here before (GISel runs finalizeLowering twice).
-
-  MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
-  assert(If.getOpcode() == AMDGPU::SI_IF &&
-         "Unexpected user for init.whole.wave result");
-  assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
-         "Expected simple control flow");
-
-  MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
-  MachineBasicBlock *EndBB = EndCf.getParent();
-
-  // Update all the Phis: since we're removing a predecessor, we need to remove
-  // the corresponding pair of operands. However, we can't just drop the value
-  // coming from the 'if' block - that's going to be the value of the inactive
-  // lanes.
-  // %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
-  // should become
-  // %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
-  // %v = v_set_inactive %t, %inactive
-  // Note that usually EndCf will be the first instruction after the phis and as
-  // such will serve as the end of the range when iterating over phis.
-  // Therefore, we shouldn't introduce any new instructions before it.
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  auto AfterEndCf = std::next(EndCf.getIterator());
-  for (auto &Phi : EndBB->phis()) {
-    Register PhiDest = Phi.getOperand(0).getReg();
-    const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
-
-    Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
-    Phi.getOperand(0).setReg(NewPhiDest);
-
-    unsigned InactiveOpIdx = 0;
-    for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
-      if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
-        InactiveOpIdx = I;
-        break;
-      }
-    }
-    assert(InactiveOpIdx != 0 && "Broken phi?");
-
-    // At this point, the register class could be larger than 32 or 64, so we
-    // might have to use more than one V_SET_INACTIVE instruction.
-    unsigned Size = TRI.getRegSizeInBits(*PhiRC);
-    switch (Size) {
-    case 32:
-      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
-              TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
-          .addReg(NewPhiDest)
-          .add(Phi.getOperand(InactiveOpIdx));
-      break;
-    case 64:
-      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
-              TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
-          .addReg(NewPhiDest)
-          .add(Phi.getOperand(InactiveOpIdx));
-      break;
-    default: {
-      // For each 32-bit subregister of the register at InactiveOpIdx, insert
-      // a COPY to a new register, and a V_SET_INACTIVE_B32 using the
-      // corresponding subregisters of PhiDest and NewPhiDest.
-      // FIXME: There has to be a better way to iterate over this...
-      llvm::SmallVector<Register, 16> PhiSubRegs;
-      const unsigned SubRegIndices[] = {
-          AMDGPU::sub0,  AMDGPU::sub1,  AMDGPU::sub2,  AMDGPU::sub3,
-          AMDGPU::sub4,  AMDGPU::sub5,  AMDGPU::sub6,  AMDGPU::sub7,
-          AMDGPU::sub8,  AMDGPU::sub9,  AMDGPU::sub10, AMDGPU::sub11,
-          AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
-          AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
-          AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
-          AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
-          AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31};
-      const unsigned NumSubRegs = Size / 32;
-      assert(sizeof(SubRegIndices) / sizeof(SubRegIndices[0]) >= NumSubRegs &&
-             "Not enough subregister indices");
-      for (unsigned I = 0; I != NumSubRegs; ++I) {
-        unsigned SubRegIdx = SubRegIndices[I];
-        Register InactiveSubReg =
-            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(), TII->get(AMDGPU::COPY),
-                InactiveSubReg)
-            .addReg(Phi.getOperand(InactiveOpIdx).getReg(), 0, SubRegIdx);
-
-        Register AllLanesSubReg =
-            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
-                TII->get(AMDGPU::V_SET_INACTIVE_B32), AllLanesSubReg)
-            .addReg(NewPhiDest, 0, SubRegIdx)
-            .addReg(InactiveSubReg);
-        PhiSubRegs.push_back(AllLanesSubReg);
-      }
-      // Now we need to combine the subregisters into the original register.
-      auto RegSequence = BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
-                                 TII->get(AMDGPU::REG_SEQUENCE), PhiDest);
-      for (unsigned I = 0; I < NumSubRegs; ++I) {
-        RegSequence.addReg(PhiSubRegs[I]);
-        RegSequence.addImm(SubRegIndices[I]);
-      }
-      break;
-    }
-    }
-
-    Phi.removeOperand(InactiveOpIdx + 1);
-    Phi.removeOperand(InactiveOpIdx);
-  }
-  If.getParent()->removeSuccessor(EndBB);
-
-  BuildMI(*EndBB, AfterEndCf, IWWIt->getDebugLoc(),
-          TII->get(MF.getSubtarget<GCNSubtarget>().isWave32()
-                       ? AMDGPU::S_MOV_B32
-                       : AMDGPU::S_MOV_B64),
-          TII->getRegisterInfo().getExec())
-      .addImm(-1);
-
-  EndCf.eraseFromParent();
-  If.eraseFromParent();
-  IWWIt->eraseFromParent();
-}
-
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -15814,12 +15687,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
-  if (Info->hasInitWholeWave()) {
-    assert(Info->isChainFunction() &&
-           "init.whole.wave may only be used in chain functions");
-    removeInitWholeWaveBranch(MF, MRI, TII);
-  }
-
   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
 
@@ -589,6 +589,8 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
   [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
   let Defs = [EXEC];
   let Uses = [EXEC];
+
+  let isConvergent = 1;
 }
 
 // Return for returning shaders to a shader variant epilog.
 
@@ -594,7 +594,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         KillInstrs.push_back(&MI);
         BBI.NeedsLowering = true;
       } else if (Opcode == AMDGPU::SI_INIT_EXEC ||
-                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
+                 Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
+                 Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
         InitExecInstrs.push_back(&MI);
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
@@ -1582,6 +1583,28 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
   MachineBasicBlock *MBB = MI.getParent();
   bool IsWave32 = ST->isWave32();
 
+  if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
+    // Lower SI_INIT_WHOLE_WAVE: an S_OR_SAVEEXEC with an all-ones operand is
+    // inserted at the very start of MI's block, and its result register takes
+    // over the role of the pseudo's result.
+    // TODO: Assert that it's in the entry block
+    Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
+    MachineInstr *SaveExec =
+        BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+                TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
+                                  : AMDGPU::S_OR_SAVEEXEC_B64),
+                EntryExec)
+            .addImm(-1);
+
+    // Replace all uses of MI's destination reg with EntryExec.
+    MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
+
+    // Unregister MI from the LiveIntervals maps while it is still alive:
+    // eraseFromParent() destroys the instruction, so handing MI to LIS
+    // afterwards would be a use of a dead object.
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MI);
+    MI.eraseFromParent();
+
+    // Register the replacement instruction and compute the live interval of
+    // the new virtual register it defines.
+    if (LIS) {
+      LIS->InsertMachineInstrInMaps(*SaveExec);
+      LIS->createAndComputeVirtRegInterval(EntryExec);
+    }
+    return;
+  }
+
   if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
     // This should be before all vector instructions.
     MachineInstr *InitMI =
Original file line number	Diff line number	Diff line change
`@@ -589,6 +589,8 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <`
`589`	`589`	`[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {`
`590`	`590`	`let Defs = [EXEC];`
`591`	`591`	`let Uses = [EXEC];`
	`592`	`+`
	`593`	`+ let isConvergent = 1;`
`592`	`594`	`}`
`593`	`595`
`594`	`596`	`// Return for returning shaders to a shader variant epilog.`