[AMDGPU] Come back patch for the 'Assign register class for cross blo…

…ck values according to the divergence.' Detailed description: After https://reviews.llvm.org/D59990 submit several issues were discovered. Changes in common code were preserved but AMDGPU specific part was reverted to keep the backend working correctly. Discovered issues were addressed in the following commits: https://reviews.llvm.org/D67662 https://reviews.llvm.org/D67101 https://reviews.llvm.org/D63953 https://reviews.llvm.org/D63731 This change brings back AMDGPU specific changes. Reviewed by: rampitec, arsenm Differential Revision: https://reviews.llvm.org/D68635 llvm-svn: 374767
jaebaek · Oct 14, 2019 · c4d256a · c4d256a
1 parent d34822a
commit c4d256a
Show file tree

Hide file tree

Showing 46 changed files with 562 additions and 440 deletions.
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
 public:
   static char ID;
 
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
   SIFixSGPRCopies() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  void processPHINode(MachineInstr &MI);
+
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -313,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
   return true;
 }
 
-static bool phiHasVGPROperands(const MachineInstr &PHI,
-                               const MachineRegisterInfo &MRI,
-                               const SIRegisterInfo *TRI,
-                               const SIInstrInfo *TII) {
-  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
-    Register Reg = PHI.getOperand(i).getReg();
-    if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
-      return true;
-  }
-  return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
-                           const MachineRegisterInfo &MRI,
-                           SmallSet<unsigned, 8> &Visited) {
-  for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
-    Register Reg = PHI.getOperand(i).getReg();
-    if (Visited.count(Reg))
-      continue;
-
-    Visited.insert(Reg);
-
-    MachineInstr *DefInstr = MRI.getVRegDef(Reg);
-    switch (DefInstr->getOpcode()) {
-    default:
-      break;
-    case AMDGPU::SI_IF_BREAK:
-      return true;
-    case AMDGPU::PHI:
-      if (phiHasBreakDef(*DefInstr, MRI, Visited))
-        return true;
-    }
-  }
-  return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
-                                          const TargetRegisterInfo &TRI) {
-  for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
-       E = MBB.end(); I != E; ++I) {
-    if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
-      return true;
-  }
-  return false;
-}
-
 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                     const MachineInstr *MoveImm,
                                     const SIInstrInfo *TII,
@@ -420,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
   return false;
 }
 
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
-                                        const TargetRegisterInfo *TRI) {
-  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
-           return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
 // Checks if there is potential path From instruction To instruction.
 // If CutOff is specified and it sits in between of that path we ignore
 // a higher portion of the path and report it is not reachable.
@@ -633,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
 
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-  const SIInstrInfo *TII = ST.getInstrInfo();
+  MRI = &MF.getRegInfo();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
 
   SmallVector<MachineInstr *, 16> Worklist;
@@ -657,7 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         Register DstReg = MI.getOperand(0).getReg();
 
         const TargetRegisterClass *SrcRC, *DstRC;
-        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
 
         if (!Register::isVirtualRegister(DstReg)) {
           // If the destination register is a physical register there isn't
@@ -666,7 +620,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
           // the first lane. Insert a readfirstlane and hope for the best.
           if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
             Register TmpReg
-              = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
             BuildMI(MBB, MI, MI.getDebugLoc(),
                     TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
@@ -684,7 +638,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
             break;
           }
 
-          MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
+          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
           unsigned SMovOp;
           int64_t Imm;
           // If we are just copying an immediate, we can replace the copy with
@@ -703,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         break;
       }
       case AMDGPU::PHI: {
-        Register Reg = MI.getOperand(0).getReg();
-        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
-          break;
-
-        // We don't need to fix the PHI if the common dominator of the
-        // two incoming blocks terminates with a uniform branch.
-        bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
-        if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
-          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
-          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
-          if (!predsHasDivergentTerminator(MBB0, TRI) &&
-              !predsHasDivergentTerminator(MBB1, TRI)) {
-            LLVM_DEBUG(dbgs()
-                       << "Not fixing PHI for uniform branch: " << MI << '\n');
-            break;
-          }
-        }
-
-        // If a PHI node defines an SGPR and any of its operands are VGPRs,
-        // then we need to move it to the VALU.
-        //
-        // Also, if a PHI node defines an SGPR and has all SGPR operands
-        // we must move it to the VALU, because the SGPR operands will
-        // all end up being assigned the same register, which means
-        // there is a potential for a conflict if different threads take
-        // different control flow paths.
-        //
-        // For Example:
-        //
-        // sgpr0 = def;
-        // ...
-        // sgpr1 = def;
-        // ...
-        // sgpr2 = PHI sgpr0, sgpr1
-        // use sgpr2;
-        //
-        // Will Become:
-        //
-        // sgpr2 = def;
-        // ...
-        // sgpr2 = def;
-        // ...
-        // use sgpr2
-        //
-        // The one exception to this rule is when one of the operands
-        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
-        // instruction.  In this case, there we know the program will
-        // never enter the second block (the loop) without entering
-        // the first block (where the condition is computed), so there
-        // is no chance for values to be over-written.
-
-        SmallSet<unsigned, 8> Visited;
-        if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
-          LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-          TII->moveToVALU(MI, MDT);
-        }
-
+        processPHINode(MI);
         break;
       }
       case AMDGPU::REG_SEQUENCE:
         if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
             !hasVectorOperands(MI, TRI)) {
-          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
           continue;
         }
 
@@ -776,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         break;
       case AMDGPU::INSERT_SUBREG: {
         const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
-        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
-        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
-        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
+        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
+        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
         if (TRI->isSGPRClass(DstRC) &&
             (TRI->hasVectorRegisters(Src0RC) ||
              TRI->hasVectorRegisters(Src1RC))) {
@@ -792,7 +689,78 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   }
 
   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
-    hoistAndMergeSGPRInits(AMDGPU::M0, MRI, TRI, *MDT, TII);
+    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
   return true;
 }
+
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+  unsigned numVGPRUses = 0;
+  SetVector<const MachineInstr *> worklist;
+  worklist.insert(&MI);
+  while (!worklist.empty()) {
+    const MachineInstr *Instr = worklist.pop_back_val();
+    unsigned Reg = Instr->getOperand(0).getReg();
+    for (const auto &Use : MRI->use_operands(Reg)) {
+      const MachineInstr *UseMI = Use.getParent();
+      if (UseMI->isCopy() || UseMI->isRegSequence()) {
+        if (UseMI->isCopy() &&
+          UseMI->getOperand(0).getReg().isPhysical() &&
+          !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
+          numVGPRUses++;
+        }
+        worklist.insert(UseMI);
+        continue;
+      }
+
+      if (UseMI->isPHI()) {
+        const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
+        if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
+          UseRC != &AMDGPU::VReg_1RegClass)
+          numVGPRUses++;
+        continue;
+      }
+
+      const TargetRegisterClass *OpRC =
+        TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
+      if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+        OpRC != &AMDGPU::VS_64RegClass) {
+        numVGPRUses++;
+      }
+    }
+  }
+  bool hasVGPRInput = false;
+  for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+    unsigned InputReg = MI.getOperand(i).getReg();
+    MachineInstr *Def = MRI->getVRegDef(InputReg);
+    if (TRI->isVGPR(*MRI, InputReg)) {
+      if (Def->isCopy()) {
+        unsigned SrcReg = Def->getOperand(1).getReg();
+        const TargetRegisterClass *RC =
+          TRI->getRegClassForReg(*MRI, SrcReg);
+        if (TRI->isSGPRClass(RC))
+          continue;
+      }
+      hasVGPRInput = true;
+      break;
+    }
+    else if (Def->isCopy() &&
+      TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) {
+      hasVGPRInput = true;
+      break;
+    }
+  }
+  unsigned PHIRes = MI.getOperand(0).getReg();
+  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+
+  if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+    (hasVGPRInput || numVGPRUses > 1)) {
+    LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
+    TII->moveToVALU(MI);
+  }
+  else {
+    LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
+    TII->legalizeOperands(MI, MDT);
+  }
+
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10944,3 +10944,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 
   return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
 }
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+                                               : &AMDGPU::SReg_32RegClass;
+  if (!TRI->isSGPRClass(RC) && !isDivergent)
+    return TRI->getEquivalentSGPRClass(RC);
+  else if (TRI->isSGPRClass(RC) && isDivergent)
+    return TRI->getEquivalentVGPRClass(RC);
+
+  return RC;
+}
+
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+  if (!Visited.insert(V).second)
+    return false;
+  bool Result = false;
+  for (auto U : V->users()) {
+    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+      if (V == U->getOperand(1)) {
+        switch (Intrinsic->getIntrinsicID()) {
+        default:
+          Result = false;
+          break;
+        case Intrinsic::amdgcn_if_break:
+        case Intrinsic::amdgcn_if:
+        case Intrinsic::amdgcn_else:
+          Result = true;
+          break;
+        }
+      }
+      if (V == U->getOperand(0)) {
+        switch (Intrinsic->getIntrinsicID()) {
+        default:
+          Result = false;
+          break;
+        case Intrinsic::amdgcn_end_cf:
+        case Intrinsic::amdgcn_loop:
+          Result = true;
+          break;
+        }
+      }
+    } else {
+      Result = hasCFUser(U, Visited);
+    }
+    if (Result)
+      break;
+  }
+  return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+                                               const Value *V) const {
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::amdgcn_if_break:
+      return true;
+    }
+  }
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    if (const IntrinsicInst *Intrinsic =
+            dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+      switch (Intrinsic->getIntrinsicID()) {
+      default:
+        return false;
+      case Intrinsic::amdgcn_if:
+      case Intrinsic::amdgcn_else: {
+        ArrayRef<unsigned> Indices = ExtValue->getIndices();
+        if (Indices.size() == 1 && Indices[0] == 1) {
+          return true;
+        }
+      }
+      }
+    }
+  }
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue())) {
+      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+      ImmutableCallSite CS(CI);
+      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+          MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+      for (auto &TC : TargetConstraints) {
+        if (TC.Type == InlineAsm::isOutput) {
+          ComputeConstraintToUse(TC, SDValue());
+          unsigned AssignedReg;
+          const TargetRegisterClass *RC;
+          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+              SIRI, TC.ConstraintCode, TC.ConstraintVT);
+          if (RC) {
+            MachineRegisterInfo &MRI = MF.getRegInfo();
+            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+              return true;
+            else if (SIRI->isSGPRClass(RC))
+              return true;
+          }
+        }
+      }
+    }
+  }
+  SmallPtrSet<const Value *, 16> Visited;
+  return hasCFUser(V, Visited);
+}