Skip to content

Commit

Permalink
[AMDGPU] Come back patch for the 'Assign register class for cross blo…
Browse files Browse the repository at this point in the history
…ck values according to the divergence.'

  Detailed description:

    After https://reviews.llvm.org/D59990 submit several issues were discovered.
    Changes in common code were preserved but AMDGPU specific part was reverted to keep the backend working correctly.

    Discovered issues were addressed in the following commits:

    https://reviews.llvm.org/D67662
    https://reviews.llvm.org/D67101
    https://reviews.llvm.org/D63953
    https://reviews.llvm.org/D63731

    This change brings back AMDGPU specific changes.

  Reviewed by: rampitec, arsenm

  Differential Revision: https://reviews.llvm.org/D68635

llvm-svn: 374767
  • Loading branch information
alex-t committed Oct 14, 2019
1 parent d34822a commit c4d256a
Show file tree
Hide file tree
Showing 46 changed files with 562 additions and 440 deletions.
210 changes: 89 additions & 121 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
public:
static char ID;

MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;

SIFixSGPRCopies() : MachineFunctionPass(ID) {}

bool runOnMachineFunction(MachineFunction &MF) override;

void processPHINode(MachineInstr &MI);

StringRef getPassName() const override { return "SI Fix SGPR copies"; }

void getAnalysisUsage(AnalysisUsage &AU) const override {
Expand Down Expand Up @@ -313,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return true;
}

static bool phiHasVGPROperands(const MachineInstr &PHI,
const MachineRegisterInfo &MRI,
const SIRegisterInfo *TRI,
const SIInstrInfo *TII) {
for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
Register Reg = PHI.getOperand(i).getReg();
if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
return true;
}
return false;
}

static bool phiHasBreakDef(const MachineInstr &PHI,
const MachineRegisterInfo &MRI,
SmallSet<unsigned, 8> &Visited) {
for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
Register Reg = PHI.getOperand(i).getReg();
if (Visited.count(Reg))
continue;

Visited.insert(Reg);

MachineInstr *DefInstr = MRI.getVRegDef(Reg);
switch (DefInstr->getOpcode()) {
default:
break;
case AMDGPU::SI_IF_BREAK:
return true;
case AMDGPU::PHI:
if (phiHasBreakDef(*DefInstr, MRI, Visited))
return true;
}
}
return false;
}

static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
const TargetRegisterInfo &TRI) {
for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
E = MBB.end(); I != E; ++I) {
if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
return true;
}
return false;
}

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
const MachineInstr *MoveImm,
const SIInstrInfo *TII,
Expand Down Expand Up @@ -420,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}

static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
const TargetRegisterInfo *TRI) {
return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
return hasTerminatorThatModifiesExec(*MBB, *TRI); });
}

// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
Expand Down Expand Up @@ -633,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,

bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();

SmallVector<MachineInstr *, 16> Worklist;
Expand All @@ -657,7 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
Register DstReg = MI.getOperand(0).getReg();

const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

if (!Register::isVirtualRegister(DstReg)) {
// If the destination register is a physical register there isn't
Expand All @@ -666,7 +620,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
// the first lane. Insert a readfirstlane and hope for the best.
if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
Register TmpReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
= MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

BuildMI(MBB, MI, MI.getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
Expand All @@ -684,7 +638,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}

MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
Expand All @@ -703,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
case AMDGPU::PHI: {
Register Reg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
break;

// We don't need to fix the PHI if the common dominator of the
// two incoming blocks terminates with a uniform branch.
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();

if (!predsHasDivergentTerminator(MBB0, TRI) &&
!predsHasDivergentTerminator(MBB1, TRI)) {
LLVM_DEBUG(dbgs()
<< "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
}

// If a PHI node defines an SGPR and any of its operands are VGPRs,
// then we need to move it to the VALU.
//
// Also, if a PHI node defines an SGPR and has all SGPR operands
// we must move it to the VALU, because the SGPR operands will
// all end up being assigned the same register, which means
// there is a potential for a conflict if different threads take
// different control flow paths.
//
// For Example:
//
// sgpr0 = def;
// ...
// sgpr1 = def;
// ...
// sgpr2 = PHI sgpr0, sgpr1
// use sgpr2;
//
// Will Become:
//
// sgpr2 = def;
// ...
// sgpr2 = def;
// ...
// use sgpr2
//
// The one exception to this rule is when one of the operands
// is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
// instruction. In this case, there we know the program will
// never enter the second block (the loop) without entering
// the first block (where the condition is computed), so there
// is no chance for values to be over-written.

SmallSet<unsigned, 8> Visited;
if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI, MDT);
}

processPHINode(MI);
break;
}
case AMDGPU::REG_SEQUENCE:
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
continue;
}

Expand All @@ -776,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
Expand All @@ -792,7 +689,78 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}

if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
hoistAndMergeSGPRInits(AMDGPU::M0, MRI, TRI, *MDT, TII);
hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

return true;
}

void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
unsigned numVGPRUses = 0;
SetVector<const MachineInstr *> worklist;
worklist.insert(&MI);
while (!worklist.empty()) {
const MachineInstr *Instr = worklist.pop_back_val();
unsigned Reg = Instr->getOperand(0).getReg();
for (const auto &Use : MRI->use_operands(Reg)) {
const MachineInstr *UseMI = Use.getParent();
if (UseMI->isCopy() || UseMI->isRegSequence()) {
if (UseMI->isCopy() &&
UseMI->getOperand(0).getReg().isPhysical() &&
!TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
numVGPRUses++;
}
worklist.insert(UseMI);
continue;
}

if (UseMI->isPHI()) {
const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
UseRC != &AMDGPU::VReg_1RegClass)
numVGPRUses++;
continue;
}

const TargetRegisterClass *OpRC =
TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
OpRC != &AMDGPU::VS_64RegClass) {
numVGPRUses++;
}
}
}
bool hasVGPRInput = false;
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
unsigned InputReg = MI.getOperand(i).getReg();
MachineInstr *Def = MRI->getVRegDef(InputReg);
if (TRI->isVGPR(*MRI, InputReg)) {
if (Def->isCopy()) {
unsigned SrcReg = Def->getOperand(1).getReg();
const TargetRegisterClass *RC =
TRI->getRegClassForReg(*MRI, SrcReg);
if (TRI->isSGPRClass(RC))
continue;
}
hasVGPRInput = true;
break;
}
else if (Def->isCopy() &&
TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) {
hasVGPRInput = true;
break;
}
}
unsigned PHIRes = MI.getOperand(0).getReg();
const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);

if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
(hasVGPRInput || numVGPRUses > 1)) {
LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
TII->moveToVALU(MI);
}
else {
LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
TII->legalizeOperands(MI, MDT);
}

}
107 changes: 107 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10944,3 +10944,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {

return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}

const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
: &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
return TRI->getEquivalentSGPRClass(RC);
else if (TRI->isSGPRClass(RC) && isDivergent)
return TRI->getEquivalentVGPRClass(RC);

return RC;
}

static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
if (!Visited.insert(V).second)
return false;
bool Result = false;
for (auto U : V->users()) {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
if (V == U->getOperand(1)) {
switch (Intrinsic->getIntrinsicID()) {
default:
Result = false;
break;
case Intrinsic::amdgcn_if_break:
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else:
Result = true;
break;
}
}
if (V == U->getOperand(0)) {
switch (Intrinsic->getIntrinsicID()) {
default:
Result = false;
break;
case Intrinsic::amdgcn_end_cf:
case Intrinsic::amdgcn_loop:
Result = true;
break;
}
}
} else {
Result = hasCFUser(U, Visited);
}
if (Result)
break;
}
return Result;
}

bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
const Value *V) const {
if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
switch (Intrinsic->getIntrinsicID()) {
default:
return false;
case Intrinsic::amdgcn_if_break:
return true;
}
}
if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
if (const IntrinsicInst *Intrinsic =
dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
switch (Intrinsic->getIntrinsicID()) {
default:
return false;
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
ArrayRef<unsigned> Indices = ExtValue->getIndices();
if (Indices.size() == 1 && Indices[0] == 1) {
return true;
}
}
}
}
}
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
if (isa<InlineAsm>(CI->getCalledValue())) {
const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
ImmutableCallSite CS(CI);
TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
for (auto &TC : TargetConstraints) {
if (TC.Type == InlineAsm::isOutput) {
ComputeConstraintToUse(TC, SDValue());
unsigned AssignedReg;
const TargetRegisterClass *RC;
std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
SIRI, TC.ConstraintCode, TC.ConstraintVT);
if (RC) {
MachineRegisterInfo &MRI = MF.getRegInfo();
if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
return true;
else if (SIRI->isSGPRClass(RC))
return true;
}
}
}
}
}
SmallPtrSet<const Value *, 16> Visited;
return hasCFUser(V, Visited);
}
Loading

0 comments on commit c4d256a

Please sign in to comment.