Skip to content

Commit

Permalink
[AMDGPU] Enable PreRARematerialize scheduling pass with multiple high…
Browse files Browse the repository at this point in the history
… RP regions

Enable the PreRARematerialize pass when there are multiple high RP scheduling
regions present. Require the occupancy in all high RP regions be improved
before finalizing sinking. If any high RP region did not improve in occupancy
then un-do all sinking and restore the state to before the pass.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D122501
  • Loading branch information
vangthao95 committed Apr 8, 2022
1 parent 61df26c commit 311edc6
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 127 deletions.
295 changes: 182 additions & 113 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
}

if (Stage == PreRARematerialize) {
if (RegionsWithMinOcc.count() != 1 || Regions.size() == 1)
if (RegionsWithMinOcc.none() || Regions.size() == 1)
break;

const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
Expand All @@ -648,16 +648,15 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
MinOccupancy)
break;

// FIXME: This pass will invalidate cached LiveIns, MBBLiveIns and
// Pressure for regions inbetween the defs and region we sinked the def
// to. Will need to be fixed if there is another pass after this pass.
// FIXME: This pass will invalidate cached MBBLiveIns for regions
// inbetween the defs and region we sinked the def to. Cached pressure
// for regions where a def is sinked from will also be invalidated. Will
// need to be fixed if there is another pass after this pass.
static_assert(LastStage == PreRARematerialize,
"Passes after PreRARematerialize are not supported");

unsigned HighRPIdx = RegionsWithMinOcc.find_first();
collectRematerializableInstructions(HighRPIdx);
if (RematerializableInsts.empty() ||
!sinkTriviallyRematInsts(ST, TII, HighRPIdx))
collectRematerializableInstructions();
if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
break;

LLVM_DEBUG(
Expand Down Expand Up @@ -720,10 +719,8 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
} while (Stage != LastStage);
}

void GCNScheduleDAGMILive::collectRematerializableInstructions(
unsigned HighRPIdx) {
void GCNScheduleDAGMILive::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
const GCNRPTracker::LiveRegSet &HighRPLiveIns = LiveIns[HighRPIdx];
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
Expand All @@ -734,12 +731,6 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions(
!MRI.hasOneNonDBGUse(Reg))
continue;

// We are only collecting defs that are live-through or defined in another
// block and used inside this region. This means that the register must be
// in the live-in set for this region, else skip this def.
if (HighRPLiveIns.find(Reg) == HighRPLiveIns.end())
continue;

MachineOperand *Op = MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA))
Expand All @@ -749,82 +740,164 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions(
if (Def->getParent() == UseI->getParent())
continue;

RematerializableInsts.push_back(std::make_pair(Def, UseI));
// We are only collecting defs that are defined in another block and are
// live-through or used inside regions at MinOccupancy. This means that the
// register must be in the live-in set for the region.
bool AddedToRematList = false;
for (unsigned I = 0, E = Regions.size(); I != E; ++I) {
auto It = LiveIns[I].find(Reg);
if (It != LiveIns[I].end() && !It->second.none()) {
if (RegionsWithMinOcc[I]) {
RematerializableInsts[I][Def] = UseI;
AddedToRematList = true;
}

// Collect regions with rematerializable reg as live-in to avoid
// searching later when updating RP.
RematDefToLiveInRegions[Def].push_back(I);
}
}
if (!AddedToRematList)
RematDefToLiveInRegions.erase(Def);
}
}

bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII,
unsigned HighRPIdx) {
RescheduleRegions.reset();
GCNRPTracker::LiveRegSet NewLiveIns;
// We may not need to rematerialize all instructions. Keep a list of
// instructions we are rematerializing at the end.
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4>
TrivialRematDefsToSink;

GCNRegPressure RegionPressure = Pressure[HighRPIdx];
int VGPRUsage = RegionPressure.getVGPRNum(ST.hasGFX90AInsts());
int SGPRUsage = RegionPressure.getSGPRNum();

// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check if cause of occupancy drop is due to VGPR usage.
if (ST.getOccupancyWithNumVGPRs(VGPRUsage) > MinOccupancy ||
ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
return false;
const TargetInstrInfo *TII) {
// Temporary copies of cached variables we will be modifying and replacing if
// sinking succeeds.
SmallVector<
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
NewRegions;
DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
DenseMap<unsigned, GCNRegPressure> NewPressure;
BitVector NewRescheduleRegions;

NewRegions.resize(Regions.size());
NewRescheduleRegions.resize(Regions.size());

// Collect only regions that has a rematerializable def as a live-in.
SmallSet<unsigned, 16> ImpactedRegions;
for (const auto &It : RematDefToLiveInRegions)
ImpactedRegions.insert(It.second.begin(), It.second.end());

// Make copies of register pressure and live-ins cache that will be updated
// as we rematerialize.
for (auto Idx : ImpactedRegions) {
NewPressure[Idx] = Pressure[Idx];
NewLiveIns[Idx] = LiveIns[Idx];
}
NewRegions = Regions;
NewRescheduleRegions.reset();

NewLiveIns.copyFrom(LiveIns[HighRPIdx]);
// First check if we have enough trivially rematerializable instructions to
// improve occupancy. Optimistically assume all instructions we are able to
// sink decreased RP.
int TotalSinkableRegs = 0;
for (auto &It : RematerializableInsts) {
Register DefReg = It.first->getOperand(0).getReg();
TotalSinkableRegs += SIRegisterInfo::getNumCoveredRegs(NewLiveIns[DefReg]);
}
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
if (OptimisticOccupancy <= MinOccupancy)
return false;

// Keep a list of newly rematerialized instructions so that we can easily
// undo if occupancy is not improved.
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
GCNDownwardRPTracker RPT(*LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(Regions[HighRPIdx].first,
Regions[HighRPIdx].second);
unsigned ImproveOccupancy = 0;
for (auto &It : RematerializableInsts) {
MachineInstr *Def = It.first;
MachineBasicBlock::iterator InsertPos =
MachineBasicBlock::iterator(It.second);
Register Reg = Def->getOperand(0).getReg();
// Rematerialize MI to its use block. Since we are only rematerializing
// instructions that do not have any virtual reg uses, we do not need to
// call LiveRangeEdit::allUsesAvailableAt() and
// LiveRangeEdit::canRematerializeAt().
NewLiveIns[Reg] = LaneBitmask::getNone();
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
Def->getOperand(0).getSubReg(), *Def, *TRI);
MachineInstr *NewMI = &*(--InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
InsertedMIToOldDef[NewMI] = Def;

// FIXME: Need better way to update RP without re-iterating over region
RPT.reset(*NonDbgMI, &NewLiveIns);
RPT.advance(Regions[HighRPIdx].second);
GCNRegPressure RPAfterSinking = RPT.moveMaxPressure();
ImproveOccupancy = RPAfterSinking.getOccupancy(ST);
if (ImproveOccupancy > MinOccupancy)
bool Improved = false;
for (auto I : ImpactedRegions) {
if (!RegionsWithMinOcc[I])
continue;

Improved = false;
int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
int SGPRUsage = NewPressure[I].getSGPRNum();

// TODO: Handle occupancy drop due to AGPR and SGPR.
// Check if cause of occupancy drop is due to VGPR usage and not SGPR.
if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
break;

// The occupancy of this region could have been improved by a previous
// iteration's sinking of defs.
if (NewPressure[I].getOccupancy(ST) > MinOccupancy) {
NewRescheduleRegions[I] = true;
Improved = true;
continue;
}

// First check if we have enough trivially rematerializable instructions to
// improve occupancy. Optimistically assume all instructions we are able to
// sink decreased RP.
int TotalSinkableRegs = 0;
for (const auto &It : RematerializableInsts[I]) {
MachineInstr *Def = It.first;
Register DefReg = Def->getOperand(0).getReg();
TotalSinkableRegs +=
SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
}
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
if (OptimisticOccupancy <= MinOccupancy)
break;

unsigned ImproveOccupancy = 0;
SmallVector<MachineInstr *, 4> SinkedDefs;
for (auto &It : RematerializableInsts[I]) {
MachineInstr *Def = It.first;
MachineBasicBlock::iterator InsertPos =
MachineBasicBlock::iterator(It.second);
Register Reg = Def->getOperand(0).getReg();
// Rematerialize MI to its use block. Since we are only rematerializing
// instructions that do not have any virtual reg uses, we do not need to
// call LiveRangeEdit::allUsesAvailableAt() and
// LiveRangeEdit::canRematerializeAt().
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
Def->getOperand(0).getSubReg(), *Def, *TRI);
MachineInstr *NewMI = &*(--InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
InsertedMIToOldDef[NewMI] = Def;

// Update region boundaries in scheduling region we sinked from since we
// may sink an instruction that was at the beginning or end of its region
updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
/*Removing =*/true);

// Update region boundaries in region we sinked to.
updateRegionBoundaries(NewRegions, InsertPos, NewMI);

LaneBitmask PrevMask = NewLiveIns[I][Reg];
// FIXME: Also update cached pressure for where the def was sinked from.
// Update RP for all regions that has this reg as a live-in and remove
// the reg from all regions as a live-in.
for (auto Idx : RematDefToLiveInRegions[Def]) {
NewLiveIns[Idx].erase(Reg);
if (InsertPos->getParent() != Regions[Idx].first->getParent()) {
// Def is live-through and not used in this block.
NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), MRI);
} else {
// Def is used and rematerialized into this block.
GCNDownwardRPTracker RPT(*LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(
NewRegions[Idx].first, NewRegions[Idx].second);
RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
RPT.advance(NewRegions[Idx].second);
NewPressure[Idx] = RPT.moveMaxPressure();
}
}

SinkedDefs.push_back(Def);
ImproveOccupancy = NewPressure[I].getOccupancy(ST);
if (ImproveOccupancy > MinOccupancy)
break;
}

// Remove defs we just sinked from all regions' list of sinkable defs
for (auto &Def : SinkedDefs)
for (auto TrackedIdx : RematDefToLiveInRegions[Def])
RematerializableInsts[TrackedIdx].erase(Def);

if (ImproveOccupancy <= MinOccupancy)
break;

NewRescheduleRegions[I] = true;
Improved = true;
}

if (ImproveOccupancy <= MinOccupancy) {
// Occupancy is not improved. Undo sinking for the region
if (!Improved) {
// Occupancy was not improved for all regions that were at MinOccupancy.
// Undo sinking and remove newly rematerialized instructions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
Expand All @@ -838,13 +911,10 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
return false;
}

// Occupancy is improved.
// Occupancy was improved for all regions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
// Update region boundaries in scheduling region we sinked from since we
// may sink an instruction that was at the beginning or end of its region
updateRegionBoundaries(OldMI, /*NewMI =*/nullptr, /*Removing =*/true);

// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
BBLiveInMap.erase(OldMI);
Expand All @@ -855,25 +925,19 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
OldMI->eraseFromParent();
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);

// Update region boundaries in region we sinked to.
MachineBasicBlock::iterator InsertPos =
std::next(MachineBasicBlock::iterator(MI));
updateRegionBoundaries(InsertPos, MI);
}

// Update cached live-ins and register pressure after rematerializing
LiveIns[HighRPIdx].copyFrom(NewLiveIns);
MBBLiveIns.erase(Regions[HighRPIdx].first->getParent());

GCNDownwardRPTracker RPTracker(*LIS);
RPTracker.advance(Regions[HighRPIdx].first, Regions[HighRPIdx].second,
&LiveIns[HighRPIdx]);
Pressure[HighRPIdx] = RPTracker.moveMaxPressure();
// Update live-ins, register pressure, and regions caches.
for (auto Idx : ImpactedRegions) {
LiveIns[Idx] = NewLiveIns[Idx];
Pressure[Idx] = NewPressure[Idx];
MBBLiveIns.erase(Regions[Idx].first->getParent());
}
Regions = NewRegions;
RescheduleRegions = NewRescheduleRegions;

SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++MinOccupancy);
RescheduleRegions[HighRPIdx] = true;

return true;
}
Expand All @@ -896,34 +960,39 @@ bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
// of a scheduling region and do not need to check the ending since we will only
// ever be inserting before an already existing MI.
void GCNScheduleDAGMILive::updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>> &RegionBoundaries,
MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
unsigned I = 0, E = Regions.size();
unsigned I = 0, E = RegionBoundaries.size();
// Search for first region of the block where MI is located
while (I != E && MI->getParent() != Regions[I].first->getParent())
while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
++I;

for (; I != E; ++I) {
if (MI->getParent() != Regions[I].first->getParent())
if (MI->getParent() != RegionBoundaries[I].first->getParent())
return;

if (Removing && MI == Regions[I].first && MI == Regions[I].second) {
if (Removing && MI == RegionBoundaries[I].first &&
MI == RegionBoundaries[I].second) {
// MI is in a region with size 1, after removing, the region will be
// size 0, set RegionBegin and RegionEnd to pass end of block iterator.
Regions[I] =
RegionBoundaries[I] =
std::make_pair(MI->getParent()->end(), MI->getParent()->end());
return;
}
if (MI == Regions[I].first) {
if (MI == RegionBoundaries[I].first) {
if (Removing)
Regions[I] = std::make_pair(std::next(MI), Regions[I].second);
RegionBoundaries[I] =
std::make_pair(std::next(MI), RegionBoundaries[I].second);
else
// Inserted NewMI in front of region, set new RegionBegin to NewMI
Regions[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
Regions[I].second);
RegionBoundaries[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
RegionBoundaries[I].second);
return;
}
if (Removing && MI == Regions[I].second) {
Regions[I] = std::make_pair(Regions[I].first, std::prev(MI));
if (Removing && MI == RegionBoundaries[I].second) {
RegionBoundaries[I] =
std::make_pair(RegionBoundaries[I].first, std::prev(MI));
return;
}
}
Expand Down
Loading

0 comments on commit 311edc6

Please sign in to comment.