Commit 28322c2

[AMDGPU] Add scheduler pass to rematerialize trivial defs
Add a new pass in the pre-RA AMDGPU scheduler that checks whether sinking trivially rematerializable defs that have only one use outside the defining block will increase occupancy. If we can determine that occupancy can be increased, rematerialize only the minimum number of defs required to do so, and re-schedule, at the new occupancy, all regions whose occupancy matched the previous minimum.

This is based on the discussion in https://reviews.llvm.org/D117562. The logic for deciding which defs to collect and whether sinking is beneficial is mostly the same. The main differences are that we no longer limit it to immediate defs, and the def and use no longer have to be part of a loop.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D119475
1 parent b0f7dc2 commit 28322c2

7 files changed: 6109 additions & 102 deletions
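
Before the diffs, here is a deliberately simplified, self-contained sketch of the decision flow the commit message describes: an optimistic feasibility check that assumes every candidate def can be sunk, followed by sinking defs one at a time and stopping as soon as occupancy improves. Everything in it (the occupancyForVGPRs model, the register counts) is an illustrative assumption and not part of the patch; the real logic lives in sinkTriviallyRematInsts in the GCNSchedStrategy.cpp diff below.

// Toy model of the PreRARematerialize decision flow described in the commit
// message. All types and numbers here are illustrative stand-ins, not the
// real GCNRegPressure / occupancy calculations from the patch.
#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical occupancy model: fewer live VGPRs in the critical region
// allows more waves, capped at 10 (illustrative only).
static unsigned occupancyForVGPRs(unsigned VGPRs) {
  if (VGPRs == 0)
    return 10;
  return std::min(10u, 256u / VGPRs);
}

int main() {
  unsigned MinOccupancy = 4; // current function-wide minimum occupancy
  unsigned RegionVGPRs = 64; // VGPR pressure in the single min-occupancy region
  // Each entry is the number of registers a rematerializable def (single def,
  // single use outside the defining block) keeps live across the region.
  std::vector<unsigned> SinkableDefs = {8, 4, 2, 4};

  // 1. Optimistic check: assume every candidate can be sunk.
  unsigned Total = 0;
  for (unsigned N : SinkableDefs)
    Total += N;
  if (occupancyForVGPRs(RegionVGPRs - Total) <= MinOccupancy) {
    std::puts("even the optimistic estimate does not help; give up");
    return 0;
  }

  // 2. Sink defs one at a time, stopping as soon as occupancy improves,
  //    so only the minimum number of defs is rematerialized.
  unsigned Sunk = 0;
  for (unsigned N : SinkableDefs) {
    RegionVGPRs -= N;
    ++Sunk;
    if (occupancyForVGPRs(RegionVGPRs) > MinOccupancy)
      break;
  }
  std::printf("sank %u defs, occupancy %u -> %u\n", Sunk, MinOccupancy,
              occupancyForVGPRs(RegionVGPRs));
  // The real pass would now re-schedule the regions that were at the old
  // minimum occupancy; if occupancy had not improved, it would instead undo
  // the rematerializations.
  return 0;
}

With the numbers above, the sketch sinks three of the four candidates and reports occupancy improving from 4 to 5, mirroring the "minimum number of defs" behaviour the commit message describes.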

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 250 additions & 1 deletion
@@ -362,6 +362,9 @@ void GCNScheduleDAGMILive::schedule() {
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
     Pressure[RegionIdx] = PressureAfter;
+    RegionsWithMinOcc[RegionIdx] =
+        PressureAfter.getOccupancy(ST) == MinOccupancy;
+
     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
     return;
   }
@@ -378,6 +381,7 @@ void GCNScheduleDAGMILive::schedule() {
   // occupancy before was higher, or if the current schedule has register
   // pressure higher than the excess limits which could lead to more spilling.
   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+
   // Allow memory bound functions to drop to 4 waves if not limited by an
   // attribute.
   if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
@@ -390,6 +394,7 @@ void GCNScheduleDAGMILive::schedule() {
   if (NewOccupancy < MinOccupancy) {
     MinOccupancy = NewOccupancy;
     MFI.limitOccupancy(MinOccupancy);
+    RegionsWithMinOcc.reset();
     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                       << MinOccupancy << ".\n");
   }
@@ -416,6 +421,8 @@ void GCNScheduleDAGMILive::schedule() {
       PressureAfter.less(ST, PressureBefore) ||
       !RescheduleRegions[RegionIdx]) {
     Pressure[RegionIdx] = PressureAfter;
+    RegionsWithMinOcc[RegionIdx] =
+        PressureAfter.getOccupancy(ST) == MinOccupancy;
     if (!RegionsWithClusters[RegionIdx] &&
         (Stage + 1) == UnclusteredReschedule)
       RescheduleRegions[RegionIdx] = false;
@@ -425,6 +432,8 @@ void GCNScheduleDAGMILive::schedule() {
     }
   }
 
+  RegionsWithMinOcc[RegionIdx] =
+      PressureBefore.getOccupancy(ST) == MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   RescheduleRegions[RegionIdx] = RegionsWithClusters[RegionIdx] ||
                                  (Stage + 1) != UnclusteredReschedule;
@@ -585,9 +594,11 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   RescheduleRegions.resize(Regions.size());
   RegionsWithClusters.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
+  RegionsWithMinOcc.resize(Regions.size());
   RescheduleRegions.set();
   RegionsWithClusters.reset();
   RegionsWithHighRP.reset();
+  RegionsWithMinOcc.reset();
 
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
@@ -624,13 +635,42 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
             << "Retrying function scheduling with lowest recorded occupancy "
             << MinOccupancy << ".\n");
       }
+
+      if (Stage == PreRARematerialize) {
+        if (RegionsWithMinOcc.count() != 1 || Regions.size() == 1)
+          break;
+
+        const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+        const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+        // Check maximum occupancy
+        if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
+            MinOccupancy)
+          break;
+
+        // FIXME: This pass will invalidate cached LiveIns, MBBLiveIns and
+        // Pressure for regions inbetween the defs and region we sinked the def
+        // to. Will need to be fixed if there is another pass after this pass.
+        static_assert(LastStage == PreRARematerialize,
+                      "Passes after PreRARematerialize are not supported");
+
+        unsigned HighRPIdx = RegionsWithMinOcc.find_first();
+        collectRematerializableInstructions(HighRPIdx);
+        if (RematerializableInsts.empty() ||
+            !sinkTriviallyRematInsts(ST, TII, HighRPIdx))
+          break;
+
+        LLVM_DEBUG(
+            dbgs() << "Retrying function scheduling with improved occupancy of "
+                   << MinOccupancy << " from rematerializing\n");
+      }
     }
 
     if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);
 
     for (auto Region : Regions) {
-      if ((Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx]) ||
+      if (((Stage == UnclusteredReschedule || Stage == PreRARematerialize) &&
+           !RescheduleRegions[RegionIdx]) ||
          (Stage == ClusteredLowOccupancyReschedule &&
           !RegionsWithClusters[RegionIdx] && !RegionsWithHighRP[RegionIdx])) {
 
@@ -655,6 +695,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
       // Skip empty scheduling regions (0 or 1 schedulable instructions).
       if (begin() == end() || begin() == std::prev(end())) {
         exitRegion();
+        ++RegionIdx;
         continue;
       }
 
@@ -677,3 +718,211 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
     SavedMutations.swap(Mutations);
   } while (Stage != LastStage);
 }
+
+void GCNScheduleDAGMILive::collectRematerializableInstructions(
+    unsigned HighRPIdx) {
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
+  const GCNRPTracker::LiveRegSet &HighRPLiveIns = LiveIns[HighRPIdx];
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (!LIS->hasInterval(Reg))
+      continue;
+
+    // TODO: Handle AGPR and SGPR rematerialization
+    if (!SRI->isVGPRClass(MRI.getRegClass(Reg)) || !MRI.hasOneDef(Reg) ||
+        !MRI.hasOneUse(Reg))
+      continue;
+
+    // We are only collecting defs that are live-through or defined in another
+    // block and used inside this region. This means that the register must be
+    // in the live-in set for this region, else skip this def.
+    if (HighRPLiveIns.find(Reg) == HighRPLiveIns.end())
+      continue;
+
+    MachineInstr *Def = MRI.getOneDef(Reg)->getParent();
+    if (!Def || !isTriviallyReMaterializable(*Def, AA))
+      continue;
+
+    MachineInstr *UseI = &*MRI.use_instr_begin(Reg);
+    if (Def->getParent() == UseI->getParent())
+      continue;
+
+    RematerializableInsts.push_back(std::make_pair(Def, UseI));
+  }
+}
+
+bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+                                                   const TargetInstrInfo *TII,
+                                                   unsigned HighRPIdx) {
+  RescheduleRegions.reset();
+  GCNRPTracker::LiveRegSet NewLiveIns;
+  // We may not need to rematerialize all instructions. Keep a list of
+  // instructions we are rematerializing at the end.
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 4>
+      TrivialRematDefsToSink;
+
+  GCNRegPressure RegionPressure = Pressure[HighRPIdx];
+  int VGPRUsage = RegionPressure.getVGPRNum(ST.hasGFX90AInsts());
+  int SGPRUsage = RegionPressure.getSGPRNum();
+
+  // TODO: Handle occupancy drop due to AGPR and SGPR.
+  // Check if cause of occupancy drop is due to VGPR usage.
+  if (ST.getOccupancyWithNumVGPRs(VGPRUsage) > MinOccupancy ||
+      ST.getOccupancyWithNumSGPRs(SGPRUsage) == MinOccupancy)
+    return false;
+
+  NewLiveIns.copyFrom(LiveIns[HighRPIdx]);
+  // First check if we have enough trivially rematerializable instructions to
+  // improve occupancy. Optimistically assume all instructions we are able to
+  // sink decreased RP.
+  int TotalSinkableRegs = 0;
+  for (auto &It : RematerializableInsts) {
+    Register DefReg = It.first->getOperand(0).getReg();
+    TotalSinkableRegs += SIRegisterInfo::getNumCoveredRegs(NewLiveIns[DefReg]);
+  }
+  int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
+  unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
+  // If in the most optimistic scenario, we cannot improve occupancy, then do
+  // not attempt to sink any instructions.
+  if (OptimisticOccupancy <= MinOccupancy)
+    return false;
+
+  // Keep a list of newly rematerialized instructions so that we can easily
+  // undo if occupancy is not improved.
+  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+  GCNDownwardRPTracker RPT(*LIS);
+  auto *NonDbgMI = &*skipDebugInstructionsForward(Regions[HighRPIdx].first,
+                                                  Regions[HighRPIdx].second);
+  unsigned ImproveOccupancy = 0;
+  for (auto &It : RematerializableInsts) {
+    MachineInstr *Def = It.first;
+    MachineBasicBlock::iterator InsertPos =
+        MachineBasicBlock::iterator(It.second);
+    Register Reg = Def->getOperand(0).getReg();
+    // Rematerialize MI to its use block. Since we are only rematerializing
+    // instructions that do not have any virtual reg uses, we do not need to
+    // call LiveRangeEdit::allUsesAvailableAt() and
+    // LiveRangeEdit::canRematerializeAt().
+    NewLiveIns[Reg] = LaneBitmask::getNone();
+    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+                       Def->getOperand(0).getSubReg(), *Def, *TRI);
+    MachineInstr *NewMI = &*(--InsertPos);
+    LIS->InsertMachineInstrInMaps(*NewMI);
+    LIS->removeInterval(Reg);
+    LIS->createAndComputeVirtRegInterval(Reg);
+    InsertedMIToOldDef[NewMI] = Def;
+
+    // FIXME: Need better way to update RP without re-iterating over region
+    RPT.reset(*NonDbgMI, &NewLiveIns);
+    RPT.advance(Regions[HighRPIdx].second);
+    GCNRegPressure RPAfterSinking = RPT.moveMaxPressure();
+    ImproveOccupancy = RPAfterSinking.getOccupancy(ST);
+    if (ImproveOccupancy > MinOccupancy)
+      break;
+  }
+
+  if (ImproveOccupancy <= MinOccupancy) {
+    // Occupancy is not improved. Undo sinking for the region
+    for (auto &Entry : InsertedMIToOldDef) {
+      MachineInstr *MI = Entry.first;
+      MachineInstr *OldMI = Entry.second;
+      Register Reg = MI->getOperand(0).getReg();
+      LIS->RemoveMachineInstrFromMaps(*MI);
+      MI->eraseFromParent();
+      OldMI->clearRegisterDeads(Reg);
+      LIS->removeInterval(Reg);
+      LIS->createAndComputeVirtRegInterval(Reg);
+    }
+    return false;
+  }
+
+  // Occupancy is improved.
+  for (auto &Entry : InsertedMIToOldDef) {
+    MachineInstr *MI = Entry.first;
+    MachineInstr *OldMI = Entry.second;
+    // Update region boundaries in scheduling region we sinked from since we
+    // may sink an instruction that was at the beginning or end of its region
+    updateRegionBoundaries(OldMI, /*NewMI =*/nullptr, /*Removing =*/true);
+
+    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+    BBLiveInMap.erase(OldMI);
+
+    // Remove OldMI and update LIS
+    Register Reg = MI->getOperand(0).getReg();
+    LIS->RemoveMachineInstrFromMaps(*OldMI);
+    OldMI->eraseFromParent();
+    LIS->removeInterval(Reg);
+    LIS->createAndComputeVirtRegInterval(Reg);
+
+    // Update region boundaries in region we sinked to.
+    MachineBasicBlock::iterator InsertPos =
+        std::next(MachineBasicBlock::iterator(MI));
+    updateRegionBoundaries(InsertPos, MI);
+  }
+
+  // Update cached live-ins and register pressure after rematerializing
+  LiveIns[HighRPIdx].copyFrom(NewLiveIns);
+  MBBLiveIns.erase(Regions[HighRPIdx].first->getParent());
+
+  GCNDownwardRPTracker RPTracker(*LIS);
+  RPTracker.advance(Regions[HighRPIdx].first, Regions[HighRPIdx].second,
+                    &LiveIns[HighRPIdx]);
+  Pressure[HighRPIdx] = RPTracker.moveMaxPressure();
+
+  SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  MFI.increaseOccupancy(MF, ++MinOccupancy);
+  RescheduleRegions[HighRPIdx] = true;
+
+  return true;
+}
+
+// Copied from MachineLICM
+bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
+                                                       AAResults *AA) {
+  if (!TII->isTriviallyReMaterializable(MI, AA))
+    return false;
+
+  for (const MachineOperand &MO : MI.operands())
+    if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
+      return false;
+
+  return true;
+}
+
+// When removing, we will have to check both beginning and ending of the region.
+// When inserting, we will only have to check if we are inserting NewMI in front
+// of a scheduling region and do not need to check the ending since we will only
+// ever be inserting before an already existing MI.
+void GCNScheduleDAGMILive::updateRegionBoundaries(
+    MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) {
+  unsigned I = 0, E = Regions.size();
+  // Search for first region of the block where MI is located
+  while (I != E && MI->getParent() != Regions[I].first->getParent())
+    ++I;
+
+  for (; I != E; ++I) {
+    if (MI->getParent() != Regions[I].first->getParent())
+      return;
+
+    if (Removing && MI == Regions[I].first && MI == Regions[I].second) {
+      // MI is in a region with size 1, after removing, the region will be
+      // size 0, set RegionBegin and RegionEnd to pass end of block iterator.
+      Regions[I] =
+          std::make_pair(MI->getParent()->end(), MI->getParent()->end());
+      return;
+    }
+    if (MI == Regions[I].first) {
+      if (Removing)
+        Regions[I] = std::make_pair(std::next(MI), Regions[I].second);
+      else
+        // Inserted NewMI in front of region, set new RegionBegin to NewMI
+        Regions[I] = std::make_pair(MachineBasicBlock::iterator(NewMI),
+                                    Regions[I].second);
+      return;
+    }
+    if (Removing && MI == Regions[I].second) {
+      Regions[I] = std::make_pair(Regions[I].first, std::prev(MI));
+      return;
+    }
+  }
+}

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 25 additions & 1 deletion
@@ -77,7 +77,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
     InitialSchedule,
     UnclusteredReschedule,
     ClusteredLowOccupancyReschedule,
-    LastStage = ClusteredLowOccupancyReschedule
+    PreRARematerialize,
+    LastStage = PreRARematerialize
   };
 
   const GCNSubtarget &ST;
@@ -110,24 +111,47 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Record regions with high register pressure.
   BitVector RegionsWithHighRP;
 
+  // Regions that has the same occupancy as the latest MinOccupancy
+  BitVector RegionsWithMinOcc;
+
   // Region live-in cache.
   SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
 
   // Region pressure cache.
   SmallVector<GCNRegPressure, 32> Pressure;
 
+  // List of trivially rematerializable instructions we can remat to reduce RP.
+  // First MI is the MI to remat and second MI is the position we should remat
+  // before, usually the MI using the rematerializable instruction.
+  SmallVector<std::pair<MachineInstr *, MachineInstr *>> RematerializableInsts;
+
   // Temporary basic block live-in cache.
   DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
 
   DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
   DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getBBLiveInMap() const;
 
+  // Collect all trivially rematerializable VGPR instructions with a single def
+  // and single use outside the defining block into RematerializableInsts.
+  void collectRematerializableInstructions(unsigned HighRPIdx);
+
+  bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA);
+
+  // TODO: Should also attempt to reduce RP of SGPRs and AGPRs
+  // Attempt to reduce RP of VGPR by sinking trivially rematerializable
+  // instructions. Returns true if we were able to sink instruction(s).
+  bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
+                               const TargetInstrInfo *TII, unsigned HighRPIdx);
+
   // Return current region pressure.
   GCNRegPressure getRealRegPressure() const;
 
   // Compute and cache live-ins and pressure for all regions in block.
   void computeBlockPressure(const MachineBasicBlock *MBB);
 
+  // Update region boundaries when removing MI or inserting NewMI before MI.
+  void updateRegionBoundaries(MachineBasicBlock::iterator MI,
+                              MachineInstr *NewMI, bool Removing = false);
 
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
