Commit dcb7764

Reapply [CodeGen][ARM] Enable Swing Modulo Scheduling for ARM

Fixed a "private field is not used" warning when compiled with clang.

original commit: 28d09bb
reverted in: fa49021

------

This patch permits Swing Modulo Scheduling for ARM targets and turns it on by
default for the Cortex-M7. The t2Bcc instruction is recognized as a loop-ending
branch.

MachinePipeliner is extended by adding support for "unpipelineable"
instructions. These instructions are those which contribute to the loop exit
test; in the SMS papers they are removed before creating the dependence graph
and then inserted into the final schedule of the kernel and prologues. Support
for these instructions was not previously necessary because the current targets
supporting SMS have only supported it for hardware loop branches, which have no
loop-exit-contributing instructions in the loop body.

The current structure of the MachinePipeliner makes it difficult to
remove/exclude these instructions from the dependence graph. Therefore, this
patch leaves them in the graph, but adds a "normalization" method which moves
them in the schedule to stage 0, which causes them to appear properly in the
kernel and prologues.

It was also necessary to be more careful about boundary nodes when iterating
across successors in the dependence graph, because the loop exit branch is now
a non-artificial successor to instructions in the graph. In addition, schedules
with physical use/def pairs in the same cycle should be treated as creating an
invalid schedule, because the scheduling logic does not respect physical
register dependences once they are scheduled to the same cycle.

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D122672
1 parent ef87865 · commit dcb7764

13 files changed: 628 additions, 19 deletions
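The commit message above describes the core mechanism in target-independent terms: a target hands the pipeliner a PipelinerLoopInfo whose shouldIgnoreForPipelining() marks the instructions that compute the loop-exit test, and the pipeliner then forces those instructions (and everything they transitively depend on) into stage 0 rather than removing them from the dependence graph. The following is a minimal illustrative sketch of such a hook for a hypothetical target; it is not part of this commit, every "MyTarget" name is invented for illustration, and the real implementation is the ARMPipelinerLoopInfo in the ARMBaseInstrInfo.cpp diff below.

// Sketch only: a PipelinerLoopInfo that marks a compare-and-branch loop exit
// as unpipelineable so normalizeNonPipelinedInstructions() keeps it in stage 0.
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/ErrorHandling.h"

using namespace llvm;

namespace {
class MyTargetPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineInstr *ExitBranch;  // conditional branch terminating the loop body
  MachineInstr *ExitCompare; // flag-setting compare feeding that branch

public:
  MyTargetPipelinerLoopInfo(MachineInstr *Branch, MachineInstr *Compare)
      : ExitBranch(Branch), ExitCompare(Compare) {}

  // Instructions that contribute to the loop-exit test must not be rotated
  // into later stages; the pipeliner normalizes them into stage 0 instead.
  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    return MI == ExitBranch || MI == ExitCompare;
  }

  // The remaining hooks are required by the interface. A real target must
  // materialize the "trip count > TC" condition here (compare the ARM
  // version in the diff below); this sketch deliberately does not.
  Optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &Cond) override {
    llvm_unreachable("illustrative sketch only");
  }
  void setPreheader(MachineBasicBlock *NewPreheader) override {}
  void adjustTripCount(int TripCountAdjust) override {}
  void disposed() override {}
};
} // namespace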

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 14 additions & 2 deletions
@@ -84,6 +84,8 @@ class MachinePipeliner : public MachineFunctionPass {
     SmallVector<MachineOperand, 4> BrCond;
     MachineInstr *LoopInductionVar = nullptr;
     MachineInstr *LoopCompare = nullptr;
+    std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopPipelinerInfo =
+        nullptr;
   };
   LoopInfo LI;

@@ -119,6 +121,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   LiveIntervals &LIS;
   const RegisterClassInfo &RegClassInfo;
   unsigned II_setByPragma = 0;
+  TargetInstrInfo::PipelinerLoopInfo *LoopPipelinerInfo = nullptr;

   /// A toplogical ordering of the SUnits, which is needed for changing
   /// dependences and iterating over the SUnits.
@@ -196,9 +199,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {

 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
-                    const RegisterClassInfo &rci, unsigned II)
+                    const RegisterClassInfo &rci, unsigned II,
+                    TargetInstrInfo::PipelinerLoopInfo *PLI)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
-        RegClassInfo(rci), II_setByPragma(II), Topo(SUnits, &ExitSU) {
+        RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
+        Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
@@ -589,6 +594,13 @@ class SMSchedule {
     return ScheduledInstrs[cycle];
   }

+  SmallSet<SUnit *, 8>
+  computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
+                             TargetInstrInfo::PipelinerLoopInfo *PLI);
+
+  bool
+  normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
+                                    TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
   void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,

llvm/include/llvm/CodeGen/ModuloSchedule.h

Lines changed: 2 additions & 2 deletions
@@ -191,8 +191,8 @@ class ModuloScheduleExpander {
   void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
                       ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
   void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
-                      ValueMapTy *VRMap, MBBVectorTy &EpilogBBs,
-                      MBBVectorTy &PrologBBs);
+                      MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
+                      MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
   void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
                             MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
                             ValueMapTy *VRMap, InstrMapTy &InstrMap,

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 89 additions & 8 deletions
@@ -255,13 +255,15 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
              << "Failed to pipeline loop";
     });

+    LI.LoopPipelinerInfo.reset();
     return Changed;
   }

   ++NumTrytoPipeline;

   Changed = swingModuloScheduler(L);

+  LI.LoopPipelinerInfo.reset();
   return Changed;
 }

@@ -354,7 +356,8 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {

   LI.LoopInductionVar = nullptr;
   LI.LoopCompare = nullptr;
-  if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
+  LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
+  if (!LI.LoopPipelinerInfo) {
     LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
     NumFailLoop++;
     ORE->emit([&]() {
@@ -419,7 +422,7 @@ bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

   SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
-                        II_setByPragma);
+                        II_setByPragma, LI.LoopPipelinerInfo.get());

   MachineBasicBlock *MBB = L.getHeader();
   // The kernel should not include any terminator instructions.  These
@@ -1422,7 +1425,7 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
 static bool ignoreDependence(const SDep &D, bool isPred) {
-  if (D.isArtificial())
+  if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
     return true;
   return D.getKind() == SDep::Anti && isPred;
 }
@@ -1471,6 +1474,8 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
     SUnit *SU = &SUnits[I];
     for (const SDep &S : SU->Succs) {
       SUnit *succ = S.getSUnit();
+      if (succ->isBoundaryNode())
+        continue;
       if (S.getLatency() == 0)
         zeroLatencyHeight =
             std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
@@ -1788,7 +1793,8 @@ void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
   NodesAdded.insert(SU);
   for (auto &SI : SU->Succs) {
     SUnit *Successor = SI.getSUnit();
-    if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
+    if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
+        NodesAdded.count(Successor) == 0)
       addConnectedNodes(Successor, NewSet, NodesAdded);
   }
   for (auto &PI : SU->Preds) {
@@ -2080,6 +2086,11 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     });
   } while (++NI != NE && scheduleFound);

+  // If a schedule is found, ensure non-pipelined instructions are in stage 0
+  if (scheduleFound)
+    scheduleFound =
+        Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
+
   // If a schedule is found, check if it is a valid schedule too.
   if (scheduleFound)
     scheduleFound = Schedule.isValidSchedule(this);
@@ -2263,7 +2274,7 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
                                          bool isSucc) {
   if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
-      Dep.isArtificial())
+      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
     return false;

   if (!SwpPruneLoopCarried)
@@ -2430,7 +2441,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) {
   while (!Worklist.empty()) {
     const SDep &Cur = Worklist.pop_back_val();
     SUnit *SuccSU = Cur.getSUnit();
-    if (Visited.count(SuccSU))
+    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
       continue;
     std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
     if (it == InstrToCycle.end())
@@ -2697,21 +2708,91 @@ bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
   return false;
 }

+/// Determine transitive dependences of unpipelineable instructions
+SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DoNotPipeline;
+  SmallVector<SUnit *, 8> Worklist;
+
+  for (auto &SU : SSD->SUnits)
+    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
+      Worklist.push_back(&SU);
+
+  while (!Worklist.empty()) {
+    auto SU = Worklist.pop_back_val();
+    if (DoNotPipeline.count(SU))
+      continue;
+    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
+    DoNotPipeline.insert(SU);
+    for (auto &Dep : SU->Preds)
+      Worklist.push_back(Dep.getSUnit());
+    if (SU->getInstr()->isPHI())
+      for (auto &Dep : SU->Succs)
+        if (Dep.getKind() == SDep::Anti)
+          Worklist.push_back(Dep.getSUnit());
+  }
+  return DoNotPipeline;
+}
+
+// Determine all instructions upon which any unpipelineable instruction depends
+// and ensure that they are in stage 0.  If unable to do so, return false.
+bool SMSchedule::normalizeNonPipelinedInstructions(
+    SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+  SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
+
+  int NewLastCycle = INT_MIN;
+  for (SUnit &SU : SSD->SUnits) {
+    if (!SU.isInstr())
+      continue;
+    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
+      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
+      continue;
+    }
+
+    // Put the non-pipelined instruction as early as possible in the schedule
+    int NewCycle = getFirstCycle();
+    for (auto &Dep : SU.Preds)
+      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
+
+    int OldCycle = InstrToCycle[&SU];
+    if (OldCycle != NewCycle) {
+      InstrToCycle[&SU] = NewCycle;
+      auto &OldS = getInstructions(OldCycle);
+      OldS.erase(std::remove(OldS.begin(), OldS.end(), &SU), OldS.end());
+      getInstructions(NewCycle).emplace_back(&SU);
+      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
+                        << ") is not pipelined; moving from cycle " << OldCycle
+                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
+    }
+    NewLastCycle = std::max(NewLastCycle, NewCycle);
+  }
+  LastCycle = NewLastCycle;
+  return true;
+}
+
 // Check if the generated schedule is valid. This function checks if
 // an instruction that uses a physical register is scheduled in a
 // different stage than the definition. The pipeliner does not handle
 // physical register values that may cross a basic block boundary.
+// Furthermore, if a physical def/use pair is assigned to the same
+// cycle, orderDependence does not guarantee def/use ordering, so that
+// case should be considered invalid. (The test checks for both
+// earlier and same-cycle use to be more robust.)
 bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
   for (SUnit &SU : SSD->SUnits) {
     if (!SU.hasPhysRegDefs)
       continue;
     int StageDef = stageScheduled(&SU);
+    int CycleDef = InstrToCycle[&SU];
     assert(StageDef != -1 && "Instruction should have been scheduled.");
     for (auto &SI : SU.Succs)
-      if (SI.isAssignedRegDep())
-        if (Register::isPhysicalRegister(SI.getReg()))
+      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
+        if (Register::isPhysicalRegister(SI.getReg())) {
           if (stageScheduled(SI.getSUnit()) != StageDef)
             return false;
+          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
+            return false;
+        }
   }
   return true;
 }

llvm/lib/CodeGen/ModuloSchedule.cpp

Lines changed: 10 additions & 7 deletions
@@ -158,7 +158,7 @@ void ModuloScheduleExpander::generatePipelinedLoop() {

   SmallVector<MachineBasicBlock *, 4> EpilogBBs;
   // Generate the epilog instructions to complete the pipeline.
-  generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs);
+  generateEpilog(MaxStageCount, KernelBB, BB, VRMap, EpilogBBs, PrologBBs);

   // We need this step because the register allocation doesn't handle some
   // situations well, so we insert copies to help out.
@@ -240,11 +240,9 @@ void ModuloScheduleExpander::generateProlog(unsigned LastStage,
 /// Generate the pipeline epilog code. The epilog code finishes the iterations
 /// that were started in either the prolog or the kernel.  We create a basic
 /// block for each stage that needs to complete.
-void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
-                                            MachineBasicBlock *KernelBB,
-                                            ValueMapTy *VRMap,
-                                            MBBVectorTy &EpilogBBs,
-                                            MBBVectorTy &PrologBBs) {
+void ModuloScheduleExpander::generateEpilog(
+    unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
+    ValueMapTy *VRMap, MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs) {
   // We need to change the branch from the kernel to the first epilog block, so
   // this call to analyze branch uses the kernel rather than the original BB.
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -314,7 +312,12 @@ void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
   // Create a branch to the new epilog from the kernel.
   // Remove the original branch and add a new branch to the epilog.
   TII->removeBranch(*KernelBB);
-  TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
+  assert((OrigBB == TBB || OrigBB == FBB) &&
+         "Unable to determine looping branch direction");
+  if (OrigBB != TBB)
+    TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
+  else
+    TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
   // Add a branch to the loop exit.
   if (EpilogBBs.size() > 0) {
     MachineBasicBlock *LastEpilogBB = EpilogBBs.back();

llvm/lib/Target/ARM/ARM.td

Lines changed: 5 additions & 0 deletions
@@ -494,6 +494,10 @@ def FeatureNoNegativeImmediates
 def FeatureUseMISched: SubtargetFeature<"use-misched", "UseMISched", "true",
                                         "Use the MachineScheduler">;

+// Use the MachinePipeliner for instruction scheduling for the subtarget.
+def FeatureUseMIPipeliner: SubtargetFeature<"use-mipipeliner", "UseMIPipeliner", "true",
+                                            "Use the MachinePipeliner">;
+
 // False if scheduling should happen again after register allocation.
 def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
     "DisablePostRAScheduler", "true",
@@ -1395,6 +1399,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em,
 def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em,
                                                   ProcM7,
                                                   FeatureFPARMv8_D16,
+                                                  FeatureUseMIPipeliner,
                                                   FeatureUseMISched]>;

 def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
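
The TableGen changes above define the feature bit and attach it to the Cortex-M7 model. The subtarget code that consumes the bit is among the 13 changed files but is not excerpted on this page. As a hedged sketch of the usual wiring (the class and member names below are placeholders, not taken from this commit): the MachinePipeliner pass queries TargetSubtargetInfo::enableMachinePipeliner() before attempting to pipeline, so a target normally forwards its feature bit from that override.

// Sketch only: typical plumbing for a "use-mipipeliner"-style feature bit.
// "MyTargetSubtarget" and "UseMIPipeliner" are placeholders; the real ARM
// subtarget change is not part of the excerpted diff.
bool MyTargetSubtarget::enableMachinePipeliner() const {
  // MachinePipeliner::runOnMachineFunction() bails out early unless the
  // subtarget opts in through this TargetSubtargetInfo hook.
  return UseMIPipeliner; // TableGen-generated from the SubtargetFeature above
}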

llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp

Lines changed: 74 additions & 0 deletions
@@ -6721,3 +6721,77 @@ unsigned llvm::getBLXpredOpcode(const MachineFunction &MF) {
   return (MF.getSubtarget<ARMSubtarget>().hardenSlsBlr()) ? ARM::BLX_pred_noip
                                                           : ARM::BLX_pred;
 }
+
+namespace {
+class ARMPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  MachineInstr *EndLoop, *LoopCount;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+
+  // Meanings of the various stuff with loop types:
+  // t2Bcc:
+  //   Loop = null -- there is no setup.
+  //   EndLoop = branch at end of original BB that will become a kernel
+  //   LoopCount = CC setter live into branch
+public:
+  ARMPipelinerLoopInfo(MachineInstr *EndLoop, MachineInstr *LoopCount)
+      : EndLoop(EndLoop), LoopCount(LoopCount),
+        MF(EndLoop->getParent()->getParent()),
+        TII(MF->getSubtarget().getInstrInfo()) {}
+
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    // Only ignore the terminator.
+    return MI == EndLoop || MI == LoopCount;
+  }
+
+  Optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &Cond) override {
+
+    if (isCondBranchOpcode(EndLoop->getOpcode())) {
+      Cond.push_back(EndLoop->getOperand(1));
+      Cond.push_back(EndLoop->getOperand(2));
+      if (EndLoop->getOperand(0).getMBB() == EndLoop->getParent()) {
+        TII->reverseBranchCondition(Cond);
+      }
+      return {};
+    } else
+      llvm_unreachable("Unknown EndLoop");
+  }
+
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+ARMBaseInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock::iterator I = LoopBB->getFirstTerminator();
+  MachineBasicBlock *Preheader = *LoopBB->pred_begin();
+  if (Preheader == LoopBB)
+    Preheader = *std::next(LoopBB->pred_begin());
+
+  if (I != LoopBB->end() && I->getOpcode() == ARM::t2Bcc) {
+    // If the branch is a Bcc, then the CPSR should be set somewhere within the
+    // block.  We need to determine the reaching definition of CPSR so that
+    // it can be marked as non-pipelineable, allowing the pipeliner to force
+    // it into stage 0 or give up if it cannot or will not do so.
+    MachineInstr *CCSetter = nullptr;
+    for (auto &L : LoopBB->instrs()) {
+      if (L.isCall())
+        return nullptr;
+      if (isCPSRDefined(L))
+        CCSetter = &L;
+    }
+    if (CCSetter)
+      return std::make_unique<ARMPipelinerLoopInfo>(&*I, CCSetter);
+    else
+      return nullptr; // Unable to find the CC setter, so unable to guarantee
+                      // that pipeline will work
+  }
+
+  return nullptr;
+}

llvm/lib/Target/ARM/ARMBaseInstrInfo.h

Lines changed: 5 additions & 0 deletions
@@ -372,6 +372,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
            MI->getOpcode() == ARM::t2WhileLoopStartTP;
   }

+  /// Analyze loop L, which must be a single-basic-block loop, and if the
+  /// conditions can be understood enough produce a PipelinerLoopInfo object.
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
 private:
   /// Returns an unused general-purpose register which can be used for
   /// constructing an outlined call if one exists. Returns 0 otherwise.
