Skip to content

MachineScheduler: Improve instruction clustering #137784

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions llvm/include/llvm/CodeGen/MachineScheduler.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,10 +303,6 @@ class ScheduleDAGMI : public ScheduleDAGInstrs {
/// The bottom of the unscheduled zone.
MachineBasicBlock::iterator CurrentBottom;

/// Record the next node in a scheduled cluster.
const SUnit *NextClusterPred = nullptr;
const SUnit *NextClusterSucc = nullptr;

#if LLVM_ENABLE_ABI_BREAKING_CHECKS
/// The number of instructions scheduled so far. Used to cut off the
/// scheduler at the point determined by misched-cutoff.
Expand Down Expand Up @@ -367,10 +363,6 @@ class ScheduleDAGMI : public ScheduleDAGInstrs {
/// live ranges and region boundary iterators.
void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);

const SUnit *getNextClusterPred() const { return NextClusterPred; }

const SUnit *getNextClusterSucc() const { return NextClusterSucc; }

void viewGraph(const Twine &Name, const Twine &Title) override;
void viewGraph() override;

Expand Down Expand Up @@ -1295,6 +1287,9 @@ class GenericScheduler : public GenericSchedulerBase {
SchedBoundary Top;
SchedBoundary Bot;

ClusterInfo *TopCluster;
ClusterInfo *BotCluster;

/// Candidate last picked from Top boundary.
SchedCandidate TopCand;
/// Candidate last picked from Bot boundary.
Expand Down Expand Up @@ -1335,6 +1330,9 @@ class PostGenericScheduler : public GenericSchedulerBase {
/// Candidate last picked from Bot boundary.
SchedCandidate BotCand;

ClusterInfo *TopCluster;
ClusterInfo *BotCluster;

public:
PostGenericScheduler(const MachineSchedContext *C)
: GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
Expand Down
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/ScheduleDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator.h"
#include "llvm/CodeGen/MachineInstr.h"
Expand Down Expand Up @@ -234,6 +235,10 @@ class TargetRegisterInfo;
void dump(const TargetRegisterInfo *TRI = nullptr) const;
};

/// Keep record of which SUnit are in the same cluster group.
typedef SmallSet<SUnit *, 8> ClusterInfo;
constexpr unsigned InvalidClusterId = ~0u;

/// Scheduling unit. This is a node in the scheduling DAG.
class SUnit {
private:
Expand Down Expand Up @@ -274,6 +279,8 @@ class TargetRegisterInfo;
unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready.
unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready.

unsigned ParentClusterIdx = InvalidClusterId; ///< The parent cluster id.

private:
unsigned Depth = 0; ///< Node depth.
unsigned Height = 0; ///< Node height.
Expand Down
10 changes: 10 additions & 0 deletions llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ namespace llvm {
/// case of a huge region that gets reduced).
SUnit *BarrierChain = nullptr;

SmallVector<ClusterInfo> Clusters;

public:
/// A list of SUnits, used in Value2SUsMap, during DAG construction.
/// Note: to gain speed it might be worth investigating an optimized
Expand Down Expand Up @@ -383,6 +385,14 @@ namespace llvm {
/// equivalent edge already existed (false indicates failure).
bool addEdge(SUnit *SuccSU, const SDep &PredDep);

/// Returns the array of the clusters.
SmallVector<ClusterInfo> &getClusters() { return Clusters; }

/// Get the specific cluster, return nullptr for InvalidClusterId.
ClusterInfo *getCluster(unsigned Idx) {
return Idx != InvalidClusterId ? &Clusters[Idx] : nullptr;
}

protected:
void initSUnits();
void addPhysRegDataDeps(SUnit *SU, unsigned OperIdx);
Expand Down
75 changes: 52 additions & 23 deletions llvm/lib/CodeGen/MachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
Expand Down Expand Up @@ -943,8 +944,6 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {

if (SuccEdge->isWeak()) {
--SuccSU->WeakPredsLeft;
if (SuccEdge->isCluster())
NextClusterSucc = SuccSU;
return;
}
#ifndef NDEBUG
Expand Down Expand Up @@ -980,8 +979,6 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {

if (PredEdge->isWeak()) {
--PredSU->WeakSuccsLeft;
if (PredEdge->isCluster())
NextClusterPred = PredSU;
return;
}
#ifndef NDEBUG
Expand Down Expand Up @@ -1177,11 +1174,8 @@ findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
}

/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
ArrayRef<SUnit*> BotRoots) {
NextClusterSucc = nullptr;
NextClusterPred = nullptr;

void ScheduleDAGMI::initQueues(ArrayRef<SUnit *> TopRoots,
ArrayRef<SUnit *> BotRoots) {
// Release all DAG roots for scheduling, not including EntrySU/ExitSU.
//
// Nodes with unreleased weak edges can still be roots.
Expand Down Expand Up @@ -2109,6 +2103,7 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ScheduleDAGInstrs *DAG) {
// Keep track of the current cluster length and bytes for each SUnit.
DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
EquivalenceClasses<SUnit *> Clusters;

// At this point, `MemOpRecords` array must hold atleast two mem ops. Try to
// cluster mem ops collected within `MemOpRecords` array.
Expand Down Expand Up @@ -2148,13 +2143,15 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(

SUnit *SUa = MemOpa.SU;
SUnit *SUb = MemOpb.SU;

if (!ReorderWhileClustering && SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);

// FIXME: Is this check really required?
if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
continue;

Clusters.unionSets(SUa, SUb);
LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
<< SUb->NodeNum << ")\n");
++NumClustered;
Expand Down Expand Up @@ -2194,6 +2191,21 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
<< ", Curr cluster bytes: " << CurrentClusterBytes
<< "\n");
}

// Add cluster group information.
// Iterate over all of the equivalence sets.
auto &AllClusters = DAG->getClusters();
for (const EquivalenceClasses<SUnit *>::ECValue *I : Clusters) {
if (!I->isLeader())
continue;
ClusterInfo Group;
unsigned ClusterIdx = AllClusters.size();
for (SUnit *MemberI : Clusters.members(*I)) {
MemberI->ParentClusterIdx = ClusterIdx;
Group.insert(MemberI);
}
AllClusters.push_back(Group);
}
}

void BaseMemOpClusterMutation::collectMemOpRecords(
Expand Down Expand Up @@ -3681,6 +3693,9 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
}
TopCand.SU = nullptr;
BotCand.SU = nullptr;

TopCluster = nullptr;
BotCluster = nullptr;
}

/// Initialize the per-region scheduling policy.
Expand Down Expand Up @@ -3990,13 +4005,11 @@ bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU,
TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

if (SameBoundary) {
Expand Down Expand Up @@ -4255,11 +4268,25 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
TopCluster = DAG->getCluster(SU->ParentClusterIdx);
LLVM_DEBUG(if (TopCluster) {
dbgs() << " Top Cluster: ";
for (auto *N : *TopCluster)
dbgs() << N->NodeNum << '\t';
dbgs() << '\n';
});
Top.bumpNode(SU);
if (SU->hasPhysRegUses)
reschedulePhysReg(SU, true);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
BotCluster = DAG->getCluster(SU->ParentClusterIdx);
LLVM_DEBUG(if (BotCluster) {
dbgs() << " Bot Cluster: ";
for (auto *N : *BotCluster)
dbgs() << N->NodeNum << '\t';
dbgs() << '\n';
});
Bot.bumpNode(SU);
if (SU->hasPhysRegDefs)
reschedulePhysReg(SU, false);
Expand Down Expand Up @@ -4316,6 +4343,8 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
if (!Bot.HazardRec) {
Bot.HazardRec = DAG->TII->CreateTargetMIHazardRecognizer(Itin, DAG);
}
TopCluster = nullptr;
BotCluster = nullptr;
}

void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
Expand Down Expand Up @@ -4380,14 +4409,12 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;

// Keep clustered nodes together.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
Expand Down Expand Up @@ -4584,9 +4611,11 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
TopCluster = DAG->getCluster(SU->ParentClusterIdx);
Top.bumpNode(SU);
} else {
SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
BotCluster = DAG->getCluster(SU->ParentClusterIdx);
Bot.bumpNode(SU);
}
}
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/CodeGen/MacroFusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
for (SDep &SI : SecondSU.Preds)
if (SI.isCluster())
return false;

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the MacroFusion changes have no effect, do I understand it correctly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a little bit confusing here. The "cluster" weak edge actually does not function as before. I thought about removing them at very first, but it is hard to detect like:

  // Check that neither instr is already paired with another along the edge
  // between them."

We only keep the set of SUnit in a cluster group, no order anymore, so we cannot detect "the edge between them". I am not sure whether it is important to check for this pattern. Maybe we just need to check that they are not being clustered with anyone else? I don't have clear answer yet. So, I still keep the "cluster" weak edge. But the way nodes will be clustered during scheduling will only be determined by the Clusters in the DAG. Maybe we can cleanup it later if people have clear idea how to best handle this.

unsigned FirstCluster = FirstSU.ParentClusterIdx;
unsigned SecondCluster = SecondSU.ParentClusterIdx;
assert(FirstCluster == InvalidClusterId && SecondCluster == InvalidClusterId);

// Though the reachability checks above could be made more generic,
// perhaps as part of ScheduleDAGInstrs::addEdge(), since such edges are valid,
// the extra computation cost makes it less interesting in general cases.
Expand All @@ -70,6 +75,14 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
if (!DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)))
return false;

auto &Clusters = DAG.getClusters();

FirstSU.ParentClusterIdx = Clusters.size();
SecondSU.ParentClusterIdx = Clusters.size();

SmallSet<SUnit *, 8> Cluster{{&FirstSU, &SecondSU}};
Clusters.emplace_back(Cluster);

// TODO - If we want to chain more than two instructions, we need to create
// artifical edges to make dependencies from the FirstSU also dependent
// on other chained instructions, and other chained instructions also
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/ScheduleDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,9 @@ LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeName(const SUnit &SU) const {
LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeAll(const SUnit &SU) const {
dumpNode(SU);
SU.dumpAttributes();
if (SU.ParentClusterIdx != InvalidClusterId)
dbgs() << " Parent Cluster Index: " << SU.ParentClusterIdx << '\n';

if (SU.Preds.size() > 0) {
dbgs() << " Predecessors:\n";
for (const SDep &Dep : SU.Preds) {
Expand Down
22 changes: 10 additions & 12 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -589,12 +589,11 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid increasing the max critical pressure in the scheduled region.
Expand Down Expand Up @@ -664,12 +663,11 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,

// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// We only compare a subset of features when comparing nodes between
Expand Down
18 changes: 10 additions & 8 deletions llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,11 @@ bool PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

if (SameBoundary) {
Expand Down Expand Up @@ -190,8 +189,11 @@ bool PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;

// Keep clustered nodes together.
if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
Cand.SU == DAG->getNextClusterSucc(), TryCand, Cand, Cluster))
const ClusterInfo *CandCluster = Cand.AtTop ? TopCluster : BotCluster;
const ClusterInfo *TryCandCluster = TryCand.AtTop ? TopCluster : BotCluster;
if (tryGreater(TryCandCluster && TryCandCluster->contains(TryCand.SU),
CandCluster && CandCluster->contains(Cand.SU), TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;

// Avoid critical resource consumption and balance the schedule.
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -581,9 +581,8 @@ define void @callee_in_memory(%T_IN_MEMORY %a) {
; CHECK-SD-NEXT: add x8, x8, :lo12:in_memory_store
; CHECK-SD-NEXT: ldr d0, [sp, #64]
; CHECK-SD-NEXT: str d0, [x8, #64]
; CHECK-SD-NEXT: ldr q0, [sp, #16]
; CHECK-SD-NEXT: str q2, [x8, #48]
; CHECK-SD-NEXT: ldr q2, [sp]
; CHECK-SD-NEXT: ldp q2, q0, [sp]
; CHECK-SD-NEXT: stp q0, q1, [x8, #16]
; CHECK-SD-NEXT: str q2, [x8]
; CHECK-SD-NEXT: ret
Expand Down
Loading
Loading