Skip to content

[MachinePipeliner] Use AliasAnalysis properly when analyzing loop-carried dependencies #136691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 23, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions llvm/include/llvm/CodeGen/MachinePipeliner.h
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,13 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
/// Ordered list of DAG postprocessing steps.
std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;

/// Used to compute single-iteration dependencies (i.e., buildSchedGraph).
AliasAnalysis *AA;

/// Used to compute loop-carried dependencies (i.e.,
/// addLoopCarriedDependences).
BatchAAResults BAA;

/// Helper class to implement Johnson's circuit finding algorithm.
class Circuits {
std::vector<SUnit> &SUnits;
Expand Down Expand Up @@ -323,13 +330,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
public:
SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
const RegisterClassInfo &rci, unsigned II,
TargetInstrInfo::PipelinerLoopInfo *PLI)
TargetInstrInfo::PipelinerLoopInfo *PLI, AliasAnalysis *AA)
: ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
Topo(SUnits, &ExitSU) {
Topo(SUnits, &ExitSU), AA(AA), BAA(*AA) {
P.MF->getSubtarget().getSMSMutations(Mutations);
if (SwpEnableCopyToPhi)
Mutations.push_back(std::make_unique<CopyToPhiMutation>());
BAA.enableCrossIterationMode();
}

void schedule() override;
Expand Down Expand Up @@ -394,7 +402,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
const MachineInstr *OtherMI) const;

private:
void addLoopCarriedDependences(AAResults *AA);
void addLoopCarriedDependences();
void updatePhiDependences();
void changeDependences();
unsigned calculateResMII();
Expand Down
225 changes: 137 additions & 88 deletions llvm/lib/CodeGen/MachinePipeliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,37 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
"Modulo Software Pipelining", false, false)

namespace {

/// This class holds an SUnit corresponding to a memory operation and other
/// information related to the instruction.
struct SUnitWithMemInfo {
/// The scheduling unit whose instruction this information describes.
SUnit *SU;
/// Underlying objects of the memory operand value, filled in by
/// getUnderlyingObjects().
SmallVector<const Value *, 2> UnderlyingObjs;

/// The value of a memory operand.
const Value *MemOpValue = nullptr;

/// The offset of a memory operand.
int64_t MemOpOffset = 0;

/// Alias-analysis metadata copied from the memory operand.
AAMDNodes AATags;

/// True if all the underlying objects are identified.
bool IsAllIdentified = false;

/// Collects the memory information for the instruction of \p SU.
SUnitWithMemInfo(SUnit *SU);

/// Returns true if both this and \p Other have only identified underlying
/// objects and the two sets of underlying objects share no element.
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;

/// Returns true if no memory operand value was recorded, i.e. MemOpValue is
/// still null.
bool isUnknown() const { return MemOpValue == nullptr; }

private:
/// Fills in MemOpValue, MemOpOffset, UnderlyingObjs and AATags from the
/// instruction's memory operand. Returns false if the instruction does not
/// have exactly one memory operand or that operand has no value.
bool getUnderlyingObjects();
};

} // end anonymous namespace

/// The "main" function for implementing Swing Modulo Scheduling.
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
if (skipFunction(mf.getFunction()))
Expand Down Expand Up @@ -470,9 +501,10 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");

AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SwingSchedulerDAG SMS(
*this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
II_setByPragma, LI.LoopPipelinerInfo.get());
II_setByPragma, LI.LoopPipelinerInfo.get(), AA);

MachineBasicBlock *MBB = L.getHeader();
// The kernel should not include any terminator instructions. These
Expand Down Expand Up @@ -560,9 +592,8 @@ void SwingSchedulerDAG::setMAX_II() {
/// We override the schedule function in ScheduleDAGInstrs to implement the
/// scheduling part of the Swing Modulo Scheduling algorithm.
void SwingSchedulerDAG::schedule() {
AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
buildSchedGraph(AA);
addLoopCarriedDependences(AA);
addLoopCarriedDependences();
updatePhiDependences();
Topo.InitDAGTopologicalSorting();
changeDependences();
Expand Down Expand Up @@ -810,113 +841,131 @@ static bool isDependenceBarrier(MachineInstr &MI) {
(!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
}

/// Gather the memory-operand information for \p SU and record whether every
/// underlying object of the access is an identified object.
SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
  // When the underlying objects cannot be collected, IsAllIdentified keeps its
  // in-class default of false so isTriviallyDisjoint() stays conservative.
  if (!getUnderlyingObjects())
    return;

  // Assume every object is identified until one proves otherwise. Without this
  // assignment the flag could never become true (its default is false and the
  // loop below only ever clears it), which would make isTriviallyDisjoint()
  // return false unconditionally.
  IsAllIdentified = true;
  for (const Value *Obj : UnderlyingObjs)
    if (!isIdentifiedObject(Obj)) {
      IsAllIdentified = false;
      break;
    }
}

bool SUnitWithMemInfo::isTriviallyDisjoint(
const SUnitWithMemInfo &Other) const {
// If all underlying objects are identified objects and there is no overlap
// between them, then these two instructions are disjoint.
if (!IsAllIdentified || !Other.IsAllIdentified)
return false;
for (const Value *Obj : UnderlyingObjs)
if (llvm::is_contained(Other.UnderlyingObjs, Obj))
return false;
return true;
}

/// Collect the underlying objects for the memory references of an instruction.
/// This function calls the code in ValueTracking, but first checks that the
/// instruction has a memory operand.
static void getUnderlyingObjects(const MachineInstr *MI,
SmallVectorImpl<const Value *> &Objs) {
/// Returns false if we cannot find the underlying objects.
bool SUnitWithMemInfo::getUnderlyingObjects() {
const MachineInstr *MI = SU->getInstr();
if (!MI->hasOneMemOperand())
return;
return false;
MachineMemOperand *MM = *MI->memoperands_begin();
if (!MM->getValue())
return;
getUnderlyingObjects(MM->getValue(), Objs);
for (const Value *V : Objs) {
if (!isIdentifiedObject(V)) {
Objs.clear();
return;
}
}
return false;
MemOpValue = MM->getValue();
MemOpOffset = MM->getOffset();
llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);

// TODO: A no alias scope may be valid only in a single iteration. In this
// case we need to peel it off like LoopAccessAnalysis does.
AATags = MM->getAAInfo();
return true;
}

/// Add a chain edge between a load and store if the store can be an
/// alias of the load on a subsequent iteration, i.e., a loop carried
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
/// but that code doesn't create loop carried dependences.
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
Value *UnknownValue =
UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
void SwingSchedulerDAG::addLoopCarriedDependences() {
SmallVector<SUnitWithMemInfo, 4> PendingLoads;
for (auto &SU : SUnits) {
MachineInstr &MI = *SU.getInstr();
if (isDependenceBarrier(MI))
PendingLoads.clear();
else if (MI.mayLoad()) {
SmallVector<const Value *, 4> Objs;
::getUnderlyingObjects(&MI, Objs);
if (Objs.empty())
Objs.push_back(UnknownValue);
for (const auto *V : Objs) {
SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
SUs.push_back(&SU);
}
PendingLoads.emplace_back(&SU);
} else if (MI.mayStore()) {
SmallVector<const Value *, 4> Objs;
::getUnderlyingObjects(&MI, Objs);
if (Objs.empty())
Objs.push_back(UnknownValue);
for (const auto *V : Objs) {
MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
PendingLoads.find(V);
if (I == PendingLoads.end())
SUnitWithMemInfo Store(&SU);
for (const SUnitWithMemInfo &Load : PendingLoads) {
if (Load.isTriviallyDisjoint(Store))
continue;
for (auto *Load : I->second) {
if (isSuccOrder(Load, &SU))
continue;
MachineInstr &LdMI = *Load->getInstr();
// First, perform the cheaper check that compares the base register.
// If they are the same and the load offset is less than the store
// offset, then mark the dependence as loop carried potentially.
const MachineOperand *BaseOp1, *BaseOp2;
int64_t Offset1, Offset2;
bool Offset1IsScalable, Offset2IsScalable;
if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
Offset1IsScalable, TRI) &&
TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
Offset2IsScalable, TRI)) {
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
Offset1IsScalable == Offset2IsScalable &&
(int)Offset1 < (int)Offset2) {
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
"What happened to the chain edge?");
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
}
// Second, the more expensive check that uses alias analysis on the
// base registers. If they alias, and the load offset is less than
// the store offset, the mark the dependence as loop carried.
if (!AA) {
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
MachineMemOperand *MMO2 = *MI.memoperands_begin();
if (!MMO1->getValue() || !MMO2->getValue()) {
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
if (MMO1->getValue() == MMO2->getValue() &&
MMO1->getOffset() <= MMO2->getOffset()) {
SDep Dep(Load, SDep::Barrier);
if (isSuccOrder(Load.SU, Store.SU))
continue;
MachineInstr &LdMI = *Load.SU->getInstr();
// First, perform the cheaper check that compares the base register.
// If they are the same and the load offset is less than the store
// offset, then mark the dependence as loop carried potentially.
const MachineOperand *BaseOp1, *BaseOp2;
int64_t Offset1, Offset2;
bool Offset1IsScalable, Offset2IsScalable;
if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
Offset1IsScalable, TRI) &&
TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
Offset2IsScalable, TRI)) {
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
Offset1IsScalable == Offset2IsScalable &&
(int)Offset1 < (int)Offset2) {
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
"What happened to the chain edge?");
SDep Dep(Load.SU, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
if (!AA->isNoAlias(
MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
MemoryLocation::getAfter(MMO2->getValue(),
MMO2->getAAInfo()))) {
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
}
}
// Second, the more expensive check that uses alias analysis on the
// base registers. If they alias, and the load offset is less than
// the store offset, then mark the dependence as loop carried.
if (Load.isUnknown() || Store.isUnknown()) {
SDep Dep(Load.SU, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}
if (Load.MemOpValue == Store.MemOpValue &&
Load.MemOpOffset <= Store.MemOpOffset) {
SDep Dep(Load.SU, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
continue;
}

bool IsNoAlias = [&] {
if (BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(Load.MemOpValue,
Load.AATags),
MemoryLocation::getBeforeOrAfter(Store.MemOpValue,
Store.AATags)))
return true;

// AliasAnalysis sometimes gives up on following the underlying
// object. In such a case, separate checks for underlying objects may
// prove that there are no aliases between two accesses.
for (const Value *LoadObj : Load.UnderlyingObjs)
for (const Value *StoreObj : Store.UnderlyingObjs)
if (!BAA.isNoAlias(
MemoryLocation::getBeforeOrAfter(LoadObj, Load.AATags),
MemoryLocation::getBeforeOrAfter(StoreObj, Store.AATags)))
return false;

return true;
}();

if (!IsNoAlias) {
SDep Dep(Load.SU, SDep::Barrier);
Dep.setLatency(1);
SU.addPred(Dep);
}
}
}
Expand Down
72 changes: 72 additions & 0 deletions llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
# REQUIRES: asserts

# Test that pipeliner correctly detects the loop-carried dependency between the
# load and the store, which is indicated by `Ord` dependency from SU(2) to
# SU(4). Note that there is no dependency within a single iteration.

# CHECK: SU(2): %7:intregs = L2_loadri_io %5:intregs, 0 :: (load (s32) from %ir.ptr.load)
# CHECK-NEXT: # preds left
# CHECK-NEXT: # succs left
# CHECK-NEXT: # rdefs left
# CHECK-NEXT: Latency
# CHECK-NEXT: Depth
# CHECK-NEXT: Height
# CHECK-NEXT: Predecessors:
# CHECK-NEXT: SU(0): Data Latency=0 Reg=%5
# CHECK-NEXT: Successors:
# CHECK-DAG: SU(3): Data Latency=2 Reg=%7
# CHECK-DAG: SU(4): Ord Latency=1 Barrier
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an example of a dependency that has been missed.

# CHECK-NEXT: SU(3): %8:intregs = F2_sfadd %7:intregs, %3:intregs, implicit $usr
# CHECK: SU(4): S2_storeri_io %6:intregs, 0, %8:intregs :: (store (s32) into %ir.ptr.store)


--- |
define void @foo(ptr noalias %p0, ptr noalias %p1, i32 %n) {
entry:
br label %body

body: ; preds = %body, %entry
%i = phi i32 [ 0, %entry ], [ %i.next, %body ]
%ptr.load = phi ptr [ %p0, %entry ], [ %p1, %body ]
%ptr.store = phi ptr [ %p1, %entry ], [ %p0, %body ]
%v = load float, ptr %ptr.load, align 4
%add = fadd float %v, 1.000000e+00
store float %add, ptr %ptr.store, align 4
%i.next = add i32 %i, 1
%cond = icmp slt i32 %i.next, %n
br i1 %cond, label %body, label %exit

exit: ; preds = %body
ret void
}
...
---
name: foo
tracksRegLiveness: true
body: |
bb.0.entry:
successors: %bb.1(0x80000000)
liveins: $r0, $r1, $r2

%6:intregs = COPY $r2
%5:intregs = COPY $r1
%4:intregs = COPY $r0
%9:intregs = A2_tfrsi 1065353216
%12:intregs = COPY %6
J2_loop0r %bb.1, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr

bb.1.body (machine-block-address-taken):
successors: %bb.1(0x7c000000), %bb.2(0x04000000)

%1:intregs = PHI %4, %bb.0, %5, %bb.1
%2:intregs = PHI %5, %bb.0, %4, %bb.1
%8:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.ptr.load)
%10:intregs = F2_sfadd killed %8, %9, implicit $usr
S2_storeri_io %2, 0, killed %10 :: (store (s32) into %ir.ptr.store)
ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
J2_jump %bb.2, implicit-def dead $pc

bb.2.exit:
PS_jmpret $r31, implicit-def dead $pc
...
Loading
Loading