[MemProf] Reduce cloning overhead by sharing nodes when possible (#99832)

When assigning calls to nodes while building the graph, we can share
nodes between multiple calls in some cases. Specifically, when we
process the list of calls that had the same stack ids (possibly pruned,
since we only consider the stack ids that actually had nodes in the
graph, i.e. those present in the pruned allocation MIBs), calls located
in the same function are known to behave exactly the same through
cloning and function assignment. Therefore, instead of creating nodes
for all of them (which requires context id duplication), keep a list of
additional "matching calls" on the nodes. During function assignment we
simply update all the matching calls the same way as the primary call.

This change not only reduces the number of nodes (both original and
cloned), but also greatly reduces the number of duplicated context ids
and the time to propagate them.

For a large target, I measured a 25% peak memory reduction and 42% time
reduction.
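
For readers unfamiliar with the graph code, the node-sharing scheme can be
summarized by a small standalone sketch (hypothetical names, not the actual
CallsiteContextGraph implementation): among calls that share the same
(possibly pruned) stack ids, only the first call seen in each function gets a
node; any later call from that function is recorded as a matching call on
that node, and all of them receive the same update during function
assignment.

// Standalone C++17 sketch of the idea above; all names are illustrative and
// do not come from the LLVM sources.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct CallInfo {
  std::string Name; // stand-in for the call instruction / summary record
  std::string Func; // enclosing function
};

struct ContextNode {
  CallInfo Call;                       // primary call for this node
  std::vector<CallInfo> MatchingCalls; // calls that share this node
};

int main() {
  // Calls that were found to have the same (possibly pruned) stack ids.
  std::vector<CallInfo> Calls = {{"c1", "foo"}, {"c2", "foo"}, {"c3", "bar"}};

  // Only the first call per function gets a node; later calls from the same
  // function become matching calls on that node instead of new nodes (which
  // would require duplicating context ids).
  std::map<std::string, ContextNode> FuncToNode;
  for (const auto &C : Calls) {
    auto It = FuncToNode.find(C.Func);
    if (It == FuncToNode.end())
      FuncToNode.emplace(C.Func, ContextNode{C, {}});
    else
      It->second.MatchingCalls.push_back(C);
  }

  // During function assignment, the primary call and every matching call on
  // a node would receive the same update; here we just print the grouping.
  for (const auto &[Func, Node] : FuncToNode) {
    std::cout << Func << ": primary=" << Node.Call.Name;
    for (const auto &MC : Node.MatchingCalls)
      std::cout << " matching=" << MC.Name;
    std::cout << "\n";
  }
  return 0;
}

With the inputs above this prints "foo: primary=c1 matching=c2" and
"bar: primary=c3", i.e. the two calls in foo share a single node.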
teresajohnson authored Jul 23, 2024
1 parent 7e1fcf5 commit 055e431
Showing 1 changed file with 109 additions and 15 deletions: llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -242,9 +242,16 @@ class CallsiteContextGraph {
// recursion.
bool Recursive = false;

// The corresponding allocation or interior call.
// The corresponding allocation or interior call. This is the primary call
// for which we have created this node.
CallInfo Call;

// List of other calls that can be treated the same as the primary call
// through cloning. I.e. they are located in the same function and have the
// same (possibly pruned) stack ids. They will be updated the same way as
// the primary call when assigning to function clones.
std::vector<CallInfo> MatchingCalls;

// For alloc nodes this is a unique id assigned when constructed, and for
// callsite stack nodes it is the original stack id when the node is
// constructed from the memprof MIB metadata on the alloc nodes. Note that
@@ -457,6 +464,9 @@ class CallsiteContextGraph {
/// iteration.
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

/// Records the function each call is located in.
DenseMap<CallInfo, const FuncTy *> CallToFunc;

/// Map from callsite node to the enclosing caller function.
std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

@@ -474,7 +484,8 @@
/// StackIdToMatchingCalls map.
void assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
DenseMap<CallInfo, CallInfo> &CallToMatchingCall);

/// Duplicates the given set of context ids, updating the provided
/// map from each original id with the newly generated context ids,
@@ -1230,10 +1241,11 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
assignStackNodesPostOrder(ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>>
&StackIdToMatchingCalls) {
assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>>
&StackIdToMatchingCalls,
DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
@@ -1246,7 +1258,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
// Skip any that have been removed during the recursion.
if (!Edge)
continue;
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
CallToMatchingCall);
}

// If this node's stack id is in the map, update the graph to contain new
@@ -1289,8 +1302,19 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
// Skip any for which we didn't assign any ids, these don't get a node in
// the graph.
if (SavedContextIds.empty())
if (SavedContextIds.empty()) {
// If this call has a matching call (located in the same function and
// having the same stack ids), simply add it to the context node created
// for its matching call earlier. These can be treated the same through
// cloning and get updated at the same time.
if (!CallToMatchingCall.contains(Call))
continue;
auto MatchingCall = CallToMatchingCall[Call];
assert(NonAllocationCallToContextNodeMap.contains(MatchingCall));
NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
Call);
continue;
}

assert(LastId == Ids.back());

@@ -1422,6 +1446,10 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// there is more than one call with the same stack ids. Their (possibly newly
// duplicated) context ids are saved in the StackIdToMatchingCalls map.
DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
// Save a map from each call to any that are found to match it. I.e. located
// in the same function and have the same (possibly pruned) stack ids. We use
// this to avoid creating extra graph nodes as they can be treated the same.
DenseMap<CallInfo, CallInfo> CallToMatchingCall;
for (auto &It : StackIdToMatchingCalls) {
auto &Calls = It.getSecond();
// Skip single calls with a single stack id. These don't need a new node.
@@ -1460,6 +1488,13 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
assert(!LastNodeContextIds.empty());

// Map from function to the first call from the below list (with matching
// stack ids) found in that function. Note that calls from different
// functions can have the same stack ids because this is the list of stack
// ids that had (possibly pruned) nodes after building the graph from the
// allocation MIBs.
DenseMap<const FuncTy *, CallInfo> FuncToCallMap;

for (unsigned I = 0; I < Calls.size(); I++) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
assert(SavedContextIds.empty());
@@ -1533,6 +1568,18 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
continue;
}

const FuncTy *CallFunc = CallToFunc[Call];

// If the prior call had the same stack ids this map would not be empty.
// Check if we already have a call that "matches" because it is located
// in the same function.
if (FuncToCallMap.contains(CallFunc)) {
// Record the matching call found for this call, and skip it. We
// will subsequently combine it into the same node.
CallToMatchingCall[Call] = FuncToCallMap[CallFunc];
continue;
}

// Check if the next set of stack ids is the same (since the Calls vector
// of tuples is sorted by the stack ids we can just look at the next one).
bool DuplicateContextIds = false;
@@ -1562,7 +1609,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
set_subtract(LastNodeContextIds, StackSequenceContextIds);
if (LastNodeContextIds.empty())
break;
}
// No longer possibly in a sequence of calls with duplicate stack ids,
// clear the map.
FuncToCallMap.clear();
} else
// Record the call with its function, so we can locate it the next time
// we find a call from this function when processing the calls with the
// same stack ids.
FuncToCallMap[CallFunc] = Call;
}
}

@@ -1579,7 +1633,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// associated context ids over to the new nodes.
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap)
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
CallToMatchingCall);
if (VerifyCCG)
check();
}
@@ -1679,6 +1734,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
continue;
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
CallsWithMetadata.push_back(&I);
CallToFunc[&I] = &F;
auto *AllocNode = addAllocNode(&I, &F);
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
assert(CallsiteMD);
@@ -1700,8 +1756,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
// For callsite metadata, add to list for this function for later use.
else if (I.getMetadata(LLVMContext::MD_callsite))
else if (I.getMetadata(LLVMContext::MD_callsite)) {
CallsWithMetadata.push_back(&I);
CallToFunc[&I] = &F;
}
}
}
if (!CallsWithMetadata.empty())
Expand Down Expand Up @@ -1756,8 +1814,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
// correlate properly in applyImport in the backends.
if (AN.MIBs.empty())
continue;
CallsWithMetadata.push_back({&AN});
auto *AllocNode = addAllocNode({&AN}, FS);
IndexCall AllocCall(&AN);
CallsWithMetadata.push_back(AllocCall);
CallToFunc[AllocCall] = FS;
auto *AllocNode = addAllocNode(AllocCall, FS);
// Pass an empty CallStack to the CallsiteContext (second)
// parameter, since for ThinLTO we already collapsed out the inlined
// stack ids on the allocation call during ModuleSummaryAnalysis.
@@ -1788,8 +1848,11 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
}
// For callsite metadata, add to list for this function for later use.
if (!FS->callsites().empty())
for (auto &SN : FS->mutableCallsites())
CallsWithMetadata.push_back({&SN});
for (auto &SN : FS->mutableCallsites()) {
IndexCall StackNodeCall(&SN);
CallsWithMetadata.push_back(StackNodeCall);
CallToFunc[StackNodeCall] = FS;
}

if (!CallsWithMetadata.empty())
FuncToCallsWithMetadata[FS] = CallsWithMetadata;
@@ -2225,6 +2288,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
if (Recursive)
OS << " (recursive)";
OS << "\n";
if (!MatchingCalls.empty()) {
OS << "\tMatchingCalls:\n";
for (auto &MatchingCall : MatchingCalls) {
OS << "\t";
MatchingCall.print(OS);
OS << "\n";
}
}
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
OS << "\tContextIds:";
// Make a copy of the computed context ids that we can sort for stability.
@@ -2478,6 +2549,7 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
ContextNode *Clone = NodeOwner.back().get();
Node->addClone(Clone);
Clone->MatchingCalls = Node->MatchingCalls;
assert(NodeToCallingFunc.count(Node));
NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
@@ -3021,6 +3093,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
if (CallMap.count(Call))
CallClone = CallMap[Call];
CallsiteClone->setCall(CallClone);
// Need to do the same for all matching calls.
for (auto &MatchingCall : Node->MatchingCalls) {
CallInfo CallClone(MatchingCall);
if (CallMap.count(MatchingCall))
CallClone = CallMap[MatchingCall];
// Updates the call in the list.
MatchingCall = CallClone;
}
};

// Keep track of the clones of callsite Node that need to be assigned to
@@ -3187,6 +3267,16 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
CallInfo NewCall(CallMap[OrigCall]);
assert(NewCall);
NewClone->setCall(NewCall);
// Need to do the same for all matching calls.
for (auto &MatchingCall : NewClone->MatchingCalls) {
CallInfo OrigMatchingCall(MatchingCall);
OrigMatchingCall.setCloneNo(0);
assert(CallMap.count(OrigMatchingCall));
CallInfo NewCall(CallMap[OrigMatchingCall]);
assert(NewCall);
// Updates the call in the list.
MatchingCall = NewCall;
}
}
}
// Fall through to handling below to perform the recording of the
Expand Down Expand Up @@ -3373,6 +3463,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {

if (Node->IsAllocation) {
updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
assert(Node->MatchingCalls.empty());
return;
}

@@ -3381,6 +3472,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {

auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
updateCall(Node->Call, CalleeFunc);
// Update all the matching calls as well.
for (auto &Call : Node->MatchingCalls)
updateCall(Call, CalleeFunc);
};

// Performs DFS traversal starting from allocation nodes to update calls to