Skip to content

Commit

Permalink
[MemProf] Support cloning for indirect calls with ThinLTO (#110625)
Browse files Browse the repository at this point in the history
This patch enables support for cloning in indirect callsites.

This is done by synthesizing callsite records for each virtual call
target from the profile metadata. In the thin link all the synthesized
records for a particular indirect callsite initially share the same
context node, but support is added to partition the callsites and
outgoing edges based on the callee function, creating a separate node
for each target.

In the LTO backend, when cloning is needed we first perform indirect
call promotion, then change the target of the new direct call to the
desired clone.

Note that this is ThinLTO-specific, since for regular LTO, indirect call
promotion should have already occurred.
  • Loading branch information
teresajohnson authored Oct 11, 2024
1 parent 111b062 commit 1de7165
Show file tree
Hide file tree
Showing 7 changed files with 919 additions and 53 deletions.
5 changes: 5 additions & 0 deletions llvm/include/llvm/IR/ModuleSummaryIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,12 @@ struct ValueInfo {
return getRef()->second.SummaryList;
}

// An index built with GVs available can still hold entries that have no GV:
// the summary entries synthesized for profiled indirect call targets. Report
// a name only when the index carries no GVs or this entry actually has one.
bool hasName() const {
  if (!haveGVs())
    return true;
  return static_cast<bool>(getValue());
}

// Returns the name of this entry: taken from the GlobalValue when the index
// was built with GVs available, otherwise from the name stored in the entry.
// Asserts that a GV is present whenever the index has GVs.
StringRef name() const {
  const bool WithGVs = haveGVs();
  assert(!WithGVs || getRef()->second.U.GV);
  if (WithGVs)
    return getRef()->second.U.GV->getName();
  return getRef()->second.U.Name;
}
Expand Down
45 changes: 44 additions & 1 deletion llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
#ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
#define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H

#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <functional>

namespace llvm {
Expand All @@ -36,15 +38,56 @@ class MemProfContextDisambiguation
/// the IR.
bool applyImport(Module &M);

// Builds the symtab and analysis used for ICP during ThinLTO backends.
bool initializeIndirectCallPromotionInfo(Module &M);

// Data structure for saving indirect call profile info for use in ICP with
// cloning. Each record refers to a single call instruction (CB).
struct ICallAnalysisData {
// The call instruction this record describes.
CallBase *CB;
// Value profile entries for the call, owned by this record.
// NOTE(review): presumably a copy of the promotion candidates reported by
// ICallPromotionAnalysis for CB -- confirm against recordICPInfo.
std::vector<InstrProfValueData> CandidateProfileData;
// NOTE(review): presumably the number of promotion candidates reported
// alongside CandidateProfileData -- confirm against recordICPInfo.
uint32_t NumCandidates;
// NOTE(review): presumably the total profiled count for the call reported
// alongside CandidateProfileData -- confirm against recordICPInfo.
uint64_t TotalCount;
// NOTE(review): presumably the index into AllCallsites of the first
// CallsiteInfo record belonging to this call -- TODO confirm.
size_t CallsiteInfoStartIndex;
};

// Record information needed for ICP of an indirect call, depending on its
// profile information and the clone information recorded in the corresponding
// CallsiteInfo records. The SI iterator points to the current iteration point
// through AllCallsites in this function, and will be updated in this method
// as we iterate through profiled targets. The number of clones recorded for
// this indirect call is returned. The necessary information is recorded in
// the ICallAnalysisInfo list for later ICP.
unsigned recordICPInfo(CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
ArrayRef<CallsiteInfo>::iterator &SI,
SmallVector<ICallAnalysisData> &ICallAnalysisInfo);

// Actually performs any needed ICP in the function, using the information
// recorded in the ICallAnalysisInfo list.
void performICP(Module &M, ArrayRef<CallsiteInfo> AllCallsites,
ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
OptimizationRemarkEmitter &ORE);

/// Import summary containing cloning decisions for the ThinLTO backend.
const ModuleSummaryIndex *ImportSummary;

// Owns the import summary specified by internal options for testing the
// ThinLTO backend via opt (to simulate distributed ThinLTO).
std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;

// Whether we are building with SamplePGO. This is needed for correctly
// updating profile metadata on speculatively promoted calls.
bool isSamplePGO;

// Used when performing indirect call analysis and promotion when cloning in
// the ThinLTO backend during applyImport.
std::unique_ptr<InstrProfSymtab> Symtab;
std::unique_ptr<ICallPromotionAnalysis> ICallAnalysis;

public:
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr,
bool isSamplePGO = false);

PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

Expand Down
59 changes: 36 additions & 23 deletions llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ static cl::opt<std::string> ModuleSummaryDotFile(
"module-summary-dot-file", cl::Hidden, cl::value_desc("filename"),
cl::desc("File to emit dot graph of new summary into"));

// Hidden option, on by default, gating MemProf handling of indirect calls.
// When set, the function summary gets one synthesized callsite record per
// profiled target of an indirect call; when cleared, no callsite records are
// created for indirect calls.
static cl::opt<bool> EnableMemProfIndirectCallSupport(
"enable-memprof-indirect-call-support", cl::init(true), cl::Hidden,
cl::desc(
"Enable MemProf support for summarizing and cloning indirect calls"));

extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;

extern cl::opt<unsigned> MaxNumVTableAnnotations;
Expand Down Expand Up @@ -404,6 +409,11 @@ static void computeFunctionSummary(
if (HasLocalsInUsedOrAsm && CI && CI->isInlineAsm())
HasInlineAsmMaybeReferencingInternal = true;

// Compute this once per indirect call.
uint32_t NumCandidates = 0;
uint64_t TotalCount = 0;
MutableArrayRef<InstrProfValueData> CandidateProfileData;

auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
Expand Down Expand Up @@ -481,9 +491,7 @@ static void computeFunctionSummary(
}
}

uint32_t NumCandidates;
uint64_t TotalCount;
auto CandidateProfileData =
CandidateProfileData =
ICallAnalysis.getPromotionCandidatesForInstruction(&I, TotalCount,
NumCandidates);
for (const auto &Candidate : CandidateProfileData)
Expand All @@ -495,16 +503,6 @@ static void computeFunctionSummary(
if (!IsThinLTO)
continue;

// TODO: Skip indirect calls for now. Need to handle these better, likely
// by creating multiple Callsites, one per target, then speculatively
// devirtualize while applying clone info in the ThinLTO backends. This
// will also be important because we will have a different set of clone
// versions per target. This handling needs to match that in the ThinLTO
// backend so we handle things consistently for matching of callsite
// summaries to instructions.
if (!CalledFunction)
continue;

// Ensure we keep this analysis in sync with the handling in the ThinLTO
// backend (see MemProfContextDisambiguation::applyImport). Save this call
// so that we can skip it in checking the reverse case later.
Expand Down Expand Up @@ -555,13 +553,24 @@ static void computeFunctionSummary(
SmallVector<unsigned> StackIdIndices;
for (auto StackId : InstCallsite)
StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId));
// Use the original CalledValue, in case it was an alias. We want
// to record the call edge to the alias in that case. Eventually
// an alias summary will be created to associate the alias and
// aliasee.
auto CalleeValueInfo =
Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
Callsites.push_back({CalleeValueInfo, StackIdIndices});
if (CalledFunction) {
// Use the original CalledValue, in case it was an alias. We want
// to record the call edge to the alias in that case. Eventually
// an alias summary will be created to associate the alias and
// aliasee.
auto CalleeValueInfo =
Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
Callsites.push_back({CalleeValueInfo, StackIdIndices});
} else if (EnableMemProfIndirectCallSupport) {
// For indirect callsites, create multiple Callsites, one per target.
// This enables having a different set of clone versions per target,
// and we will apply the cloning decisions while speculatively
// devirtualizing in the ThinLTO backends.
for (const auto &Candidate : CandidateProfileData) {
auto CalleeValueInfo = Index.getOrInsertValueInfo(Candidate.Value);
Callsites.push_back({CalleeValueInfo, StackIdIndices});
}
}
}
}
}
Expand Down Expand Up @@ -1214,9 +1223,13 @@ bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
if (CI && CalledFunction->isIntrinsic())
return false;
} else {
// TODO: For now skip indirect calls. See comments in
// computeFunctionSummary for what is needed to handle this.
return false;
// Skip inline assembly calls.
if (CI && CI->isInlineAsm())
return false;
// Skip direct calls via Constant.
if (!CalledValue || isa<Constant>(CalledValue))
return false;
return true;
}
return true;
}
4 changes: 2 additions & 2 deletions llvm/lib/IR/AsmWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3609,7 +3609,7 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) {

void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
Out << "^" << Slot << " = gv: (";
if (!VI.name().empty())
if (VI.hasName() && !VI.name().empty())
Out << "name: \"" << VI.name() << "\"";
else
Out << "guid: " << VI.getGUID();
Expand All @@ -3623,7 +3623,7 @@ void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
Out << ")";
}
Out << ")";
if (!VI.name().empty())
if (VI.hasName() && !VI.name().empty())
Out << " ; guid = " << VI.getGUID();
Out << "\n";
}
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Passes/PassBuilderPipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1710,7 +1710,8 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
// For ThinLTO we must apply the context disambiguation decisions early, to
// ensure we can correctly match the callsites to summary data.
if (EnableMemProfContextDisambiguation)
MPM.addPass(MemProfContextDisambiguation(ImportSummary));
MPM.addPass(MemProfContextDisambiguation(
ImportSummary, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

// These passes import type identifier resolutions for whole-program
// devirtualization and CFI. They must run early because other passes may
Expand Down Expand Up @@ -1923,7 +1924,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// amount of additional cloning required to distinguish the allocation
// contexts.
if (EnableMemProfContextDisambiguation)
MPM.addPass(MemProfContextDisambiguation());
MPM.addPass(MemProfContextDisambiguation(
/*Summary=*/nullptr,
PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

// Optimize globals again after we ran the inliner.
MPM.addPass(GlobalOptPass());
Expand Down
Loading

0 comments on commit 1de7165

Please sign in to comment.