#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
cl::desc("Emit a warning if less than N% of samples in the input profile "
"are matched to the IR."));
-static cl::opt<double> SampleProfileHotThreshold(
- "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"),
- cl::desc("Inlined functions that account for more than N% of all samples "
- "collected in the parent function, will be inlined again."));
-
namespace {
using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
uint32_t Discriminator, uint64_t Samples);
unsigned computeCoverage(unsigned Used, unsigned Total) const;
- unsigned countUsedRecords(const FunctionSamples *FS) const;
- unsigned countBodyRecords(const FunctionSamples *FS) const;
+ unsigned countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ unsigned countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
- uint64_t countBodySamples(const FunctionSamples *FS) const;
+ uint64_t countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
void clear() {
SampleCoverage.clear();
IsThinLTOPreLink(IsThinLTOPreLink) {}
bool doInitialization(Module &M);
- bool runOnModule(Module &M, ModuleAnalysisManager *AM);
+ bool runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI);
void dump() { Reader->dump(); }
/// Instead, we will mark GUIDs that needs to be annotated to the function.
bool IsThinLTOPreLink;
+ /// Profile Summary Info computed from sample profile.
+ ProfileSummaryInfo *PSI = nullptr;
+
/// Total number of samples collected in this profile.
///
/// This is the sum of all the samples collected in all the functions executed
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
private:
} // end anonymous namespace
-/// Return true if the given callsite is hot wrt to its caller.
+/// Return true if the given callsite is hot wrt to hot cutoff threshold.
///
/// Functions that were inlined in the original binary will be represented
/// in the inline stack in the sample profile. If the profile shows that
/// frequently), then we will recreate the inline decision and apply the
/// profile from the inlined callsite.
///
-/// To decide whether an inlined callsite is hot, we compute the fraction
-/// of samples used by the callsite with respect to the total number of samples
-/// collected in the caller.
-///
-/// If that fraction is larger than the default given by
-/// SampleProfileHotThreshold, the callsite will be inlined again.
-static bool callsiteIsHot(const FunctionSamples *CallerFS,
- const FunctionSamples *CallsiteFS) {
+/// To decide whether an inlined callsite is hot, we compare the callsite
+/// sample count with the hot cutoff computed by ProfileSummaryInfo, it is
+/// regarded as hot if the count is above the cutoff value.
+static bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI) {
if (!CallsiteFS)
return false; // The callsite was not inlined in the original binary.
- uint64_t ParentTotalSamples = CallerFS->getTotalSamples();
- if (ParentTotalSamples == 0)
- return false; // Avoid division by zero.
-
+ assert(PSI && "PSI is expected to be non null");
uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
- if (CallsiteTotalSamples == 0)
- return false; // Callsite is trivially cold.
-
- double PercentSamples =
- (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0;
- return PercentSamples >= SampleProfileHotThreshold;
+ return PSI->isHotCount(CallsiteTotalSamples);
}
/// Mark as used the sample record for the given function samples at
///
/// This count does not include records from cold inlined callsites.
unsigned
-SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
+SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
auto I = SampleCoverage.find(FS);
// The size of the coverage map for FS represents the number of records
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countUsedRecords(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Count += countUsedRecords(CalleeSamples, PSI);
}
return Count;
///
/// This count does not include records from cold inlined callsites.
unsigned
-SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
+SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
unsigned Count = FS->getBodySamples().size();
// Only count records in hot callsites.
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countBodyRecords(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Count += countBodyRecords(CalleeSamples, PSI);
}
return Count;
///
/// This count does not include samples from cold inlined callsites.
uint64_t
-SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
+SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
uint64_t Total = 0;
for (const auto &I : FS->getBodySamples())
Total += I.second.getSamples();
for (const auto &I : FS->getCallsiteSamples())
for (const auto &J : I.second) {
const FunctionSamples *CalleeSamples = &J.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Total += countBodySamples(CalleeSamples);
+ if (callsiteIsHot(CalleeSamples, PSI))
+ Total += countBodySamples(CalleeSamples, PSI);
}
return Total;
if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
Candidates.push_back(&I);
- if (callsiteIsHot(Samples, FS))
+ if (callsiteIsHot(FS, PSI))
Hot = true;
}
}
for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
if (IsThinLTOPreLink) {
FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
- Samples->getTotalSamples() *
- SampleProfileHotThreshold / 100);
+ PSI->getOrCompHotCountThreshold());
continue;
}
auto CalleeFunctionName = FS->getName();
LocalChanged = true;
} else if (IsThinLTOPreLink) {
findCalleeFunctionSamples(*I)->findInlinedFunctions(
- InlinedGUIDs, F.getParent(),
- Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
}
}
if (LocalChanged) {
// If coverage checking was requested, compute it now.
if (SampleProfileRecordCoverage) {
- unsigned Used = CoverageTracker.countUsedRecords(Samples);
- unsigned Total = CoverageTracker.countBodyRecords(Samples);
+ unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileRecordCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
if (SampleProfileSampleCoverage) {
uint64_t Used = CoverageTracker.getTotalUsedSamples();
- uint64_t Total = CoverageTracker.countBodySamples(Samples);
+ uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
if (Coverage < SampleProfileSampleCoverage) {
F.getContext().diagnose(DiagnosticInfoSampleProfile(
"Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
return new SampleProfileLoaderLegacyPass(Name);
}
-bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) {
+bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI) {
if (!ProfileIsValid)
return false;
+ PSI = _PSI;
+ if (M.getProfileSummary() == nullptr)
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
+
// Compute the total number of samples collected in this profile.
for (const auto &I : Reader->getProfiles())
TotalCollectedSamples += I.second.getTotalSamples();
clearFunctionData();
retval |= runOnFunction(F, AM);
}
- if (M.getProfileSummary() == nullptr)
- M.setProfileSummary(Reader->getSummary().getMD(M.getContext()));
return retval;
}
bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
ACT = &getAnalysis<AssumptionCacheTracker>();
TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return SampleLoader.runOnModule(M, nullptr);
+ ProfileSummaryInfo *PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return SampleLoader.runOnModule(M, nullptr, PSI);
}
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
SampleLoader.doInitialization(M);
- if (!SampleLoader.runOnModule(M, &AM))
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+ if (!SampleLoader.runOnModule(M, &AM, PSI))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
--- /dev/null
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -S | FileCheck %s
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -S | FileCheck %s
+
+@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32 %x, i32 %y) !dbg !4 {
+entry:
+ %x.addr = alloca i32, align 4
+ %y.addr = alloca i32, align 4
+ store i32 %x, i32* %x.addr, align 4
+ store i32 %y, i32* %y.addr, align 4
+ %t0 = load i32, i32* %x.addr, align 4, !dbg !11
+ %t1 = load i32, i32* %y.addr, align 4, !dbg !11
+ %add = add nsw i32 %t0, %t1, !dbg !11
+ ret i32 %add, !dbg !11
+}
+
+define i32 @goo(i32 %x, i32 %y) {
+entry:
+ %x.addr = alloca i32, align 4
+ %y.addr = alloca i32, align 4
+ store i32 %x, i32* %x.addr, align 4
+ store i32 %y, i32* %y.addr, align 4
+ %t0 = load i32, i32* %x.addr, align 4, !dbg !11
+ %t1 = load i32, i32* %y.addr, align 4, !dbg !11
+ %add = add nsw i32 %t0, %t1, !dbg !11
+ ret i32 %add, !dbg !11
+}
+
+; Function Attrs: uwtable
+define i32 @main() !dbg !7 {
+entry:
+ %retval = alloca i32, align 4
+ %s = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %i, align 4, !dbg !12
+ br label %while.cond, !dbg !13
+
+while.cond: ; preds = %if.end, %entry
+ %t0 = load i32, i32* %i, align 4, !dbg !14
+ %inc = add nsw i32 %t0, 1, !dbg !14
+ store i32 %inc, i32* %i, align 4, !dbg !14
+ %cmp = icmp slt i32 %t0, 400000000, !dbg !14
+ br i1 %cmp, label %while.body, label %while.end, !dbg !14
+
+while.body: ; preds = %while.cond
+ %t1 = load i32, i32* %i, align 4, !dbg !16
+ %cmp1 = icmp ne i32 %t1, 100, !dbg !16
+ br i1 %cmp1, label %if.then, label %if.else, !dbg !16
+
+if.then: ; preds = %while.body
+ %t2 = load i32, i32* %i, align 4, !dbg !18
+ %t3 = load i32, i32* %s, align 4, !dbg !18
+; Although the ratio of total samples of @foo vs total samples of @main is
+; small, since the total samples count is larger than hot cutoff computed by
+; ProfileSummaryInfo, we will still regard the callsite of foo as hot and
+; early inlining will inline it.
+; CHECK-LABEL: @main(
+; CHECK-NOT: call i32 @foo(i32 %t2, i32 %t3)
+ %call1 = call i32 @foo(i32 %t2, i32 %t3), !dbg !18
+ store i32 %call1, i32* %s, align 4, !dbg !18
+ br label %if.end, !dbg !18
+
+if.else: ; preds = %while.body
+; call @goo 's basicblock doesn't get any sample, so no profile will be annotated.
+; CHECK: call i32 @goo(i32 2, i32 3), !dbg !{{[0-9]+}}
+; CHECK-NOT: !prof
+; CHECK-SAME: {{$}}
+ %call2 = call i32 @goo(i32 2, i32 3), !dbg !26
+ store i32 %call2, i32* %s, align 4, !dbg !20
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ br label %while.cond, !dbg !22
+
+while.end: ; preds = %while.cond
+ %t4 = load i32, i32* %s, align 4, !dbg !24
+ %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %t4), !dbg !24
+ ret i32 0, !dbg !25
+}
+
+declare i32 @printf(i8*, ...) #2
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "calls.cc", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{!"clang version 3.5 "}
+!11 = !DILocation(line: 4, scope: !4)
+!12 = !DILocation(line: 8, scope: !7)
+!13 = !DILocation(line: 9, scope: !7)
+!14 = !DILocation(line: 9, scope: !15)
+!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7)
+!16 = !DILocation(line: 10, scope: !17)
+!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7)
+!18 = !DILocation(line: 10, scope: !19)
+!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
+!20 = !DILocation(line: 10, scope: !21)
+!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17)
+!22 = !DILocation(line: 10, scope: !23)
+!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17)
+!24 = !DILocation(line: 11, scope: !7)
+!25 = !DILocation(line: 12, scope: !7)
+!26 = !DILocation(line: 11, scope: !19)