From 5fb65c02ca5e91e7e1a00e0efdb8edc899f3e4b9 Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Fri, 12 Feb 2021 19:46:28 -0800 Subject: [PATCH] [SampleFDO] Stop repeated indirect call promotion for the same target. Found a problem in indirect call promotion in sample loader pass. Currently if an indirect call is promoted for a target, and if the parent function is inlined into some other function, the indirect call can be promoted for the same target again. That is redundent which can harm performance and can cause excessive compile time in some extreme case. The patch fixes the issue. If a target is promoted for an indirect call, the patch will write ICP metadata with the target call count being set to 0. In the later ICP in sample profile loader, if it sees a target has 0 count for an indirect call, it knows the target has been promoted and won't do indirect call promotion for the indirect call. The fix brings 0.1~0.2% performance on our search benchmark. Differential Revision: https://reviews.llvm.org/D96806 --- llvm/include/llvm/ProfileData/InstrProf.h | 3 +- .../lib/Analysis/IndirectCallPromotionAnalysis.cpp | 2 +- llvm/lib/ProfileData/InstrProf.cpp | 7 +- llvm/lib/Transforms/IPO/SampleProfile.cpp | 128 ++++++++++++++++----- .../SampleProfile/Inputs/norepeated-icp.prof | 13 +++ .../test/Transforms/SampleProfile/indirect-call.ll | 2 +- .../Transforms/SampleProfile/norepeated-icp.ll | 74 ++++++++++++ 7 files changed, 194 insertions(+), 35 deletions(-) create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof create mode 100644 llvm/test/Transforms/SampleProfile/norepeated-icp.ll diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 9c16c35..b48438d 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -260,7 +260,8 @@ bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], - uint32_t &ActualNumValueData, uint64_t &TotalC); + uint32_t &ActualNumValueData, uint64_t &TotalC, + bool GetZeroCntValue = false); inline StringRef getPGOFuncNameMetadataName() { return "PGOFuncName"; } diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp index b112ed2..31247fc 100644 --- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -45,7 +45,7 @@ static cl::opt // Set the maximum number of targets to promote for a single indirect-call // callsite. -static cl::opt +cl::opt MaxNumPromotions("icp-max-prom", cl::init(3), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of promotions for a single indirect " "call callsite")); diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 4a0cee6..52adfee 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -982,7 +982,8 @@ bool getValueProfDataFromInst(const Instruction &Inst, InstrProfValueKind ValueKind, uint32_t MaxNumValueData, InstrProfValueData ValueData[], - uint32_t &ActualNumValueData, uint64_t &TotalC) { + uint32_t &ActualNumValueData, uint64_t &TotalC, + bool GetZeroCntValue) { MDNode *MD = Inst.getMetadata(LLVMContext::MD_prof); if (!MD) return false; @@ -1009,7 +1010,7 @@ bool getValueProfDataFromInst(const Instruction &Inst, // Get total count ConstantInt *TotalCInt = mdconst::dyn_extract(MD->getOperand(2)); - if (!TotalCInt) + if (!TotalCInt && !GetZeroCntValue) return false; TotalC = TotalCInt->getZExtValue(); @@ -1021,7 +1022,7 @@ bool getValueProfDataFromInst(const Instruction &Inst, ConstantInt *Value = mdconst::dyn_extract(MD->getOperand(I)); ConstantInt *Count = mdconst::dyn_extract(MD->getOperand(I + 1)); - if (!Value || !Count) + if (!Value || (!Count && !GetZeroCntValue)) return false; ValueData[ActualNumValueData].Value = Value->getZExtValue(); ValueData[ActualNumValueData].Count = Count->getZExtValue(); diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index fff6a7c..5706fb6 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -218,6 +218,8 @@ static cl::opt ProfileInlineReplayFile( "by inlining from sample profile loader."), cl::Hidden); +extern cl::opt MaxNumPromotions; + namespace { using BlockWeightMap = DenseMap; @@ -361,8 +363,7 @@ protected: // Attempt to promote indirect call and also inline the promoted call bool tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, - uint64_t &Sum, DenseSet &PromotedInsns, - SmallVector *InlinedCallSites = nullptr); + uint64_t &Sum, SmallVector *InlinedCallSites = nullptr); bool inlineHotFunctions(Function &F, DenseSet &InlinedGUIDs); InlineCost shouldInlineCandidate(InlineCandidate &Candidate); @@ -688,18 +689,97 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { return it.first->second; } +/// If the profile count for the promotion candidate \p Candidate is 0, +/// it means \p Candidate has already been promoted for \p Inst. +static bool isPromotedBefore(const Instruction &Inst, StringRef Candidate) { + uint32_t NumVals = 0; + uint64_t TotalCount = 0; + std::unique_ptr ValueData = + std::make_unique(MaxNumPromotions); + bool Valid = + getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions, + ValueData.get(), NumVals, TotalCount, true); + if (Valid) { + for (uint32_t I = 0; I < NumVals; I++) { + // If the promotion candidate has 0 count in the metadata, it + // means the candidate has been promoted for this indirect call. + if (ValueData[I].Value == Function::getGUID(Candidate)) + return ValueData[I].Count == 0; + } + } + return false; +} + +/// Update indirect call target profile metadata for \p Inst. If \p Total +/// is given, set TotalCount of call targets counts to \p Total, otherwise +/// keep the original value in metadata. +static void +updateIDTMetaData(Instruction &Inst, + const SmallVectorImpl &CallTargets, + uint64_t Total = 0) { + DenseMap ValueCountMap; + + uint32_t NumVals = 0; + uint64_t TotalCount = 0; + std::unique_ptr ValueData = + std::make_unique(MaxNumPromotions); + bool Valid = + getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions, + ValueData.get(), NumVals, TotalCount, true); + if (Valid) { + for (uint32_t I = 0; I < NumVals; I++) + ValueCountMap[ValueData[I].Value] = ValueData[I].Count; + } + + for (const auto &Data : CallTargets) { + auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count); + if (Pair.second) + continue; + // Update existing profile count of the call target if it is not 0. + // If it is 0, the call target has been promoted so keep it as 0. + if (Pair.first->second != 0) + Pair.first->second = Data.Count; + else { + assert(Total >= Data.Count && "Total should be >= Data.Count"); + Total -= Data.Count; + } + } + + SmallVector NewCallTargets; + for (const auto &ValueCount : ValueCountMap) { + NewCallTargets.emplace_back( + InstrProfValueData{ValueCount.first, ValueCount.second}); + } + llvm::sort(NewCallTargets, + [](const InstrProfValueData &L, const InstrProfValueData &R) { + if (L.Count != R.Count) + return L.Count > R.Count; + return L.Value > R.Value; + }); + annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst, + NewCallTargets, Total ? Total : TotalCount, + IPVK_IndirectCallTarget, NewCallTargets.size()); +} + /// Attempt to promote indirect call and also inline the promoted call. /// /// \param F Caller function. /// \param Candidate ICP and inline candidate. /// \param Sum Sum of target counts for indirect call. -/// \param PromotedInsns Map to keep track of indirect call already processed. /// \param InlinedCallSite Output vector for new call sites exposed after /// inlining. bool SampleProfileLoader::tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, - DenseSet &PromotedInsns, SmallVector *InlinedCallSite) { + auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName(); + auto R = SymbolMap.find(CalleeFunctionName); + if (R == SymbolMap.end() || !R->getValue()) + return false; + + auto &CI = *Candidate.CallInstr; + if (isPromotedBefore(CI, R->getValue()->getName())) + return false; + const char *Reason = "Callee function not available"; // R->getValue() != &F is to prevent promoting a recursive call. // If it is a recursive call, we do not inline it as it could bloat @@ -707,15 +787,17 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate( // clone the caller first, and inline the cloned caller if it is // recursive. As llvm does not inline recursive calls, we will // simply ignore it instead of handling it explicitly. - auto R = SymbolMap.find(Candidate.CalleeSamples->getFuncName()); - if (R != SymbolMap.end() && R->getValue() && - !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && + if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && R->getValue()->hasFnAttribute("use-sample-profile") && - R->getValue() != &F && - isLegalToPromote(*Candidate.CallInstr, R->getValue(), &Reason)) { - auto *DI = - &pgo::promoteIndirectCall(*Candidate.CallInstr, R->getValue(), - Candidate.CallsiteCount, Sum, false, ORE); + R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) { + // For promoted target, save 0 count in the value profile metadata so + // the target won't be promoted again. + SmallVector SortedCallTargets = { + InstrProfValueData{Function::getGUID(R->getValue()->getName()), 0}}; + updateIDTMetaData(CI, SortedCallTargets); + + auto *DI = &pgo::promoteIndirectCall( + CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE); if (DI) { Sum -= Candidate.CallsiteCount; // Prorate the indirect callsite distribution. @@ -724,10 +806,8 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate( // profile will be used to prorate callsites from the callee if // inlined. Once not inlined, the direct callsite distribution should // be prorated so that the it will reflect the real callsite counts. - setProbeDistributionFactor(*Candidate.CallInstr, - Candidate.CallsiteDistribution * Sum / - SumOrigin); - PromotedInsns.insert(Candidate.CallInstr); + setProbeDistributionFactor(CI, Candidate.CallsiteDistribution * Sum / + SumOrigin); Candidate.CallInstr = DI; if (isa(DI) || isa(DI)) { bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite); @@ -800,8 +880,6 @@ void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( /// \returns True if there is any inline happened. bool SampleProfileLoader::inlineHotFunctions( Function &F, DenseSet &InlinedGUIDs) { - DenseSet PromotedInsns; - // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. assert((!ProfAccForSymsInList || @@ -855,8 +933,6 @@ bool SampleProfileLoader::inlineHotFunctions( if (CalledFunction == &F) continue; if (I->isIndirectCall()) { - if (PromotedInsns.count(I)) - continue; uint64_t Sum; for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { uint64_t SumOrigin = Sum; @@ -869,8 +945,7 @@ bool SampleProfileLoader::inlineHotFunctions( continue; Candidate = {I, FS, FS->getEntrySamples(), 1.0}; - if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, - PromotedInsns)) { + if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) { LocalNotInlinedCallSites.erase(I); LocalChanged = true; } @@ -1079,7 +1154,6 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet &InlinedGUIDs) { - DenseSet PromotedInsns; assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure @@ -1128,8 +1202,6 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( if (CalledFunction == &F) continue; if (I->isIndirectCall()) { - if (PromotedInsns.count(I)) - continue; uint64_t Sum; auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum); uint64_t SumOrigin = Sum; @@ -1164,7 +1236,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( Candidate = {I, FS, EntryCountDistributed, Candidate.CallsiteDistribution}; if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum, - PromotedInsns, &InlinedCallSites)) { + &InlinedCallSites)) { for (auto *CB : InlinedCallSites) { if (getInlineCandidate(&NewCandidate, CB)) CQueue.emplace(NewCandidate); @@ -1261,9 +1333,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { Sum += NameFS.second.getEntrySamples(); } } - annotateValueSite(*I.getParent()->getParent()->getParent(), I, - SortedCallTargets, Sum, IPVK_IndirectCallTarget, - SortedCallTargets.size()); + updateIDTMetaData(I, SortedCallTargets, Sum); } else if (!isa(&I)) { I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights( diff --git a/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof new file mode 100644 index 0000000..21d4078 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp.prof @@ -0,0 +1,13 @@ +_Z3foov:225715:1 + 2: 5553 + 3: 5391 + 1: _Z3goov:5860 + 1: 5279 + 2: 5279 + 1: _Z3hoov:5860 + 1: 5000 +_Z3goov:5860:1 + 1: 5279 + 2: 5279 + 1: _Z3hoov:5860 + 1: 5000 diff --git a/llvm/test/Transforms/SampleProfile/indirect-call.ll b/llvm/test/Transforms/SampleProfile/indirect-call.ll index cc7fc7a..02c8913 100644 --- a/llvm/test/Transforms/SampleProfile/indirect-call.ll +++ b/llvm/test/Transforms/SampleProfile/indirect-call.ll @@ -202,7 +202,7 @@ attributes #0 = {"use-sample-profile"} ; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398} ; CHECK: ![[BR1]] = !{!"branch_weights", i32 4000, i32 4000} ; CHECK: ![[BR2]] = !{!"branch_weights", i32 3000, i32 1000} -; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000} +; CHECK: ![[VP]] = !{!"VP", i32 0, i64 8000, i64 -6391416044382067764, i64 1000, i64 7476224446746900038, i64 0, i64 925324185419832389, i64 0} ; CHECK: ![[BR3]] = !{!"branch_weights", i32 1, i32 0} !6 = distinct !DISubprogram(name: "test_inline", scope: !1, file: !1, line: 6, unit: !0) !7 = !DILocation(line: 7, scope: !6) diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll new file mode 100644 index 0000000..ce69981 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/norepeated-icp.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/norepeated-icp.prof -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@.str = private unnamed_addr constant [5 x i8] c"hoo\0A\00", align 1 +@p = dso_local global void ()* null, align 8 +@str = private unnamed_addr constant [4 x i8] c"hoo\00", align 1 + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3hoov() #0 !dbg !7 { +entry: + %puts = call i32 @puts(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @str, i64 0, i64 0)), !dbg !9 + ret void, !dbg !10 +} + +; Function Attrs: nofree nounwind +declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) #1 + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3goov() #0 !dbg !11 { +entry: + %0 = load void ()*, void ()** @p, align 8, !dbg !12, !tbaa !13 + call void %0(), !dbg !17 + ret void, !dbg !18 +} + +; Check the indirect call in _Z3goov inlined into _Z3foov won't be indirect +; call promoted for _Z3hoov twice in _Z3foov. +; CHECK-LABEL: @_Z3foov( +; CHECK: icmp eq void ()* {{.*}} @_Z3hoov +; CHECK-NOT: icmp eq void ()* {{.*}} @_Z3hoov +; CHECK: ret void + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3foov() #0 !dbg !19 { +entry: + call void @_Z3goov(), !dbg !20 + ret void, !dbg !21 +} + +; Function Attrs: nofree nounwind +declare noundef i32 @puts(i8* nocapture noundef readonly) #2 + +attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" } +attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nofree nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "1.cc", directory: "") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!""} +!7 = distinct !DISubprogram(name: "hoo", linkageName: "_Z3hoov", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 2, column: 3, scope: !7) +!10 = !DILocation(line: 3, column: 1, scope: !7) +!11 = distinct !DISubprogram(name: "goo", linkageName: "_Z3goov", scope: !1, file: !1, line: 6, type: !8, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!12 = !DILocation(line: 7, column: 5, scope: !11) +!13 = !{!14, !14, i64 0} +!14 = !{!"any pointer", !15, i64 0} +!15 = !{!"omnipotent char", !16, i64 0} +!16 = !{!"Simple C++ TBAA"} +!17 = !DILocation(line: 7, column: 3, scope: !11) +!18 = !DILocation(line: 8, column: 1, scope: !11) +!19 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 10, type: !8, scopeLine: 10, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!20 = !DILocation(line: 11, column: 3, scope: !19) +!21 = !DILocation(line: 12, column: 3, scope: !19) -- 2.7.4