From cd21a646f656c18bc71bb2ef789ff34ce8f2557c Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Sun, 17 Jul 2016 14:47:01 +0000 Subject: [PATCH] [ThinLTO] Perform profile-guided indirect call promotion Summary: To enable profile-guided indirect call promotion in ThinLTO mode, we simply add call graph edges for each profitable target from the profile to the summaries, then the summary-guided importing will consider the callee for importing as usual. Also we need to enable the indirect call promotion pass creation in the PassManagerBuilder when PerformThinLTO=true (we are in the ThinLTO backend), so that the newly imported functions are considered for promotion in the backends. The IC promotion profiles refer to callees by GUID, which required adding GUIDs to the per-module VST in bitcode (and assigning them valueIds similar to how they are assigned valueIds in the combined index). Reviewers: mehdi_amini, xur Subscribers: mehdi_amini, davidxl, llvm-commits Differential Revision: http://reviews.llvm.org/D21932 llvm-svn: 275707 --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 9 +++ llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 34 ++++++++--- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 69 ++++++++++++++++++++-- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 5 +- .../Inputs/thinlto_indirect_call_promotion.ll | 7 +++ .../PGOProfile/thinlto_indirect_call_promotion.ll | 32 ++++++++++ 6 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.ll create mode 100644 llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 6b866dbd..45d9bf7 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -80,6 +80,7 @@ struct ValueInfo { assert(Kind == VI_Value && "Not a Value type"); return TheValue.V; } + bool isGUID() const { return Kind == VI_GUID; } }; /// \brief Function and variable summary information to aid decisions and @@ -261,6 +262,14 @@ public: CallGraphEdgeList.push_back(std::make_pair(CalleeGUID, Info)); } + /// Record a call graph edge from this function to each function GUID recorded + /// in \p CallGraphEdges. + void + addCallGraphEdges(DenseMap &CallGraphEdges) { + for (auto &EI : CallGraphEdges) + addCallGraphEdge(EI.first, EI.second); + } + /// Record a call graph edge from this function to the function identified /// by \p CalleeV, with \p CalleeInfo including the cumulative profile /// count (across all calls from this function) or 0 if no PGO. diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index ff41581..c9ac2bd 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/IndirectCallPromotionAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" @@ -73,7 +74,9 @@ void ModuleSummaryIndexBuilder::computeFunctionSummary( // Map from callee ValueId to profile count. Used to accumulate profile // counts for all static calls to a given callee. DenseMap CallGraphEdges; + DenseMap IndirectCallEdges; DenseSet RefEdges; + ICallPromotionAnalysis ICallAnalysis; SmallPtrSet Visited; for (const BasicBlock &BB : F) @@ -83,13 +86,29 @@ void ModuleSummaryIndexBuilder::computeFunctionSummary( if (auto CS = ImmutableCallSite(&I)) { auto *CalledFunction = CS.getCalledFunction(); - if (CalledFunction && CalledFunction->hasName() && - !CalledFunction->isIntrinsic()) { - auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None; - auto *CalleeId = - M->getValueSymbolTable().lookup(CalledFunction->getName()); - CallGraphEdges[CalleeId] += - (ScaledCount ? ScaledCount.getValue() : 0); + // Check if this is a direct call to a known function. + if (CalledFunction) { + if (CalledFunction->hasName() && !CalledFunction->isIntrinsic()) { + auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None; + auto *CalleeId = + M->getValueSymbolTable().lookup(CalledFunction->getName()); + CallGraphEdges[CalleeId] += + (ScaledCount ? ScaledCount.getValue() : 0); + } + } else { + // Otherwise, check for an indirect call (call to a non-const value + // that isn't an inline assembly call). + const CallInst *CI = dyn_cast(&I); + if (CS.getCalledValue() && !isa(CS.getCalledValue()) && + !(CI && CI->isInlineAsm())) { + uint32_t NumVals, NumCandidates; + uint64_t TotalCount; + auto CandidateProfileData = + ICallAnalysis.getPromotionCandidatesForInstruction( + &I, NumVals, TotalCount, NumCandidates); + for (auto &Candidate : CandidateProfileData) + IndirectCallEdges[Candidate.Value] += Candidate.Count; + } } } findRefEdges(&I, RefEdges, Visited); @@ -99,6 +118,7 @@ void ModuleSummaryIndexBuilder::computeFunctionSummary( std::unique_ptr FuncSummary = llvm::make_unique(Flags, NumInsts); FuncSummary->addCallGraphEdges(CallGraphEdges); + FuncSummary->addCallGraphEdges(IndirectCallEdges); FuncSummary->addRefEdges(RefEdges); Index->addGlobalValueSummary(F.getName(), std::move(FuncSummary)); } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 1ee7b7b..dcb8b58 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -119,6 +119,14 @@ class ModuleBitcodeWriter : public BitcodeWriter { /// The start bit of the module block, for use in generating a module hash uint64_t BitcodeStartBit = 0; + /// Map that holds the correspondence between GUIDs in the summary index, + /// that came from indirect call profiles, and a value id generated by this + /// class to use in the VST and summary block records. + std::map GUIDToValueIdMap; + + /// Tracks the last value id recorded in the GUIDToValueMap. + unsigned GlobalValueId; + public: /// Constructs a ModuleBitcodeWriter object for the given Module, /// writing to the provided \p Buffer. @@ -132,6 +140,25 @@ public: // will start at the bitcode, and we need the offset of the VST // to line up. BitcodeStartBit = Stream.GetCurrentBitNo(); + + // Assign ValueIds to any callee values in the index that came from + // indirect call profiles and were recorded as a GUID not a Value* + // (which would have been assigned an ID by the ValueEnumerator). + // The starting ValueId is just after the number of values in the + // ValueEnumerator, so that they can be emitted in the VST. + GlobalValueId = VE.getValues().size(); + if (Index) + for (const auto &GUIDSummaryLists : *Index) + // Examine all summaries for this GUID. + for (auto &Summary : GUIDSummaryLists.second) + if (auto FS = dyn_cast(Summary.get())) + // For each call in the function summary, see if the call + // is to a GUID (which means it is for an indirect call, + // otherwise we would have a Value for it). If so, synthesize + // a value id. + for (auto &CallEdge : FS->calls()) + if (CallEdge.first.isGUID()) + assignValueId(CallEdge.first.getGUID()); } private: @@ -260,6 +287,22 @@ private: unsigned FSModRefsAbbrev); void writePerModuleGlobalValueSummary(); void writeModuleHash(size_t BlockStartPos); + + void assignValueId(GlobalValue::GUID ValGUID) { + GUIDToValueIdMap[ValGUID] = ++GlobalValueId; + } + unsigned getValueId(GlobalValue::GUID ValGUID) { + const auto &VMI = GUIDToValueIdMap.find(ValGUID); + assert(VMI != GUIDToValueIdMap.end()); + return VMI->second; + } + // Helper to get the valueId for the type of value recorded in VI. + unsigned getValueId(ValueInfo VI) { + if (VI.isGUID()) + return getValueId(VI.getGUID()); + return VE.getValueID(VI.getValue()); + } + std::map &valueIds() { return GUIDToValueIdMap; } }; /// Class to manage the bitcode writing for a combined index. @@ -2707,6 +2750,7 @@ void ModuleBitcodeWriter::writeValueSymbolTable( unsigned FnEntry8BitAbbrev; unsigned FnEntry7BitAbbrev; unsigned FnEntry6BitAbbrev; + unsigned GUIDEntryAbbrev; if (IsModuleLevel && hasVSTOffsetPlaceholder()) { // 8-bit fixed-width VST_CODE_FNENTRY function strings. BitCodeAbbrev *Abbv = new BitCodeAbbrev(); @@ -2734,11 +2778,19 @@ void ModuleBitcodeWriter::writeValueSymbolTable( Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6)); FnEntry6BitAbbrev = Stream.EmitAbbrev(Abbv); + + // FIXME: Change the name of this record as it is now used by + // the per-module index as well. + Abbv = new BitCodeAbbrev(); + Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid + GUIDEntryAbbrev = Stream.EmitAbbrev(Abbv); } // FIXME: Set up the abbrev, we know how many values there are! // FIXME: We know if the type names can use 7-bit ascii. - SmallVector NameVals; + SmallVector NameVals; for (const ValueName &Name : VST) { // Figure out the encoding to use for the name. @@ -2799,6 +2851,16 @@ void ModuleBitcodeWriter::writeValueSymbolTable( Stream.EmitRecord(Code, NameVals, AbbrevToUse); NameVals.clear(); } + // Emit any GUID valueIDs created for indirect call edges into the + // module-level VST. + if (IsModuleLevel && hasVSTOffsetPlaceholder()) + for (const auto &GI : valueIds()) { + NameVals.push_back(GI.second); + NameVals.push_back(GI.first); + Stream.EmitRecord(bitc::VST_CODE_COMBINED_ENTRY, NameVals, + GUIDEntryAbbrev); + NameVals.clear(); + } Stream.ExitBlock(); } @@ -3220,12 +3282,11 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord( std::sort(Calls.begin(), Calls.end(), [this](const FunctionSummary::EdgeTy &L, const FunctionSummary::EdgeTy &R) { - return VE.getValueID(L.first.getValue()) < - VE.getValueID(R.first.getValue()); + return getValueId(L.first) < getValueId(R.first); }); bool HasProfileData = F.getEntryCount().hasValue(); for (auto &ECI : Calls) { - NameVals.push_back(VE.getValueID(ECI.first.getValue())); + NameVals.push_back(getValueId(ECI.first)); assert(ECI.second.CallsiteCount > 0 && "Expected at least one callsite"); NameVals.push_back(ECI.second.CallsiteCount); if (HasProfileData) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 537919d..cf5b76d 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -407,10 +407,11 @@ void PassManagerBuilder::populateModulePassManager( /// PGO instrumentation is added during the compile phase for ThinLTO, do /// not run it a second time addPGOInstrPasses(MPM); - // Indirect call promotion that promotes intra-module targets only. - MPM.add(createPGOIndirectCallPromotionLegacyPass()); } + // Indirect call promotion that promotes intra-module targets only. + MPM.add(createPGOIndirectCallPromotionLegacyPass()); + if (EnableNonLTOGlobalsModRef) // We add a module alias analysis pass here. In part due to bugs in the // analysis infrastructure this "works" in that the analysis stays alive diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.ll b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.ll new file mode 100644 index 0000000..c77eb51 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.ll @@ -0,0 +1,7 @@ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @a() { +entry: + ret void +} diff --git a/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll b/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll new file mode 100644 index 0000000..49dc7fa --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/thinlto_indirect_call_promotion.ll @@ -0,0 +1,32 @@ +; Do setup work for all below tests: generate bitcode and combined index +; RUN: opt -module-summary %s -o %t.bc +; RUN: opt -module-summary %p/Inputs/thinlto_indirect_call_promotion.ll -o %t2.bc +; RUN: llvm-lto -thinlto -o %t3 %t.bc %t2.bc + +; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -o %t4.bc -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS +; IMPORTS: Import a + +; RUN: opt %t4.bc -pgo-icall-prom -S -icp-count-threshold=1 | FileCheck %s --check-prefix=ICALL-PROM +; RUN: opt %t4.bc -pgo-icall-prom -S -pass-remarks=pgo-icall-prom -icp-count-threshold=1 2>&1 | FileCheck %s --check-prefix=PASS-REMARK +; PASS-REMARK: Promote indirect call to a with count 1 out of 1 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@foo = external local_unnamed_addr global void ()*, align 8 + +define i32 @main() local_unnamed_addr { +entry: + %0 = load void ()*, void ()** @foo, align 8 +; ICALL-PROM: br i1 %{{[0-9]+}}, label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]] + tail call void %0(), !prof !1 + ret i32 0 +} + +!1 = !{!"VP", i32 0, i64 1, i64 -6289574019528802036, i64 1} + +; Should not have a VP annotation on new indirect call (check before and after +; branch_weights annotation). +; ICALL-PROM-NOT: !"VP" +; ICALL-PROM: [[BRANCH_WEIGHT]] = !{!"branch_weights", i32 1, i32 0} +; ICALL-PROM-NOT: !"VP" -- 2.7.4