From 106ec64fbc7fb5ef28d0368fb1dca18e67e75adf Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi Date: Tue, 4 Feb 2020 15:19:33 -0800 Subject: [PATCH] [PGO] Add memcmp/bcmp size value profiling. Summary: This adds support for memcmp/bcmp to the existing memcpy/memset value profiling. Reviewers: davidxl Subscribers: hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D79751 --- .../Instrumentation/PGOInstrumentation.cpp | 52 ++++-- .../Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 190 ++++++++++++++++----- .../Instrumentation/ValueProfileCollector.cpp | 10 +- .../Instrumentation/ValueProfileCollector.h | 3 +- .../Instrumentation/ValueProfilePlugins.inc | 22 ++- .../Inputs/memop_size_annotation.proftext | 22 ++- .../Transforms/PGOProfile/memop_size_annotation.ll | 9 + llvm/test/Transforms/PGOProfile/memop_size_opt.ll | 130 ++++++++++++-- 8 files changed, 354 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 5fcb0b2..72eb5cd 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -377,6 +377,7 @@ private: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); } }; @@ -405,6 +406,7 @@ private: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); } }; @@ -437,6 +439,7 @@ INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen", "PGO instrumentation.", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen", "PGO instrumentation.", false, false) @@ -566,11 +569,11 @@ public: } FuncPGOInstrumentation( - Function &Func, + Function &Func, TargetLibraryInfo &TLI, std::unordered_multimap &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI), ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. SIVisitor.countSelects(Func); @@ -834,15 +837,16 @@ populateEHOperandBundle(VPCandidateInfo &Cand, // Visit all edge and instrument the edges not in MST, and do value profiling. // Critical edges will be split. static void instrumentOneFunc( - Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI, + Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI, + BlockFrequencyInfo *BFI, std::unordered_multimap &ComdatMembers, bool IsCS) { // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - FuncPGOInstrumentation FuncInfo(F, ComdatMembers, true, BPI, - BFI, IsCS); + FuncPGOInstrumentation FuncInfo(F, TLI, ComdatMembers, true, + BPI, BFI, IsCS); std::vector InstrumentBBs; FuncInfo.getInstrumentBBs(InstrumentBBs); unsigned NumCounters = @@ -997,12 +1001,12 @@ namespace { class PGOUseFunc { public: - PGOUseFunc(Function &Func, Module *Modu, + PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI, std::unordered_multimap &ComdatMembers, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin, ProfileSummaryInfo *PSI, bool IsCS) : F(Func), M(Modu), BFI(BFIin), PSI(PSI), - FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS), + FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS), FreqAttr(FFA_Normal), IsCS(IsCS) {} // Read counts for the instrumented BB from profile. @@ -1504,7 +1508,8 @@ static void collectComdatMembers( } static bool InstrumentAllFunctions( - Module &M, function_ref LookupBPI, + Module &M, function_ref LookupTLI, + function_ref LookupBPI, function_ref LookupBFI, bool IsCS) { // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. @@ -1516,9 +1521,10 @@ static bool InstrumentAllFunctions( for (auto &F : M) { if (F.isDeclaration()) continue; + auto &TLI = LookupTLI(F); auto *BPI = LookupBPI(F); auto *BFI = LookupBFI(F); - instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers, IsCS); + instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS); } return true; } @@ -1534,27 +1540,32 @@ bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) { if (skipModule(M)) return false; + auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; auto LookupBPI = [this](Function &F) { return &this->getAnalysis(F).getBPI(); }; auto LookupBFI = [this](Function &F) { return &this->getAnalysis(F).getBFI(); }; - return InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS); + return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS); } PreservedAnalyses PGOInstrumentationGen::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); + auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; auto LookupBPI = [&FAM](Function &F) { return &FAM.getResult(F); }; - auto LookupBFI = [&FAM](Function &F) { return &FAM.getResult(F); }; - if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS)) + if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1562,6 +1573,7 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M, static bool annotateAllFunctions( Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName, + function_ref LookupTLI, function_ref LookupBPI, function_ref LookupBFI, ProfileSummaryInfo *PSI, bool IsCS) { @@ -1609,12 +1621,13 @@ static bool annotateAllFunctions( for (auto &F : M) { if (F.isDeclaration()) continue; + auto &TLI = LookupTLI(F); auto *BPI = LookupBPI(F); auto *BFI = LookupBFI(F); // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS); + PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS); bool AllZeros = false; if (!Func.readCounters(PGOReader.get(), AllZeros)) continue; @@ -1695,10 +1708,12 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); + auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; auto LookupBPI = [&FAM](Function &F) { return &FAM.getResult(F); }; - auto LookupBFI = [&FAM](Function &F) { return &FAM.getResult(F); }; @@ -1706,7 +1721,7 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, auto *PSI = &AM.getResult(M); if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, - LookupBPI, LookupBFI, PSI, IsCS)) + LookupTLI, LookupBPI, LookupBFI, PSI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1716,6 +1731,9 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { if (skipModule(M)) return false; + auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; auto LookupBPI = [this](Function &F) { return &this->getAnalysis(F).getBPI(); }; @@ -1724,8 +1742,8 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { }; auto *PSI = &getAnalysis().getPSI(); - return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI, - IsCS); + return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI, + LookupBFI, PSI, IsCS); } static std::string getSimpleNodeName(const BasicBlock *Node) { diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 9767fda..bef0e02 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -95,6 +95,11 @@ extern cl::opt MemOPSizeRange; // This option sets the value that groups large memop sizes extern cl::opt MemOPSizeLarge; +static cl::opt + MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(false), + cl::Hidden, + cl::desc("Size-specialize memcmp and bcmp calls")); + namespace { class PGOMemOPSizeOptLegacyPass : public FunctionPass { public: @@ -113,6 +118,7 @@ private: AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); } }; } // end anonymous namespace @@ -122,6 +128,7 @@ INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", "Optimize memory intrinsic using its size value profile", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", "Optimize memory intrinsic using its size value profile", false, false) @@ -131,11 +138,90 @@ FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() { } namespace { + +static const char *getMIName(const MemIntrinsic *MI) { + switch (MI->getIntrinsicID()) { + case Intrinsic::memcpy: + return "memcpy"; + case Intrinsic::memmove: + return "memmove"; + case Intrinsic::memset: + return "memset"; + default: + return "unknown"; + } +} + +// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). +struct MemOp { + Instruction *I; + MemOp(MemIntrinsic *MI) : I(MI) {} + MemOp(CallInst *CI) : I(CI) {} + MemIntrinsic *asMI() { return dyn_cast(I); } + CallInst *asCI() { return cast(I); } + MemOp clone() { + if (auto MI = asMI()) + return MemOp(cast(MI->clone())); + return MemOp(cast(asCI()->clone())); + } + Value *getLength() { + if (auto MI = asMI()) + return MI->getLength(); + return asCI()->getArgOperand(2); + } + void setLength(Value *Length) { + if (auto MI = asMI()) + return MI->setLength(Length); + asCI()->setArgOperand(2, Length); + } + StringRef getFuncName() { + if (auto MI = asMI()) + return MI->getCalledFunction()->getName(); + return asCI()->getCalledFunction()->getName(); + } + bool isMemmove() { + if (auto MI = asMI()) + if (MI->getIntrinsicID() == Intrinsic::memmove) + return true; + return false; + } + bool isMemcmp(TargetLibraryInfo &TLI) { + LibFunc Func; + if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && + Func == LibFunc_memcmp) { + return true; + } + return false; + } + bool isBcmp(TargetLibraryInfo &TLI) { + LibFunc Func; + if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && + Func == LibFunc_bcmp) { + return true; + } + return false; + } + const char *getName(TargetLibraryInfo &TLI) { + if (auto MI = asMI()) + return getMIName(MI); + LibFunc Func; + if (TLI.getLibFunc(*asCI(), Func)) { + if (Func == LibFunc_memcmp) + return "memcmp"; + if (Func == LibFunc_bcmp) + return "bcmp"; + } + llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); + return nullptr; + } +}; + class MemOPSizeOpt : public InstVisitor { public: MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, - OptimizationRemarkEmitter &ORE, DominatorTree *DT) - : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { + OptimizationRemarkEmitter &ORE, DominatorTree *DT, + TargetLibraryInfo &TLI) + : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { ValueDataArray = std::make_unique(MemOPMaxVersion + 2); // Get the MemOPSize range information from option MemOPSizeRange, @@ -147,13 +233,12 @@ public: WorkList.clear(); visit(Func); - for (auto &MI : WorkList) { + for (auto &MO : WorkList) { ++NumOfPGOMemOPAnnotate; - if (perform(MI)) { + if (perform(MO)) { Changed = true; ++NumOfPGOMemOPOpt; - LLVM_DEBUG(dbgs() << "MemOP call: " - << MI->getCalledFunction()->getName() + LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() << "is Transformed.\n"); } } @@ -164,7 +249,16 @@ public: // Not perform on constant length calls. if (dyn_cast(Length)) return; - WorkList.push_back(&MI); + WorkList.push_back(MemOp(&MI)); + } + + void visitCallInst(CallInst &CI) { + LibFunc Func; + if (TLI.getLibFunc(CI, Func) && + (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && + !dyn_cast(CI.getArgOperand(2))) { + WorkList.push_back(MemOp(&CI)); + } } private: @@ -172,15 +266,16 @@ private: BlockFrequencyInfo &BFI; OptimizationRemarkEmitter &ORE; DominatorTree *DT; + TargetLibraryInfo &TLI; bool Changed; - std::vector WorkList; + std::vector WorkList; // Start of the previse range. int64_t PreciseRangeStart; // Last value of the previse range. int64_t PreciseRangeLast; // The space to read the profile annotation. std::unique_ptr ValueDataArray; - bool perform(MemIntrinsic *MI); + bool perform(MemOp MO); // This kind shows which group the value falls in. For PreciseValue, we have // the profile count for that value. LargeGroup groups the values that are in @@ -196,19 +291,6 @@ private: } }; -static const char *getMIName(const MemIntrinsic *MI) { - switch (MI->getIntrinsicID()) { - case Intrinsic::memcpy: - return "memcpy"; - case Intrinsic::memmove: - return "memmove"; - case Intrinsic::memset: - return "memset"; - default: - return "unknown"; - } -} - static bool isProfitable(uint64_t Count, uint64_t TotalCount) { assert(Count <= TotalCount); if (Count < MemOPCountThreshold) @@ -227,21 +309,23 @@ static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num, return ScaleCount / Denom; } -bool MemOPSizeOpt::perform(MemIntrinsic *MI) { - assert(MI); - if (MI->getIntrinsicID() == Intrinsic::memmove) +bool MemOPSizeOpt::perform(MemOp MO) { + assert(MO.I); + if (MO.isMemmove()) + return false; + if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) return false; uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; uint64_t TotalCount; - if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, + if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions, ValueDataArray.get(), NumVals, TotalCount)) return false; uint64_t ActualCount = TotalCount; uint64_t SavedTotalCount = TotalCount; if (MemOPScaleCount) { - auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); + auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); if (!BBEdgeCount) return false; ActualCount = *BBEdgeCount; @@ -333,13 +417,13 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { // } // merge_bb: - BasicBlock *BB = MI->getParent(); + BasicBlock *BB = MO.I->getParent(); LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); LLVM_DEBUG(dbgs() << *BB << "\n"); auto OrigBBFreq = BFI.getBlockFreq(BB); - BasicBlock *DefaultBB = SplitBlock(BB, MI, DT); - BasicBlock::iterator It(*MI); + BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); + BasicBlock::iterator It(*MO.I); ++It; assert(It != DefaultBB->end()); BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); @@ -351,15 +435,24 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { auto &Ctx = Func.getContext(); IRBuilder<> IRB(BB); BB->getTerminator()->eraseFromParent(); - Value *SizeVar = MI->getLength(); + Value *SizeVar = MO.getLength(); SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); + Type *MemOpTy = MO.I->getType(); + PHINode *PHI = nullptr; + if (!MemOpTy->isVoidTy()) { + // Insert a phi for the return values at the merge block. + IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); + PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); + MO.I->replaceAllUsesWith(PHI); + PHI->addIncoming(MO.I, DefaultBB); + } // Clear the value profile data. - MI->setMetadata(LLVMContext::MD_prof, nullptr); + MO.I->setMetadata(LLVMContext::MD_prof, nullptr); // If all promoted, we don't need the MD.prof metadata. if (SavedRemainCount > 0 || Version != NumVals) // Otherwise we need update with the un-promoted records back. - annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), + annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version), SavedRemainCount, IPVK_MemOPSize, NumVals); LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); @@ -371,17 +464,18 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { for (uint64_t SizeId : SizeIds) { BasicBlock *CaseBB = BasicBlock::Create( Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); - Instruction *NewInst = MI->clone(); + MemOp NewMO = MO.clone(); // Fix the argument. - auto *MemI = cast(NewInst); - auto *SizeType = dyn_cast(MemI->getLength()->getType()); + auto *SizeType = dyn_cast(NewMO.getLength()->getType()); assert(SizeType && "Expected integer type size argument."); ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); - MemI->setLength(CaseSizeId); - CaseBB->getInstList().push_back(NewInst); + NewMO.setLength(CaseSizeId); + CaseBB->getInstList().push_back(NewMO.I); IRBuilder<> IRBCase(CaseBB); IRBCase.CreateBr(MergeBB); SI->addCase(CaseSizeId, CaseBB); + if (!MemOpTy->isVoidTy()) + PHI->addIncoming(NewMO.I, CaseBB); if (DT) { Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); Updates.push_back({DominatorTree::Insert, BB, CaseBB}); @@ -399,11 +493,10 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { ORE.emit([&]() { using namespace ore; - return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI) - << "optimized " << NV("Intrinsic", StringRef(getMIName(MI))) - << " with count " << NV("Count", SumForOpt) << " out of " - << NV("Total", TotalCount) << " for " << NV("Versions", Version) - << " versions"; + return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) + << "optimized " << NV("Memop", MO.getName(TLI)) << " with count " + << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount) + << " for " << NV("Versions", Version) << " versions"; }); return true; @@ -412,13 +505,13 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, - DominatorTree *DT) { + DominatorTree *DT, TargetLibraryInfo &TLI) { if (DisableMemOPOPT) return false; if (F.hasFnAttribute(Attribute::OptimizeForSize)) return false; - MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT); + MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); MemOPSizeOpt.perform(); return MemOPSizeOpt.isChanged(); } @@ -429,7 +522,9 @@ bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) { auto &ORE = getAnalysis().getORE(); auto *DTWP = getAnalysisIfAvailable(); DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - return PGOMemOPSizeOptImpl(F, BFI, ORE, DT); + TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); + return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); } namespace llvm { @@ -440,7 +535,8 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F, auto &BFI = FAM.getResult(F); auto &ORE = FAM.getResult(F); auto *DT = FAM.getCachedResult(F); - bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT); + auto &TLI = FAM.getResult(F); + bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); if (!Changed) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp index 604726d..cd4f636 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -38,7 +38,7 @@ using PluginChainFinal = PluginChain; template <> class PluginChain<> { public: - PluginChain(Function &F) {} + PluginChain(Function &F, TargetLibraryInfo &TLI) {} void get(InstrProfValueKind K, std::vector &Candidates) {} }; @@ -48,7 +48,8 @@ class PluginChain : public PluginChain { using Base = PluginChain; public: - PluginChain(Function &F) : PluginChain(F), Plugin(F) {} + PluginChain(Function &F, TargetLibraryInfo &TLI) + : PluginChain(F, TLI), Plugin(F, TLI) {} void get(InstrProfValueKind K, std::vector &Candidates) { if (K == PluginT::Kind) @@ -65,8 +66,9 @@ public: using PluginChainFinal::PluginChainFinal; }; -ValueProfileCollector::ValueProfileCollector(Function &F) - : PImpl(new ValueProfileCollectorImpl(F)) {} +ValueProfileCollector::ValueProfileCollector(Function &F, + TargetLibraryInfo &TLI) + : PImpl(new ValueProfileCollectorImpl(F, TLI)) {} ValueProfileCollector::~ValueProfileCollector() = default; diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h index ff883c8..c3f549c 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -16,6 +16,7 @@ #ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H #define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -58,7 +59,7 @@ public: Instruction *AnnotatedInst; // Where metadata is attached. }; - ValueProfileCollector(Function &Fn); + ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI); ValueProfileCollector(ValueProfileCollector &&) = delete; ValueProfileCollector &operator=(ValueProfileCollector &&) = delete; diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc index 361035b..b5dd9fa 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -23,12 +23,14 @@ using CandidateInfo = ValueProfileCollector::CandidateInfo; ///--------------------------- MemIntrinsicPlugin ------------------------------ class MemIntrinsicPlugin : public InstVisitor { Function &F; + TargetLibraryInfo &TLI; std::vector *Candidates; public: static constexpr InstrProfValueKind Kind = IPVK_MemOPSize; - MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {} + MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI) + : F(Fn), TLI(TLI), Candidates(nullptr) {} void run(std::vector &Cs) { Candidates = &Cs; @@ -45,6 +47,22 @@ public: Instruction *AnnotatedInst = &MI; Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); } + void visitCallInst(CallInst &CI) { + auto *F = CI.getCalledFunction(); + if (!F) + return; + LibFunc Func; + if (TLI.getLibFunc(CI, Func) && + (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) { + Value *Length = CI.getArgOperand(2); + // Not instrument constant length calls. + if (dyn_cast(Length)) + return; + Instruction *InsertPt = &CI; + Instruction *AnnotatedInst = &CI; + Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); + } + } }; ///------------------------ IndirectCallPromotionPlugin ------------------------ @@ -54,7 +72,7 @@ class IndirectCallPromotionPlugin { public: static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget; - IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {} + IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {} void run(std::vector &Candidates) { std::vector Result = findIndirectCalls(F); diff --git a/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext b/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext index 400b29d..cce1a67 100644 --- a/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext +++ b/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext @@ -14,7 +14,27 @@ foo # ValueKind = IPVK_MemOPSize: 1 # NumValueSites: -1 +3 +9 +7:33 +2:88 +9:72 +4:66 +1:99 +5:55 +6:44 +3:77 +8:22 +9 +7:33 +2:88 +9:72 +4:66 +1:99 +5:55 +6:44 +3:77 +8:22 9 7:33 2:88 diff --git a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll index a599884..5884a6e 100644 --- a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll +++ b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll @@ -33,6 +33,12 @@ for.body3: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false) ; MEMOP_ANNOTATION: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false) ; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE:[0-9]+]] + %memcmp = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION: call i32 @memcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE]] + %bcmp = call i32 @bcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION: call i32 @bcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE]] ; MEMOP_ANNOTATION9: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22} ; MEMOP_ANNOTATION4: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72} br label %for.inc @@ -56,4 +62,7 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + declare void @llvm.lifetime.end(i64, i8* nocapture) diff --git a/llvm/test/Transforms/PGOProfile/memop_size_opt.ll b/llvm/test/Transforms/PGOProfile/memop_size_opt.ll index 8d6215c..bc79fbc 100644 --- a/llvm/test/Transforms/PGOProfile/memop_size_opt.ll +++ b/llvm/test/Transforms/PGOProfile/memop_size_opt.ll @@ -1,8 +1,8 @@ -; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT -; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT -; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT +; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefix=MEMOP_OPT +; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 --pgo-memop-optimize-memcmp-bcmp -S | FileCheck %s --check-prefix=MEMOP_OPT +; RUN: opt < %s -pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pgo-memop-optimize-memcmp-bcmp -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT ; RUN: FileCheck %s -input-file=%t.opt.yaml --check-prefix=YAML -; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT +; RUN: opt < %s -passes=pgo-memop-opt -verify-dom-info -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -pgo-memop-optimize-memcmp-bcmp -pass-remarks-with-hotness -pass-remarks-output=%t.opt.yaml -S | FileCheck %s --check-prefix=MEMOP_OPT ; RUN: FileCheck %s -input-file=%t.opt.yaml --check-prefix=YAML @@ -57,12 +57,6 @@ for.body3: ; MEMOP_OPT: br label %[[MERGE_LABEL2]] ; MEMOP_OPT: [[MERGE_LABEL2]]: ; MEMOP_OPT: br label %for.inc -; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99} -; Should be 457 total left (original total count 556, minus 99 from specialized -; value 1, which is removed from VP array. Also, we only end up with 5 total -; values, since the default max number of promotions is 5 and therefore -; the rest of the values are ignored when extracting the VP metadata. -; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66} for.inc: %inc = add nsw i32 %j.0, 1 @@ -79,6 +73,83 @@ for.end6: ret void } +declare void @consume(i32 %v1, i32 %v2) + +define void @foo_memcmp_bcmp(i8* %dst, i8* %src, i8* %dst2, i8* %src2, i32* %a, i32 %n) !prof !27 { +entry: + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ] + %cmp = icmp slt i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.end6, !prof !28 + +for.body: + br label %for.cond1 + +for.cond1: + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %idx.ext = sext i32 %i.0 to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = load i32, i32* %add.ptr, align 4 + %cmp2 = icmp slt i32 %j.0, %0 + br i1 %cmp2, label %for.body3, label %for.end, !prof !29 + +for.body3: + %add = add nsw i32 %i.0, 1 + %conv = sext i32 %add to i64 + %memcmp = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv), !prof !30 + %bcmp = call i32 @bcmp(i8* %dst2, i8* %src2, i64 %conv), !prof !31 + call void @consume(i32 %memcmp, i32 %bcmp) + br label %for.inc + +; MEMOP_OPT: switch i64 %conv, label %[[DEFAULT_LABEL:.*]] [ +; MEMOP_OPT: i64 1, label %[[CASE_1_LABEL:.*]] +; MEMOP_OPT: ], !prof [[SWITCH_BW:![0-9]+]] +; MEMOP_OPT: [[CASE_1_LABEL]]: +; MEMOP_OPT: %[[RV:.*]] = call i32 @memcmp(i8* %dst, i8* %src, i64 1) +; MEMOP_OPT: br label %[[MERGE_LABEL:.*]] +; MEMOP_OPT: [[DEFAULT_LABEL]]: +; MEMOP_OPT: %[[RVD:.*]] = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv), !prof [[NEWVP:![0-9]+]] +; MEMOP_OPT: br label %[[MERGE_LABEL]] +; MEMOP_OPT: [[MERGE_LABEL]]: +; MEMOP_OPT: %[[PHI:.*]] = phi i32 [ %[[RVD]], %[[DEFAULT_LABEL]] ], [ %[[RV]], %[[CASE_1_LABEL]] ] +; MEMOP_OPT: switch i64 %conv, label %[[DEFAULT_LABEL2:.*]] [ +; MEMOP_OPT: i64 1, label %[[CASE_1_LABEL2:.*]] +; MEMOP_OPT: ], !prof [[SWITCH_BW:![0-9]+]] +; MEMOP_OPT: [[CASE_1_LABEL2]]: +; MEMOP_OPT: %[[RV2:.*]] = call i32 @bcmp(i8* %dst2, i8* %src2, i64 1) +; MEMOP_OPT: br label %[[MERGE_LABEL2:.*]] +; MEMOP_OPT: [[DEFAULT_LABEL2]]: +; MEMOP_OPT: %[[RVD2:.*]] = call i32 @bcmp(i8* %dst2, i8* %src2, i64 %conv), !prof [[NEWVP]] +; MEMOP_OPT: br label %[[MERGE_LABEL2]] +; MEMOP_OPT: [[MERGE_LABEL2]]: +; MEMOP_OPT: %[[PHI2:.*]] = phi i32 [ %[[RVD2]], %[[DEFAULT_LABEL2]] ], [ %[[RV2]], %[[CASE_1_LABEL2]] ] +; MEMOP_OPT: call void @consume(i32 %[[PHI]], i32 %[[PHI2]]) +; MEMOP_OPT: br label %for.inc + +for.inc: + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: + br label %for.inc4 + +for.inc4: + %inc5 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end6: + ret void +} + +; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99} +; Should be 457 total left (original total count 556, minus 99 from specialized +; value 1, which is removed from VP array. Also, we only end up with 5 total +; values, since the default max number of promotions is 5 and therefore +; the rest of the values are ignored when extracting the VP metadata. +; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"ProfileSummary", !1} @@ -118,6 +189,9 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + declare void @llvm.lifetime.end(i64, i8* nocapture) ; YAML: --- !Passed @@ -127,7 +201,7 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) ; YAML-NEXT: Hotness: 0 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'optimized ' -; YAML-NEXT: - Intrinsic: memcpy +; YAML-NEXT: - Memop: memcpy ; YAML-NEXT: - String: ' with count ' ; YAML-NEXT: - Count: '99' ; YAML-NEXT: - String: ' out of ' @@ -143,7 +217,39 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) ; YAML-NEXT: Hotness: 0 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'optimized ' -; YAML-NEXT: - Intrinsic: memcpy +; YAML-NEXT: - Memop: memcpy +; YAML-NEXT: - String: ' with count ' +; YAML-NEXT: - Count: '99' +; YAML-NEXT: - String: ' out of ' +; YAML-NEXT: - Total: '556' +; YAML-NEXT: - String: ' for ' +; YAML-NEXT: - Versions: '1' +; YAML-NEXT: - String: ' versions' +; YAML-NEXT: ... +; YAML-NEXT: --- !Passed +; YAML-NEXT: Pass: pgo-memop-opt +; YAML-NEXT: Name: memopt-opt +; YAML-NEXT: Function: foo_memcmp_bcmp +; YAML-NEXT: Hotness: 0 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'optimized ' +; YAML-NEXT: - Memop: memcmp +; YAML-NEXT: - String: ' with count ' +; YAML-NEXT: - Count: '99' +; YAML-NEXT: - String: ' out of ' +; YAML-NEXT: - Total: '556' +; YAML-NEXT: - String: ' for ' +; YAML-NEXT: - Versions: '1' +; YAML-NEXT: - String: ' versions' +; YAML-NEXT: ... +; YAML-NEXT: --- !Passed +; YAML-NEXT: Pass: pgo-memop-opt +; YAML-NEXT: Name: memopt-opt +; YAML-NEXT: Function: foo_memcmp_bcmp +; YAML-NEXT: Hotness: 0 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'optimized ' +; YAML-NEXT: - Memop: bcmp ; YAML-NEXT: - String: ' with count ' ; YAML-NEXT: - Count: '99' ; YAML-NEXT: - String: ' out of ' -- 2.7.4