From: Mircea Trofin
Date: Thu, 9 Jan 2020 01:11:23 +0000 (-0800)
Subject: [NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal.
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=76aab66d34446ccf764cf8127b73e1517df75fb4;p=platform%2Fupstream%2Fllvm.git

[NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal.

Summary:
The goal is to simplify experimentation on the cost model. Today,
CallAnalyzer decides two things: legality and benefit. The refactoring
keeps legality assessment in CallAnalyzer and factors benefit evaluation
out into an extension.

Reviewers: davidxl, eraman

Subscribers: kamleshbhalui, fedor.sergeev, hiraditya, baloghadamsoftware, haicheng, a.sidorin, Szelethus, donat.nagy, dkrupp, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71733
---

diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index b5f4192..017301f 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -93,11 +93,12 @@ static cl::opt<bool> OptComputeFullInlineCost(
              "exceeds the threshold."));
 
 namespace {
-
+class InlineCostCallAnalyzer;
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   typedef InstVisitor<CallAnalyzer, bool> Base;
   friend class InstVisitor<CallAnalyzer, bool>;
 
+protected:
   /// The TargetTransformInfo available for this compilation.
   const TargetTransformInfo &TTI;
 
@@ -124,20 +125,86 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// easily cacheable. Instead, use the cover function paramHasAttr.
   CallBase &CandidateCall;
 
-  /// Tunable parameters that control the analysis.
-  const InlineParams &Params;
+  /// Extension points for handling callsite features.
+  /// Called after a basic block was analyzed.
+  virtual void onBlockAnalyzed(const BasicBlock *BB) {}
 
-  /// Upper bound for the inlining cost. Bonuses are being applied to account
-  /// for speculative "expected profit" of the inlining decision.
-  int Threshold;
+  /// Called at the end of the analysis of the callsite. Return the outcome of
+  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
+  /// the reason it can't.
+  virtual InlineResult finalizeAnalysis() { return true; }
 
-  /// Inlining cost measured in abstract units, accounts for all the
-  /// instructions expected to be executed for a given function invocation.
-  /// Instructions that are statically proven to be dead based on call-site
-  /// arguments are not counted here.
-  int Cost = 0;
+  /// Called when we're about to start processing a basic block, and every time
+  /// we are done processing an instruction. Return true if there is no point
+  /// in continuing the analysis (e.g. we've already determined the call site
+  /// is too expensive to inline).
+  virtual bool shouldStop() { return false; }
+
+  /// Called before the analysis of the callee body starts (with callsite
+  /// contexts propagated). It checks callsite-specific information. Return
+  /// the reason the analysis can't continue if that's the case, or 'true' if
+  /// it may continue.
+  virtual InlineResult onAnalysisStart() { return true; }
+
+  /// Called if the analysis engine decides SROA cannot be done for the given
+  /// alloca.
+  virtual void onDisableSROA(AllocaInst *Arg) {}
+
+  /// Called when the analysis engine determines load elimination won't happen.
+  virtual void onDisableLoadElimination() {}
+
+  /// Called to account for a call.
+  virtual void onCallPenalty() {}
+
+  /// Called to account for the expectation that inlining would result in a
+  /// load elimination.
+  virtual void onLoadEliminationOpportunity() {}
+
+  /// Called to account for the cost of argument setup for the Call in the
+  /// callee's body (not the callsite currently under analysis).
+  virtual void onCallArgumentSetup(const CallBase &Call) {}
+
+  /// Called to account for a load_relative intrinsic.
+  virtual void onLoadRelativeIntrinsic() {}
+
+  /// Called to account for a lowered call.
+  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
+  }
 
-  bool ComputeFullInlineCost;
+  /// Account for a jump table of given size. Return false to stop further
+  /// processing of the switch instruction.
+  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
+
+  /// Account for a case cluster of given size. Return false to stop further
+  /// processing of the instruction.
+  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
+
+  /// Called at the end of processing a switch instruction, with the given
+  /// number of case clusters.
+  virtual void onFinalizeSwitch(unsigned JumpTableSize,
+                                unsigned NumCaseCluster) {}
+
+  /// Called to account for any other instruction not specifically accounted
+  /// for.
+  virtual void onCommonInstructionSimplification() {}
+
+  /// Start accounting potential benefits due to SROA for the given alloca.
+  virtual void onInitializeSROAArg(AllocaInst *Arg) {}
+
+  /// Account SROA savings for the AllocaInst value.
+  virtual void onAggregateSROAUse(AllocaInst *V) {}
+
+  bool handleSROA(Value *V, bool DoNotDisable) {
+    // Check for SROA candidates.
+    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
+      if (DoNotDisable) {
+        onAggregateSROAUse(SROAArg);
+        return true;
+      }
+      disableSROA(SROAArg);
+    }
+    return false;
+  }
 
   bool IsCallerRecursive = false;
   bool IsRecursiveCall = false;
@@ -149,20 +216,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool HasUninlineableIntrinsic = false;
   bool InitsVargArgs = false;
 
-  /// Attempt to evaluate indirect calls to boost its inline cost.
-  bool BoostIndirectCalls;
-
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize = 0;
   unsigned NumInstructions = 0;
   unsigned NumVectorInstructions = 0;
 
-  /// Bonus to be applied when percentage of vector instructions in callee is
-  /// high (see more details in updateThreshold).
-  int VectorBonus = 0;
-  /// Bonus to be applied when the callee has only one reachable basic block.
-  int SingleBBBonus = 0;
-
   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
   /// idea is to propagate any special information we have about arguments to
@@ -174,12 +232,10 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
 
   /// Keep track of the values which map back (through function arguments) to
   /// allocas on the caller stack which could be simplified through SROA.
-  DenseMap<Value *, Value *> SROAArgValues;
-
-  /// The mapping of caller Alloca values to their accumulated cost savings. If
-  /// we have to disable SROA for one of the allocas, this tells us how much
-  /// cost must be added.
-  DenseMap<Value *, int> SROAArgCosts;
+  /// We can disable an association (because for some reason the SROA
+  /// opportunity is lost) by setting the value to nullptr. We don't delete it
+  /// because we still want isAllocaDerivedArg to function correctly.
+  DenseMap<Value *, AllocaInst *> SROAArgValues;
 
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
@@ -196,17 +252,18 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// loads.
   bool EnableLoadElimination;
   SmallPtrSet<Value *, 16> LoadAddrSet;
-  int LoadEliminationCost = 0;
+
+  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
+    auto It = SROAArgValues.find(V);
+    if (It == SROAArgValues.end())
+      return nullptr;
+    return It->second;
+  }
 
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
-  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
-                            DenseMap<Value *, int>::iterator &CostIt);
-  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
   void disableSROA(Value *V);
   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
-  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
-                          int InstructionCost);
   void disableLoadElimination();
   bool isGEPFree(GetElementPtrInst &GEP);
   bool canFoldInboundsGEP(GetElementPtrInst &I);
@@ -227,32 +284,13 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// inlined through this particular callsite.
   bool isKnownNonNullInCallee(Value *V);
 
-  /// Update Threshold based on callsite properties such as callee
-  /// attributes and callee hotness for PGO builds. The Callee is explicitly
-  /// passed to support analyzing indirect calls whose target is inferred by
-  /// analysis.
-  void updateThreshold(CallBase &Call, Function &Callee);
-
   /// Return true if size growth is allowed when inlining the callee at \p Call.
   bool allowSizeGrowth(CallBase &Call);
 
-  /// Return true if \p Call is a cold callsite.
-  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
-
-  /// Return a higher threshold if \p Call is a hot callsite.
-  Optional<int> getHotCallSiteThreshold(CallBase &Call,
-                                        BlockFrequencyInfo *CallerBFI);
-
   // Custom analysis routines.
   InlineResult analyzeBlock(BasicBlock *BB,
                             SmallPtrSetImpl<const Value *> &EphValues);
 
-  /// Handle a capped 'int' increment for Cost.
-  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
-    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
-    Cost = (int)std::min(UpperBound, Cost + Inc);
-  }
-
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
   void visit(Module *);
@@ -298,20 +336,13 @@ public:
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
-               Function &Callee, CallBase &Call, const InlineParams &Params,
-               bool BoostIndirect = true)
+               Function &Callee, CallBase &Call)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
-        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
-        ComputeFullInlineCost(OptComputeFullInlineCost ||
-                              Params.ComputeFullInlineCost || ORE),
-        BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {}
+        CandidateCall(Call), EnableLoadElimination(true) {}
 
   InlineResult analyze();
 
-  int getThreshold() { return Threshold; }
-  int getCost() { return Cost; }
-
   // Keep a bunch of stats about the cost savings found so we can print them
   // out when debugging.
   unsigned NumConstantArgs = 0;
@@ -320,12 +351,283 @@ public:
   unsigned NumConstantPtrCmps = 0;
   unsigned NumConstantPtrDiffs = 0;
   unsigned NumInstructionsSimplified = 0;
+
+  void dump();
+};
+
+/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note the
+/// FIXME in onLoweredCall, where an InlineCostCallAnalyzer is instantiated.
+class InlineCostCallAnalyzer final : public CallAnalyzer {
+  const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+  const bool ComputeFullInlineCost;
+  int LoadEliminationCost = 0;
+  /// Bonus to be applied when percentage of vector instructions in callee is
+  /// high (see more details in updateThreshold).
+  int VectorBonus = 0;
+  /// Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus = 0;
+
+  /// Tunable parameters that control the analysis.
+  const InlineParams &Params;
+
+  /// Upper bound for the inlining cost. Bonuses are being applied to account
+  /// for speculative "expected profit" of the inlining decision.
+  int Threshold = 0;
+
+  /// Attempt to evaluate indirect calls to boost their inline cost.
+  const bool BoostIndirectCalls;
+
+  /// Inlining cost measured in abstract units, accounts for all the
+  /// instructions expected to be executed for a given function invocation.
+  /// Instructions that are statically proven to be dead based on call-site
+  /// arguments are not counted here.
+  int Cost = 0;
+
+  bool SingleBB = true;
+
   unsigned SROACostSavings = 0;
   unsigned SROACostSavingsLost = 0;
 
+  /// The mapping of caller Alloca values to their accumulated cost savings. If
+  /// we have to disable SROA for one of the allocas, this tells us how much
+  /// cost must be added.
+  DenseMap<AllocaInst *, int> SROAArgCosts;
+
+  /// Return true if \p Call is a cold callsite.
+  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
+
+  /// Update Threshold based on callsite properties such as callee
+  /// attributes and callee hotness for PGO builds. The Callee is explicitly
+  /// passed to support analyzing indirect calls whose target is inferred by
+  /// analysis.
+  void updateThreshold(CallBase &Call, Function &Callee);
+  /// Return a higher threshold if \p Call is a hot callsite.
+  Optional<int> getHotCallSiteThreshold(CallBase &Call,
+                                        BlockFrequencyInfo *CallerBFI);
+
+  /// Handle a capped 'int' increment for Cost.
+  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
+    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
+    Cost = (int)std::min(UpperBound, Cost + Inc);
+  }
+
+  void onDisableSROA(AllocaInst *Arg) override {
+    auto CostIt = SROAArgCosts.find(Arg);
+    if (CostIt == SROAArgCosts.end())
+      return;
+    addCost(CostIt->second);
+    SROACostSavings -= CostIt->second;
+    SROACostSavingsLost += CostIt->second;
+    SROAArgCosts.erase(CostIt);
+  }
+
+  void onDisableLoadElimination() override {
+    addCost(LoadEliminationCost);
+    LoadEliminationCost = 0;
+  }
+  void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
+  void onCallArgumentSetup(const CallBase &Call) override {
+    // Pay the price of the argument setup. We account for the average 1
+    // instruction per call argument setup here.
+    addCost(Call.arg_size() * InlineConstants::InstrCost);
+  }
+  void onLoadRelativeIntrinsic() override {
+    // This is normally lowered to 4 LLVM instructions.
+    addCost(3 * InlineConstants::InstrCost);
+  }
+  void onLoweredCall(Function *F, CallBase &Call,
+                     bool IsIndirectCall) override {
+    // We account for the average 1 instruction per call argument setup here.
+    addCost(Call.arg_size() * InlineConstants::InstrCost);
+
+    // If we have a constant that we are calling as a function, we can peer
+    // through it and see the function target. This happens not infrequently
+    // during devirtualization and so we want to give it a hefty bonus for
+    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
+    // Pretend to inline the function, with a custom threshold.
+    if (IsIndirectCall && BoostIndirectCalls) {
+      auto IndirectCallParams = Params;
+      IndirectCallParams.DefaultThreshold =
+          InlineConstants::IndirectCallThreshold;
+      // FIXME: if InlineCostCallAnalyzer is derived from, this may need
+      // to instantiate the derived class.
+      InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
+                                Call, IndirectCallParams, false);
+      if (CA.analyze()) {
+        // We were able to inline the indirect call! Subtract the cost from the
+        // threshold to get the bonus we want to apply, but don't go below zero.
+        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+      }
+    } else
+      // Otherwise simply add the cost for merely making the call.
+      addCost(InlineConstants::CallPenalty);
+  }
+
+  void onFinalizeSwitch(unsigned JumpTableSize,
+                        unsigned NumCaseCluster) override {
+    // If suitable for a jump table, consider the cost for the table size and
+    // branch to destination.
+    // Maximum valid cost increased in this function.
+    if (JumpTableSize) {
+      int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+                       4 * InlineConstants::InstrCost;
+
+      addCost(JTCost, (int64_t)CostUpperBound);
+      return;
+    }
+    // Considering forming a binary search, we should find the number of nodes
+    // which is the same as the number of comparisons when lowered. For a given
+    // number of clusters, n, we can define a recursive function, f(n), to find
+    // the number of nodes in the tree. The recursion is:
+    //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
+    //   f(n) = n, when n <= 3.
+    // This will lead to a binary tree where the leaf should be either f(2) or
+    // f(3) when n > 3. So, the number of comparisons from leaves should be n,
+    // while the number of non-leaf nodes should be:
+    //   2^(log2(n) - 1) - 1
+    //     = 2^log2(n) * 2^-1 - 1
+    //     = n / 2 - 1.
+    // Considering comparisons from leaf and non-leaf nodes, we can estimate
+    // the number of comparisons in a simple closed form:
+    //   n + n / 2 - 1 = n * 3 / 2 - 1
+    if (NumCaseCluster <= 3) {
+      // Suppose a comparison includes one compare and one conditional branch.
+      addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
+      return;
+    }
+
+    int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+    int64_t SwitchCost =
+        ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+    addCost(SwitchCost, (int64_t)CostUpperBound);
+  }
+  void onCommonInstructionSimplification() override {
+    addCost(InlineConstants::InstrCost);
+  }
+
+  void onInitializeSROAArg(AllocaInst *Arg) override { SROAArgCosts[Arg] = 0; }
+
+  void onAggregateSROAUse(AllocaInst *SROAArg) override {
+    auto CostIt = SROAArgCosts.find(SROAArg);
+    CostIt->second += InlineConstants::InstrCost;
+    SROACostSavings += InlineConstants::InstrCost;
+  }
+
+  virtual void onBlockAnalyzed(const BasicBlock *BB) override {
+    auto *TI = BB->getTerminator();
+    // If we had any successors at this point, then post-inlining is likely to
+    // have them as well. Note that we assume any basic blocks which existed
+    // due to branches or switches which folded above will also fold after
+    // inlining.
+    if (SingleBB && TI->getNumSuccessors() > 1) {
+      // Take off the bonus we applied to the threshold.
+      Threshold -= SingleBBBonus;
+      SingleBB = false;
+    }
+  }
+  virtual InlineResult finalizeAnalysis() override {
+    // Loops generally act a lot like calls in that they act like barriers to
+    // movement, require a certain amount of setup, etc. So when optimising for
+    // size, we penalise any call sites that perform loops. We do this after all
+    // other costs here, so will likely only be dealing with relatively small
+    // functions (and hence DT and LI will hopefully be cheap).
+    auto *Caller = CandidateCall.getFunction();
+    if (Caller->hasMinSize()) {
+      DominatorTree DT(F);
+      LoopInfo LI(DT);
+      int NumLoops = 0;
+      for (Loop *L : LI) {
+        // Ignore loops that will not be executed.
+        if (DeadBlocks.count(L->getHeader()))
+          continue;
+        NumLoops++;
+      }
+      addCost(NumLoops * InlineConstants::CallPenalty);
+    }
+
+    // We applied the maximum possible vector bonus at the beginning. Now,
+    // subtract the excess bonus, if any, from the Threshold before
+    // comparing against Cost.
+    if (NumVectorInstructions <= NumInstructions / 10)
+      Threshold -= VectorBonus;
+    else if (NumVectorInstructions <= NumInstructions / 2)
+      Threshold -= VectorBonus / 2;
+
+    return Cost < std::max(1, Threshold);
+  }
+  virtual bool shouldStop() override {
+    // Bail out the moment we cross the threshold. This means we'll under-count
+    // the cost, but only when undercounting doesn't matter.
+    return Cost >= Threshold && !ComputeFullInlineCost;
+  }
+
+  virtual void onLoadEliminationOpportunity() override {
+    LoadEliminationCost += InlineConstants::InstrCost;
+  }
+
+  InlineResult onAnalysisStart() override {
+    // Perform some tweaks to the cost and threshold based on the direct
+    // callsite information.
+
+    // We want to more aggressively inline vector-dense kernels, so up the
+    // threshold, and we'll lower it if the % of vector instructions gets too
+    // low. Note that these bonuses are somewhat arbitrary and evolved over
+    // time by accident as much as because they are principled bonuses.
+    //
+    // FIXME: It would be nice to remove all such bonuses. At least it would be
+    // nice to base the bonus values on something more scientific.
+    assert(NumInstructions == 0);
+    assert(NumVectorInstructions == 0);
+
+    // Update the threshold based on callsite properties.
+    updateThreshold(CandidateCall, F);
+
+    // While Threshold depends on commandline options that can take negative
+    // values, we want to enforce the invariant that the computed threshold and
+    // bonuses are non-negative.
+    assert(Threshold >= 0);
+    assert(SingleBBBonus >= 0);
+    assert(VectorBonus >= 0);
+
+    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+    // this Threshold any time, and cost cannot decrease, we can stop processing
+    // the rest of the function body.
+    Threshold += (SingleBBBonus + VectorBonus);
+
+    // Give out bonuses for the callsite, as the instructions setting them up
+    // will be gone after inlining.
+    addCost(-getCallsiteCost(this->CandidateCall, DL));
+
+    // If this function uses the coldcc calling convention, prefer not to
+    // inline it.
+    if (F.getCallingConv() == CallingConv::Cold)
+      Cost += InlineConstants::ColdccPenalty;
+
+    // Check if we're done. This can happen due to bonuses and penalties.
+    if (Cost >= Threshold && !ComputeFullInlineCost)
+      return "high cost";
+
+    return true;
+  }
+
+public:
+  InlineCostCallAnalyzer(
+      const TargetTransformInfo &TTI,
+      std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+      Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
+      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
+      CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
+      : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
+        ComputeFullInlineCost(OptComputeFullInlineCost ||
+                              Params.ComputeFullInlineCost || ORE),
+        Params(Params), Threshold(Params.DefaultThreshold),
+        BoostIndirectCalls(BoostIndirect) {}
   void dump();
-};
+
+  int getThreshold() { return Threshold; }
+  int getCost() { return Cost; }
+};
 } // namespace
 
 /// Test whether the given value is an Alloca-derived function argument.
@@ -333,55 +635,22 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
   return SROAArgValues.count(V);
 }
 
-/// Lookup the SROA-candidate argument and cost iterator which V maps to.
-/// Returns false if V does not map to a SROA-candidate.
-bool CallAnalyzer::lookupSROAArgAndCost(
-    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
-  if (SROAArgValues.empty() || SROAArgCosts.empty())
-    return false;
-
-  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
-  if (ArgIt == SROAArgValues.end())
-    return false;
-
-  Arg = ArgIt->second;
-  CostIt = SROAArgCosts.find(Arg);
-  return CostIt != SROAArgCosts.end();
-}
-
-/// Disable SROA for the candidate marked by this cost iterator.
-///
-/// This marks the candidate as no longer viable for SROA, and adds the cost
-/// savings associated with it back into the inline cost measurement.
-void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
-  // If we're no longer able to perform SROA we need to undo its cost savings
-  // and prevent subsequent analysis.
-  addCost(CostIt->second);
-  SROACostSavings -= CostIt->second;
-  SROACostSavingsLost += CostIt->second;
-  SROAArgCosts.erase(CostIt);
-  disableLoadElimination();
-}
-
 /// If 'V' maps to a SROA candidate, disable SROA for it.
 void CallAnalyzer::disableSROA(Value *V) {
-  Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
-    disableSROA(CostIt);
-}
-
-/// Accumulate the given cost for a particular SROA candidate.
-void CallAnalyzer::accumulateSROACost(DenseMap::iterator CostIt, - int InstructionCost) { - CostIt->second += InstructionCost; - SROACostSavings += InstructionCost; + auto It = SROAArgValues.find(V); + if (It == SROAArgValues.end()) + return; + auto *SROAArg = It->second; + if (!SROAArg) + return; + It->second = nullptr; + onDisableSROA(SROAArg); + disableLoadElimination(); } void CallAnalyzer::disableLoadElimination() { if (EnableLoadElimination) { - addCost(LoadEliminationCost); - LoadEliminationCost = 0; + onDisableLoadElimination(); EnableLoadElimination = false; } } @@ -553,9 +822,7 @@ bool CallAnalyzer::visitPHI(PHINode &I) { if (FirstBaseAndOffset.first) { ConstantOffsetPtrs[&I] = FirstBaseAndOffset; - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(FirstV)) SROAArgValues[&I] = SROAArg; } @@ -585,10 +852,8 @@ bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { } bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { - Value *SROAArg; DenseMap::iterator CostIt; - bool SROACandidate = - lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); + auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand()); // Lambda to check whether a GEP's indices are all constant. auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { @@ -599,7 +864,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { }; if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { - if (SROACandidate) + if (SROAArg) SROAArgValues[&I] = SROAArg; // Constant GEPs are modeled as free. @@ -607,8 +872,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { } // Variable GEPs will require math and will disable SROA. - if (SROACandidate) - disableSROA(CostIt); + if (SROAArg) + disableSROA(SROAArg); return isGEPFree(I); } @@ -648,9 +913,7 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) { ConstantOffsetPtrs[&I] = BaseAndOffset; // Also look for SROA candidates here. - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) SROAArgValues[&I] = SROAArg; // Bitcasts are always zero cost. @@ -682,9 +945,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { // and so we can just add the integer in here. The only places where SROA is // preserved either cannot fire on an integer, or won't in-and-of themselves // disable SROA (ext) w/o some later use that we would see and disable. - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) SROAArgValues[&I] = SROAArg; return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); @@ -708,9 +969,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { } // "Propagate" SROA here in the same manner as we do for ptrtoint above. 
-  Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
+  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
     SROAArgValues[&I] = SROAArg;
 
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -737,7 +996,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
   case Instruction::FPToUI:
   case Instruction::FPToSI:
     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
-      addCost(InlineConstants::CallPenalty);
+      onCallPenalty();
     break;
   default:
     break;
@@ -810,8 +1069,8 @@ bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
   return true;
 }
 
-bool CallAnalyzer::isColdCallSite(CallBase &Call,
-                                  BlockFrequencyInfo *CallerBFI) {
+bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
+                                            BlockFrequencyInfo *CallerBFI) {
   // If global profile summary is available, then callsite's coldness is
   // determined based on that.
   if (PSI && PSI->hasProfileSummary())
@@ -834,8 +1093,8 @@ bool CallAnalyzer::isColdCallSite(CallBase &Call,
 }
 
 Optional<int>
-CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
-                                      BlockFrequencyInfo *CallerBFI) {
+InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
+                                                BlockFrequencyInfo *CallerBFI) {
 
   // If global profile summary is available, then callsite's hotness is
   // determined based on that.
@@ -862,7 +1121,7 @@ CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
   return None;
 }
 
-void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
+void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   // If no size growth is allowed for this inlining, set Threshold to 0.
   if (!allowSizeGrowth(Call)) {
     Threshold = 0;
@@ -1024,19 +1283,7 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
                 : ConstantInt::getFalse(I.getType());
       return true;
     }
-  // Finally check for SROA candidates in comparisons.
-  Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
-    if (isa<ConstantPointerNull>(I.getOperand(1))) {
-      accumulateSROACost(CostIt, InlineConstants::InstrCost);
-      return true;
-    }
-
-    disableSROA(CostIt);
-  }
-
-  return false;
+  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
 }
 
 bool CallAnalyzer::visitSub(BinaryOperator &I) {
@@ -1100,7 +1347,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   if (I.getType()->isFloatingPointTy() &&
       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
       !match(&I, m_FNeg(m_Value())))
-    addCost(InlineConstants::CallPenalty);
+    onCallPenalty();
 
   return false;
 }
@@ -1127,23 +1374,15 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
 }
 
 bool CallAnalyzer::visitLoad(LoadInst &I) {
-  Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
-  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
-    if (I.isSimple()) {
-      accumulateSROACost(CostIt, InlineConstants::InstrCost);
-      return true;
-    }
-
-    disableSROA(CostIt);
-  }
+  if (handleSROA(I.getPointerOperand(), I.isSimple()))
+    return true;
 
   // If the data is already loaded from this address and hasn't been clobbered
   // by any stores or calls, this load is likely to be redundant and can be
   // eliminated.
if (EnableLoadElimination && !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) { - LoadEliminationCost += InlineConstants::InstrCost; + onLoadEliminationOpportunity(); return true; } @@ -1151,16 +1390,8 @@ bool CallAnalyzer::visitLoad(LoadInst &I) { } bool CallAnalyzer::visitStore(StoreInst &I) { - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { - if (I.isSimple()) { - accumulateSROACost(CostIt, InlineConstants::InstrCost); - return true; - } - - disableSROA(CostIt); - } + if (handleSROA(I.getPointerOperand(), I.isSimple())) + return true; // The store can potentially clobber loads and prevent repeated loads from // being eliminated. @@ -1250,9 +1481,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { // in this inline context. If not, we've done all we can. F = dyn_cast_or_null(SimplifiedValues.lookup(Callee)); if (!F) { - // Pay the price of the argument setup. We account for the average 1 - // instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); + onCallArgumentSetup(Call); if (!Call.onlyReadsMemory()) disableLoadElimination(); @@ -1276,8 +1505,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { return Base::visitCallBase(Call); case Intrinsic::load_relative: - // This is normally lowered to 4 LLVM instructions. - addCost(3 * InlineConstants::InstrCost); + onLoadRelativeIntrinsic(); return false; case Intrinsic::memset: @@ -1304,28 +1532,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { } if (TTI.isLoweredToCall(F)) { - // We account for the average 1 instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); - - // If we have a constant that we are calling as a function, we can peer - // through it and see the function target. This happens not infrequently - // during devirtualization and so we want to give it a hefty bonus for - // inlining, but cap that bonus in the event that inlining wouldn't pan out. - // Pretend to inline the function, with a custom threshold. - if (IsIndirectCall && BoostIndirectCalls) { - auto IndirectCallParams = Params; - IndirectCallParams.DefaultThreshold = - InlineConstants::IndirectCallThreshold; - CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call, - IndirectCallParams, false); - if (CA.analyze()) { - // We were able to inline the indirect call! Subtract the cost from the - // threshold to get the bonus we want to apply, but don't go below zero. - Cost -= std::max(0, CA.getThreshold() - CA.getCost()); - } - } else - // Otherwise simply add the cost for merely making the call. 
- addCost(InlineConstants::CallPenalty); + onLoweredCall(F, Call, IsIndirectCall); } if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) @@ -1381,9 +1588,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) { if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal)) SROAArgValues[&SI] = SROAArg; return true; } @@ -1422,9 +1627,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) { if (BaseAndOffset.first) { ConstantOffsetPtrs[&SI] = BaseAndOffset; - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV)) SROAArgValues[&SI] = SROAArg; } @@ -1452,49 +1655,12 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // inlining those. It will prevent inlining in cases where the optimization // does not (yet) fire. - // Maximum valid cost increased in this function. - int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; - unsigned JumpTableSize = 0; BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); - // If suitable for a jump table, consider the cost for the table size and - // branch to destination. - if (JumpTableSize) { - int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + - 4 * InlineConstants::InstrCost; - - addCost(JTCost, (int64_t)CostUpperBound); - return false; - } - - // Considering forming a binary search, we should find the number of nodes - // which is same as the number of comparisons when lowered. For a given - // number of clusters, n, we can define a recursive function, f(n), to find - // the number of nodes in the tree. The recursion is : - // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, - // and f(n) = n, when n <= 3. - // This will lead a binary tree where the leaf should be either f(2) or f(3) - // when n > 3. So, the number of comparisons from leaves should be n, while - // the number of non-leaf should be : - // 2^(log2(n) - 1) - 1 - // = 2^log2(n) * 2^-1 - 1 - // = n / 2 - 1. - // Considering comparisons from leaf and non-leaf nodes, we can estimate the - // number of comparisons in a simple closed form : - // n + n / 2 - 1 = n * 3 / 2 - 1 - if (NumCaseCluster <= 3) { - // Suppose a comparison includes one compare and one conditional branch. - addCost(NumCaseCluster * 2 * InlineConstants::InstrCost); - return false; - } - - int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; - int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; - - addCost(SwitchCost, (int64_t)CostUpperBound); + onFinalizeSwitch(JumpTableSize, NumCaseCluster); return false; } @@ -1587,7 +1753,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, if (Base::visit(&*I)) ++NumInstructionsSimplified; else - addCost(InlineConstants::InstrCost); + onCommonInstructionSimplification(); using namespace ore; // If the visit this instruction detected an uninlinable pattern, abort. @@ -1632,9 +1798,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, return IR; } - // Check if we've passed the maximum possible threshold so we don't spin in - // huge basic blocks that will never inline. 
-    if (Cost >= Threshold && !ComputeFullInlineCost)
+    if (shouldStop())
       return false;
   }
 
@@ -1728,46 +1892,9 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
 InlineResult CallAnalyzer::analyze() {
   ++NumCallsAnalyzed;
 
-  // Perform some tweaks to the cost and threshold based on the direct
-  // callsite information.
-
-  // We want to more aggressively inline vector-dense kernels, so up the
-  // threshold, and we'll lower it if the % of vector instructions gets too
-  // low. Note that these bonuses are some what arbitrary and evolved over time
-  // by accident as much as because they are principled bonuses.
-  //
-  // FIXME: It would be nice to remove all such bonuses. At least it would be
-  // nice to base the bonus values on something more scientific.
-  assert(NumInstructions == 0);
-  assert(NumVectorInstructions == 0);
-
-  // Update the threshold based on callsite properties
-  updateThreshold(CandidateCall, F);
-
-  // While Threshold depends on commandline options that can take negative
-  // values, we want to enforce the invariant that the computed threshold and
-  // bonuses are non-negative.
-  assert(Threshold >= 0);
-  assert(SingleBBBonus >= 0);
-  assert(VectorBonus >= 0);
-
-  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
-  // this Threshold any time, and cost cannot decrease, we can stop processing
-  // the rest of the function body.
-  Threshold += (SingleBBBonus + VectorBonus);
-
-  // Give out bonuses for the callsite, as the instructions setting them up
-  // will be gone after inlining.
-  addCost(-getCallsiteCost(CandidateCall, DL));
-
-  // If this function uses the coldcc calling convention, prefer not to inline
-  // it.
-  if (F.getCallingConv() == CallingConv::Cold)
-    Cost += InlineConstants::ColdccPenalty;
-
-  // Check if we're done. This can happen due to bonuses and penalties.
-  if (Cost >= Threshold && !ComputeFullInlineCost)
-    return "high cost";
+  auto Result = onAnalysisStart();
+  if (!Result)
+    return Result;
 
   if (F.empty())
     return true;
@@ -1796,9 +1923,9 @@ InlineResult CallAnalyzer::analyze() {
         ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
 
         // We can SROA any pointer arguments derived from alloca instructions.
-        if (isa<AllocaInst>(PtrArg)) {
-          SROAArgValues[&*FAI] = PtrArg;
-          SROAArgCosts[PtrArg] = 0;
+        if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
+          SROAArgValues[&*FAI] = SROAArg;
+          onInitializeSROAArg(SROAArg);
         }
       }
     }
@@ -1824,12 +1951,10 @@ InlineResult CallAnalyzer::analyze() {
       BBSetVector;
   BBSetVector BBWorklist;
   BBWorklist.insert(&F.getEntryBlock());
-  bool SingleBB = true;
+
   // Note that we *must not* cache the size, this loop grows the worklist.
   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
-    // Bail out the moment we cross the threshold. This means we'll under-count
-    // the cost, but only when undercounting doesn't matter.
-    if (Cost >= Threshold && !ComputeFullInlineCost)
+    if (shouldStop())
       break;
 
     BasicBlock *BB = BBWorklist[Idx];
@@ -1889,15 +2014,7 @@ InlineResult CallAnalyzer::analyze() {
          ++TIdx)
       BBWorklist.insert(TI->getSuccessor(TIdx));
 
-    // If we had any successors at this point, than post-inlining is likely to
-    // have them as well. Note that we assume any basic blocks which existed
-    // due to branches or switches which folded above will also fold after
-    // inlining.
-    if (SingleBB && TI->getNumSuccessors() > 1) {
-      // Take off the bonus we applied to the threshold.
- Threshold -= SingleBBBonus; - SingleBB = false; - } + onBlockAnalyzed(BB); } bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() && @@ -1908,38 +2025,12 @@ InlineResult CallAnalyzer::analyze() { if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) return "noduplicate"; - // Loops generally act a lot like calls in that they act like barriers to - // movement, require a certain amount of setup, etc. So when optimising for - // size, we penalise any call sites that perform loops. We do this after all - // other costs here, so will likely only be dealing with relatively small - // functions (and hence DT and LI will hopefully be cheap). - if (Caller->hasMinSize()) { - DominatorTree DT(F); - LoopInfo LI(DT); - int NumLoops = 0; - for (Loop *L : LI) { - // Ignore loops that will not be executed - if (DeadBlocks.count(L->getHeader())) - continue; - NumLoops++; - } - addCost(NumLoops * InlineConstants::CallPenalty); - } - - // We applied the maximum possible vector bonus at the beginning. Now, - // subtract the excess bonus, if any, from the Threshold before - // comparing against Cost. - if (NumVectorInstructions <= NumInstructions / 10) - Threshold -= VectorBonus; - else if (NumVectorInstructions <= NumInstructions / 2) - Threshold -= VectorBonus / 2; - - return Cost < std::max(1, Threshold); + return finalizeAnalysis(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Dump stats about this call's analysis. -LLVM_DUMP_METHOD void CallAnalyzer::dump() { +LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" DEBUG_PRINT_STAT(NumConstantArgs); DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); @@ -2073,8 +2164,8 @@ InlineCost llvm::getInlineCost( LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "... (caller:" << Caller->getName() << ")\n"); - CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, - Call, Params); + InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, + *Callee, Call, Params); InlineResult ShouldInline = CA.analyze(); LLVM_DEBUG(CA.dump()); @@ -2121,15 +2212,16 @@ InlineResult llvm::isInlineViable(Function &F) { switch (Call->getCalledFunction()->getIntrinsicID()) { default: break; - // Disallow inlining of @llvm.icall.branch.funnel because current - // backend can't separate call targets from call arguments. + // Disallow inlining of @llvm.icall.branch.funnel because current + // backend can't separate call targets from call arguments. case llvm::Intrinsic::icall_branch_funnel: return "disallowed inlining of @llvm.icall.branch.funnel"; - // Disallow inlining functions that call @llvm.localescape. Doing this - // correctly would require major changes to the inliner. + // Disallow inlining functions that call @llvm.localescape. Doing this + // correctly would require major changes to the inliner. case llvm::Intrinsic::localescape: return "disallowed inlining of @llvm.localescape"; - // Disallow inlining of functions that initialize VarArgs with va_start. + // Disallow inlining of functions that initialize VarArgs with + // va_start. case llvm::Intrinsic::vastart: return "contains VarArgs initialized with va_start"; }
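
To illustrate the intended use of the extension points this patch introduces: an experiment subclasses CallAnalyzer, overrides only the on* callbacks its cost model needs, and leaves legality vetting (recursion, returns_twice, indirectbr, and so on) to the base traversal. The sketch below is illustrative only and is not part of this patch; InstructionCountCallAnalyzer and InstructionBudget are invented names, and the code assumes it sits in InlineCost.cpp's anonymous namespace next to CallAnalyzer.

// Hypothetical cost model (not in this patch): inline only callees whose
// count of non-simplifiable instructions fits a fixed budget.
class InstructionCountCallAnalyzer final : public CallAnalyzer {
  unsigned Count = 0;
  const unsigned InstructionBudget; // Invented knob, not an LLVM constant.

  // Charge one unit per instruction the traversal could not simplify away.
  void onCommonInstructionSimplification() override { ++Count; }

  // Stop walking the callee body once the budget is exhausted.
  bool shouldStop() override { return Count > InstructionBudget; }

  // Legality was already established by CallAnalyzer::analyze(); this only
  // renders the benefit verdict.
  InlineResult finalizeAnalysis() override {
    if (Count > InstructionBudget)
      return "exceeds instruction budget";
    return true;
  }

public:
  InstructionCountCallAnalyzer(
      const TargetTransformInfo &TTI,
      std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
      Function &Callee, CallBase &Call, unsigned Budget)
      : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
        InstructionBudget(Budget) {}
};

A caller would then construct an InstructionCountCallAnalyzer and invoke analyze() on it, exactly as getInlineCost does with InlineCostCallAnalyzer; an InlineResult that converts to false carries the rejection reason.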