From 338a601612ca36e112b14f622eb310985b93192a Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Wed, 8 Jan 2020 17:42:23 -0800
Subject: [PATCH] Revert "[NFC][InlineCost] Factor cost modeling out of
 CallAnalyzer traversal."

This reverts commit 76aab66d34446ccf764cf8127b73e1517df75fb4.

Failure:
http://lab.llvm.org:8011/builders/clang-with-thin-lto-ubuntu/builds/20562,
will investigate and resubmit.
---
 llvm/lib/Analysis/InlineCost.cpp | 752 +++++++++++++++++----------------------
 1 file changed, 330 insertions(+), 422 deletions(-)

diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 017301f..b5f4192 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -93,12 +93,11 @@ static cl::opt<bool> OptComputeFullInlineCost(
         "exceeds the threshold."));
 
 namespace {
-class InlineCostCallAnalyzer;
+
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   typedef InstVisitor<CallAnalyzer, bool> Base;
   friend class InstVisitor<CallAnalyzer, bool>;
 
-protected:
   /// The TargetTransformInfo available for this compilation.
   const TargetTransformInfo &TTI;
 
@@ -125,86 +124,20 @@
   /// easily cacheable. Instead, use the cover function paramHasAttr.
   CallBase &CandidateCall;
 
-  /// Extension points for handling callsite features.
-  /// Called after a basic block was analyzed.
-  virtual void onBlockAnalyzed(const BasicBlock *BB) {}
-
-  /// Called at the end of the analysis of the callsite. Return the outcome of
-  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
-  /// the reason it can't.
-  virtual InlineResult finalizeAnalysis() { return true; }
-
-  /// Called when we're about to start processing a basic block, and every time
-  /// we are done processing an instruction. Return true if there is no point in
-  /// continuing the analysis (e.g. we've determined already the call site is
-  /// too expensive to inline)
-  virtual bool shouldStop() { return false; }
-
-  /// Called before the analysis of the callee body starts (with callsite
-  /// contexts propagated). It checks callsite-specific information. Return a
-  /// reason analysis can't continue if that's the case, or 'true' if it may
-  /// continue.
-  virtual InlineResult onAnalysisStart() { return true; }
-
-  /// Called if the analysis engine decides SROA cannot be done for the given
-  /// alloca.
-  virtual void onDisableSROA(AllocaInst *Arg) {}
-
-  /// Called the analysis engine determines load elimination won't happen.
-  virtual void onDisableLoadElimination() {}
-
-  /// Called to account for a call.
-  virtual void onCallPenalty() {}
-
-  /// Called to account for the expectation the inlining would result in a load
-  /// elimination.
-  virtual void onLoadEliminationOpportunity() {}
-
-  /// Called to account for the cost of argument setup for the Call in the
-  /// callee's body (not the callsite currently under analysis).
-  virtual void onCallArgumentSetup(const CallBase &Call) {}
-
-  /// Called to account for a load relative intrinsic.
-  virtual void onLoadRelativeIntrinsic() {}
-
-  /// Called to account for a lowered call.
-  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
-  }
-
-  /// Account for a jump table of given size. Return false to stop further
-  /// processing the switch instruction
-  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
-
-  /// Account for a case cluster of given size. Return false to stop further
-  /// processing of the instruction.
-  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
-
-  /// Called at the end of processing a switch instruction, with the given
-  /// number of case clusters.
-  virtual void onFinalizeSwitch(unsigned JumpTableSize,
-                                unsigned NumCaseCluster) {}
-
-  /// Called to account for any other instruction not specifically accounted
-  /// for.
-  virtual void onCommonInstructionSimplification() {}
+  /// Tunable parameters that control the analysis.
+  const InlineParams &Params;
 
-  /// Start accounting potential benefits due to SROA for the given alloca.
-  virtual void onInitializeSROAArg(AllocaInst *Arg) {}
+  /// Upper bound for the inlining cost. Bonuses are being applied to account
+  /// for speculative "expected profit" of the inlining decision.
+  int Threshold;
 
-  /// Account SROA savings for the AllocaInst value.
-  virtual void onAggregateSROAUse(AllocaInst *V) {}
+  /// Inlining cost measured in abstract units, accounts for all the
+  /// instructions expected to be executed for a given function invocation.
+  /// Instructions that are statically proven to be dead based on call-site
+  /// arguments are not counted here.
+  int Cost = 0;
 
-  bool handleSROA(Value *V, bool DoNotDisable) {
-    // Check for SROA candidates in comparisons.
-    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
-      if (DoNotDisable) {
-        onAggregateSROAUse(SROAArg);
-        return true;
-      }
-      disableSROA(SROAArg);
-    }
-    return false;
-  }
+  bool ComputeFullInlineCost;
 
   bool IsCallerRecursive = false;
   bool IsRecursiveCall = false;
@@ -216,11 +149,20 @@
   bool HasUninlineableIntrinsic = false;
   bool InitsVargArgs = false;
 
+  /// Attempt to evaluate indirect calls to boost its inline cost.
+  bool BoostIndirectCalls;
+
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize = 0;
   unsigned NumInstructions = 0;
   unsigned NumVectorInstructions = 0;
 
+  /// Bonus to be applied when percentage of vector instructions in callee is
+  /// high (see more details in updateThreshold).
+  int VectorBonus = 0;
+  /// Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus = 0;
+
   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
   /// idea is to propagate any special information we have about arguments to
@@ -232,10 +174,12 @@
 
   /// Keep track of the values which map back (through function arguments) to
   /// allocas on the caller stack which could be simplified through SROA.
-  /// We can disable an association (because for some reason the SROA oportunity
-  /// is lost) by setting the value to nullptr. We don't delete because we still
-  /// want isAllocaDerivedArg to function correctly.
-  DenseMap<Value *, AllocaInst *> SROAArgValues;
+  DenseMap<Value *, Value *> SROAArgValues;
+
+  /// The mapping of caller Alloca values to their accumulated cost savings. If
+  /// we have to disable SROA for one of the allocas, this tells us how much
+  /// cost must be added.
+  DenseMap<Value *, int> SROAArgCosts;
 
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
@@ -252,18 +196,17 @@
   /// loads.
   bool EnableLoadElimination;
   SmallPtrSet<Value *, 8> LoadAddrSet;
-
-  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
-    auto It = SROAArgValues.find(V);
-    if (It == SROAArgValues.end())
-      return nullptr;
-    return It->second;
-  }
+  int LoadEliminationCost = 0;
 
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
+  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
+                            DenseMap<Value *, int>::iterator &CostIt);
+  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
   void disableSROA(Value *V);
   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
+  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+                          int InstructionCost);
   void disableLoadElimination();
   bool isGEPFree(GetElementPtrInst &GEP);
   bool canFoldInboundsGEP(GetElementPtrInst &I);
@@ -284,13 +227,32 @@
   /// inlined through this particular callsite.
   bool isKnownNonNullInCallee(Value *V);
 
+  /// Update Threshold based on callsite properties such as callee
+  /// attributes and callee hotness for PGO builds. The Callee is explicitly
+  /// passed to support analyzing indirect calls whose target is inferred by
+  /// analysis.
+  void updateThreshold(CallBase &Call, Function &Callee);
+
   /// Return true if size growth is allowed when inlining the callee at \p Call.
   bool allowSizeGrowth(CallBase &Call);
 
+  /// Return true if \p Call is a cold callsite.
+  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
+
+  /// Return a higher threshold if \p Call is a hot callsite.
+  Optional<int> getHotCallSiteThreshold(CallBase &Call,
+                                        BlockFrequencyInfo *CallerBFI);
+
   // Custom analysis routines.
   InlineResult analyzeBlock(BasicBlock *BB,
                             SmallPtrSetImpl<const Value *> &EphValues);
 
+  /// Handle a capped 'int' increment for Cost.
+  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
+    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
+    Cost = (int)std::min(UpperBound, Cost + Inc);
+  }
+
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
   void visit(Module *);
@@ -336,13 +298,20 @@ public:
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
-               Function &Callee, CallBase &Call)
+               Function &Callee, CallBase &Call, const InlineParams &Params,
+               bool BoostIndirect = true)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
-        CandidateCall(Call), EnableLoadElimination(true) {}
+        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
+        ComputeFullInlineCost(OptComputeFullInlineCost ||
+                              Params.ComputeFullInlineCost || ORE),
+        BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {}
 
   InlineResult analyze();
 
+  int getThreshold() { return Threshold; }
+  int getCost() { return Cost; }
+
   // Keep a bunch of stats about the cost savings found so we can print them
   // out when debugging.
   unsigned NumConstantArgs = 0;
@@ -351,283 +320,12 @@ public:
   unsigned NumConstantPtrCmps = 0;
   unsigned NumConstantPtrDiffs = 0;
   unsigned NumInstructionsSimplified = 0;
-
-  void dump();
-};
-
-/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
-/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
-class InlineCostCallAnalyzer final : public CallAnalyzer {
-  const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
-  const bool ComputeFullInlineCost;
-  int LoadEliminationCost = 0;
-  /// Bonus to be applied when percentage of vector instructions in callee is
-  /// high (see more details in updateThreshold).
-  int VectorBonus = 0;
-  /// Bonus to be applied when the callee has only one reachable basic block.
-  int SingleBBBonus = 0;
-
-  /// Tunable parameters that control the analysis.
-  const InlineParams &Params;
-
-  /// Upper bound for the inlining cost. Bonuses are being applied to account
-  /// for speculative "expected profit" of the inlining decision.
-  int Threshold = 0;
-
-  /// Attempt to evaluate indirect calls to boost its inline cost.
-  const bool BoostIndirectCalls;
-
-  /// Inlining cost measured in abstract units, accounts for all the
-  /// instructions expected to be executed for a given function invocation.
-  /// Instructions that are statically proven to be dead based on call-site
-  /// arguments are not counted here.
-  int Cost = 0;
-
-  bool SingleBB = true;
-
   unsigned SROACostSavings = 0;
   unsigned SROACostSavingsLost = 0;
 
-  /// The mapping of caller Alloca values to their accumulated cost savings. If
-  /// we have to disable SROA for one of the allocas, this tells us how much
-  /// cost must be added.
-  DenseMap<AllocaInst *, int> SROAArgCosts;
-
-  /// Return true if \p Call is a cold callsite.
-  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
-
-  /// Update Threshold based on callsite properties such as callee
-  /// attributes and callee hotness for PGO builds. The Callee is explicitly
-  /// passed to support analyzing indirect calls whose target is inferred by
-  /// analysis.
-  void updateThreshold(CallBase &Call, Function &Callee);
-  /// Return a higher threshold if \p Call is a hot callsite.
-  Optional<int> getHotCallSiteThreshold(CallBase &Call,
-                                        BlockFrequencyInfo *CallerBFI);
-
-  /// Handle a capped 'int' increment for Cost.
-  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
-    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
-    Cost = (int)std::min(UpperBound, Cost + Inc);
-  }
-
-  void onDisableSROA(AllocaInst *Arg) override {
-    auto CostIt = SROAArgCosts.find(Arg);
-    if (CostIt == SROAArgCosts.end())
-      return;
-    addCost(CostIt->second);
-    SROACostSavings -= CostIt->second;
-    SROACostSavingsLost += CostIt->second;
-    SROAArgCosts.erase(CostIt);
-  }
-
-  void onDisableLoadElimination() override {
-    addCost(LoadEliminationCost);
-    LoadEliminationCost = 0;
-  }
-  void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
-  void onCallArgumentSetup(const CallBase &Call) override {
-    // Pay the price of the argument setup. We account for the average 1
-    // instruction per call argument setup here.
-    addCost(Call.arg_size() * InlineConstants::InstrCost);
-  }
-  void onLoadRelativeIntrinsic() override {
-    // This is normally lowered to 4 LLVM instructions.
-    addCost(3 * InlineConstants::InstrCost);
-  }
-  void onLoweredCall(Function *F, CallBase &Call,
-                     bool IsIndirectCall) override {
-    // We account for the average 1 instruction per call argument setup here.
-    addCost(Call.arg_size() * InlineConstants::InstrCost);
-
-    // If we have a constant that we are calling as a function, we can peer
-    // through it and see the function target. This happens not infrequently
-    // during devirtualization and so we want to give it a hefty bonus for
-    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
-    // Pretend to inline the function, with a custom threshold.
-    if (IsIndirectCall && BoostIndirectCalls) {
-      auto IndirectCallParams = Params;
-      IndirectCallParams.DefaultThreshold =
-          InlineConstants::IndirectCallThreshold;
-      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
-      /// to instantiate the derived class.
-      InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
-                                Call, IndirectCallParams, false);
-      if (CA.analyze()) {
-        // We were able to inline the indirect call! Subtract the cost from the
-        // threshold to get the bonus we want to apply, but don't go below zero.
-        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
-      }
-    } else
-      // Otherwise simply add the cost for merely making the call.
-      addCost(InlineConstants::CallPenalty);
-  }
-
-  void onFinalizeSwitch(unsigned JumpTableSize,
-                        unsigned NumCaseCluster) override {
-    // If suitable for a jump table, consider the cost for the table size and
-    // branch to destination.
-    // Maximum valid cost increased in this function.
-    if (JumpTableSize) {
-      int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
-                       4 * InlineConstants::InstrCost;
-
-      addCost(JTCost, (int64_t)CostUpperBound);
-      return;
-    }
-    // Considering forming a binary search, we should find the number of nodes
-    // which is same as the number of comparisons when lowered. For a given
-    // number of clusters, n, we can define a recursive function, f(n), to find
-    // the number of nodes in the tree. The recursion is :
-    // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
-    // and f(n) = n, when n <= 3.
-    // This will lead a binary tree where the leaf should be either f(2) or f(3)
-    // when n > 3. So, the number of comparisons from leaves should be n, while
-    // the number of non-leaf should be :
-    //   2^(log2(n) - 1) - 1
-    //   = 2^log2(n) * 2^-1 - 1
-    //   = n / 2 - 1.
-    // Considering comparisons from leaf and non-leaf nodes, we can estimate the
-    // number of comparisons in a simple closed form :
-    //   n + n / 2 - 1 = n * 3 / 2 - 1
-    if (NumCaseCluster <= 3) {
-      // Suppose a comparison includes one compare and one conditional branch.
-      addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
-      return;
-    }
-
-    int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
-    int64_t SwitchCost =
-        ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
-
-    addCost(SwitchCost, (int64_t)CostUpperBound);
-  }
-  void onCommonInstructionSimplification() override {
-    addCost(InlineConstants::InstrCost);
-  }
-
-  void onInitializeSROAArg(AllocaInst *Arg) override { SROAArgCosts[Arg] = 0; }
-
-  void onAggregateSROAUse(AllocaInst *SROAArg) override {
-    auto CostIt = SROAArgCosts.find(SROAArg);
-    CostIt->second += InlineConstants::InstrCost;
-    SROACostSavings += InlineConstants::InstrCost;
-  }
-
-  virtual void onBlockAnalyzed(const BasicBlock *BB) override {
-    auto *TI = BB->getTerminator();
-    // If we had any successors at this point, than post-inlining is likely to
-    // have them as well. Note that we assume any basic blocks which existed
-    // due to branches or switches which folded above will also fold after
-    // inlining.
-    if (SingleBB && TI->getNumSuccessors() > 1) {
-      // Take off the bonus we applied to the threshold.
-      Threshold -= SingleBBBonus;
-      SingleBB = false;
-    }
-  }
-  virtual InlineResult finalizeAnalysis() override {
-    // Loops generally act a lot like calls in that they act like barriers to
-    // movement, require a certain amount of setup, etc. So when optimising for
-    // size, we penalise any call sites that perform loops. We do this after all
-    // other costs here, so will likely only be dealing with relatively small
-    // functions (and hence DT and LI will hopefully be cheap).
-    auto *Caller = CandidateCall.getFunction();
-    if (Caller->hasMinSize()) {
-      DominatorTree DT(F);
-      LoopInfo LI(DT);
-      int NumLoops = 0;
-      for (Loop *L : LI) {
-        // Ignore loops that will not be executed
-        if (DeadBlocks.count(L->getHeader()))
-          continue;
-        NumLoops++;
-      }
-      addCost(NumLoops * InlineConstants::CallPenalty);
-    }
-
-    // We applied the maximum possible vector bonus at the beginning. Now,
-    // subtract the excess bonus, if any, from the Threshold before
-    // comparing against Cost.
-    if (NumVectorInstructions <= NumInstructions / 10)
-      Threshold -= VectorBonus;
-    else if (NumVectorInstructions <= NumInstructions / 2)
-      Threshold -= VectorBonus / 2;
-
-    return Cost < std::max(1, Threshold);
-  }
-  virtual bool shouldStop() override {
-    // Bail out the moment we cross the threshold. This means we'll under-count
-    // the cost, but only when undercounting doesn't matter.
-    return Cost >= Threshold && !ComputeFullInlineCost;
-  }
-
-  virtual void onLoadEliminationOpportunity() {
-    LoadEliminationCost += InlineConstants::InstrCost;
-  }
-
-  InlineResult onAnalysisStart() override {
-    // Perform some tweaks to the cost and threshold based on the direct
-    // callsite information.
-
-    // We want to more aggressively inline vector-dense kernels, so up the
-    // threshold, and we'll lower it if the % of vector instructions gets too
-    // low. Note that these bonuses are some what arbitrary and evolved over
-    // time by accident as much as because they are principled bonuses.
-    //
-    // FIXME: It would be nice to remove all such bonuses. At least it would be
-    // nice to base the bonus values on something more scientific.
-    assert(NumInstructions == 0);
-    assert(NumVectorInstructions == 0);
-
-    // Update the threshold based on callsite properties
-    updateThreshold(CandidateCall, F);
-
-    // While Threshold depends on commandline options that can take negative
-    // values, we want to enforce the invariant that the computed threshold and
-    // bonuses are non-negative.
-    assert(Threshold >= 0);
-    assert(SingleBBBonus >= 0);
-    assert(VectorBonus >= 0);
-
-    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
-    // this Threshold any time, and cost cannot decrease, we can stop processing
-    // the rest of the function body.
-    Threshold += (SingleBBBonus + VectorBonus);
-
-    // Give out bonuses for the callsite, as the instructions setting them up
-    // will be gone after inlining.
-    addCost(-getCallsiteCost(this->CandidateCall, DL));
-
-    // If this function uses the coldcc calling convention, prefer not to inline
-    // it.
-    if (F.getCallingConv() == CallingConv::Cold)
-      Cost += InlineConstants::ColdccPenalty;
-
-    // Check if we're done. This can happen due to bonuses and penalties.
-    if (Cost >= Threshold && !ComputeFullInlineCost)
-      return "high cost";
-
-    return true;
-  }
-
-public:
-  InlineCostCallAnalyzer(
-      const TargetTransformInfo &TTI,
-      std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
-      Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
-      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
-      CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
-      : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
-        ComputeFullInlineCost(OptComputeFullInlineCost ||
-                              Params.ComputeFullInlineCost || ORE),
-        Params(Params), Threshold(Params.DefaultThreshold),
-        BoostIndirectCalls(BoostIndirect) {}
 
   void dump();
-
-  int getThreshold() { return Threshold; }
-  int getCost() { return Cost; }
 };
+
 } // namespace
 
 /// Test whether the given value is an Alloca-derived function argument.
@@ -635,22 +333,55 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
   return SROAArgValues.count(V);
 }
 
+/// Lookup the SROA-candidate argument and cost iterator which V maps to.
+/// Returns false if V does not map to a SROA-candidate.
+bool CallAnalyzer::lookupSROAArgAndCost(
+    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
+  if (SROAArgValues.empty() || SROAArgCosts.empty())
+    return false;
+
+  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
+  if (ArgIt == SROAArgValues.end())
+    return false;
+
+  Arg = ArgIt->second;
+  CostIt = SROAArgCosts.find(Arg);
+  return CostIt != SROAArgCosts.end();
+}
+
+/// Disable SROA for the candidate marked by this cost iterator.
+///
+/// This marks the candidate as no longer viable for SROA, and adds the cost
+/// savings associated with it back into the inline cost measurement.
+void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
+  // If we're no longer able to perform SROA we need to undo its cost savings
+  // and prevent subsequent analysis.
+  addCost(CostIt->second);
+  SROACostSavings -= CostIt->second;
+  SROACostSavingsLost += CostIt->second;
+  SROAArgCosts.erase(CostIt);
+  disableLoadElimination();
+}
+
 /// If 'V' maps to a SROA candidate, disable SROA for it.
 void CallAnalyzer::disableSROA(Value *V) {
-  auto It = SROAArgValues.find(V);
-  if (It == SROAArgValues.end())
-    return;
-  auto *SROAArg = It->second;
-  if (!SROAArg)
-    return;
-  It->second = nullptr;
-  onDisableSROA(SROAArg);
-  disableLoadElimination();
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
+    disableSROA(CostIt);
+}
+
+/// Accumulate the given cost for a particular SROA candidate.
+void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+                                      int InstructionCost) {
+  CostIt->second += InstructionCost;
+  SROACostSavings += InstructionCost;
 }
 
 void CallAnalyzer::disableLoadElimination() {
   if (EnableLoadElimination) {
-    onDisableLoadElimination();
+    addCost(LoadEliminationCost);
+    LoadEliminationCost = 0;
     EnableLoadElimination = false;
   }
 }
@@ -822,7 +553,9 @@ bool CallAnalyzer::visitPHI(PHINode &I) {
   if (FirstBaseAndOffset.first) {
     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
 
-    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
+    Value *SROAArg;
+    DenseMap<Value *, int>::iterator CostIt;
+    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
       SROAArgValues[&I] = SROAArg;
   }
 
@@ -852,8 +585,10 @@ bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
 }
 
 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
+  Value *SROAArg;
   DenseMap<Value *, int>::iterator CostIt;
-  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
+  bool SROACandidate =
+      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
 
   // Lambda to check whether a GEP's indices are all constant.
   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
@@ -864,7 +599,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   };
 
   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
-    if (SROAArg)
+    if (SROACandidate)
       SROAArgValues[&I] = SROAArg;
 
     // Constant GEPs are modeled as free.
@@ -872,8 +607,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   }
 
   // Variable GEPs will require math and will disable SROA.
-  if (SROAArg)
-    disableSROA(SROAArg);
+  if (SROACandidate)
+    disableSROA(CostIt);
   return isGEPFree(I);
 }
 
@@ -913,7 +648,9 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
     ConstantOffsetPtrs[&I] = BaseAndOffset;
 
   // Also look for SROA candidates here.
-  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
   // Bitcasts are always zero cost.
@@ -945,7 +682,9 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   // and so we can just add the integer in here. The only places where SROA is
   // preserved either cannot fire on an integer, or won't in-and-of themselves
   // disable SROA (ext) w/o some later use that we would see and disable.
-  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;
 
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -969,7 +708,9 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   }
 
   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
-  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
@@ -996,7 +737,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
   case Instruction::FPToUI:
   case Instruction::FPToSI:
     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
-      onCallPenalty();
+      addCost(InlineConstants::CallPenalty);
     break;
   default:
     break;
@@ -1069,8 +810,8 @@ bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
   return true;
 }
 
-bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
-                                            BlockFrequencyInfo *CallerBFI) {
+bool CallAnalyzer::isColdCallSite(CallBase &Call,
+                                  BlockFrequencyInfo *CallerBFI) {
   // If global profile summary is available, then callsite's coldness is
   // determined based on that.
   if (PSI && PSI->hasProfileSummary())
@@ -1093,8 +834,8 @@ bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
 }
 
 Optional<int>
-InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
-                                                BlockFrequencyInfo *CallerBFI) {
+CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
+                                      BlockFrequencyInfo *CallerBFI) {
 
   // If global profile summary is available, then callsite's hotness is
   // determined based on that.
@@ -1121,7 +862,7 @@ InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
   return None;
 }
 
-void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
+void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
   // If no size growth is allowed for this inlining, set Threshold to 0.
   if (!allowSizeGrowth(Call)) {
     Threshold = 0;
@@ -1283,7 +1024,19 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) {
                                       : ConstantInt::getFalse(I.getType());
       return true;
     }
-  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
+  // Finally check for SROA candidates in comparisons.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (isa<ConstantPointerNull>(I.getOperand(1))) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
+    }
+
+    disableSROA(CostIt);
+  }
+
+  return false;
 }
 
 bool CallAnalyzer::visitSub(BinaryOperator &I) {
@@ -1347,7 +1100,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   if (I.getType()->isFloatingPointTy() &&
       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
       !match(&I, m_FNeg(m_Value())))
-    onCallPenalty();
+    addCost(InlineConstants::CallPenalty);
 
   return false;
 }
@@ -1374,15 +1127,23 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
 }
 
 bool CallAnalyzer::visitLoad(LoadInst &I) {
-  if (handleSROA(I.getPointerOperand(), I.isSimple()))
-    return true;
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
+    if (I.isSimple()) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
+    }
+
+    disableSROA(CostIt);
+  }
 
   // If the data is already loaded from this address and hasn't been clobbered
   // by any stores or calls, this load is likely to be redundant and can be
   // eliminated.
   if (EnableLoadElimination &&
       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
-    onLoadEliminationOpportunity();
+    LoadEliminationCost += InlineConstants::InstrCost;
     return true;
   }
 
@@ -1390,8 +1151,16 @@ bool CallAnalyzer::visitLoad(LoadInst &I) {
 }
 
 bool CallAnalyzer::visitStore(StoreInst &I) {
-  if (handleSROA(I.getPointerOperand(), I.isSimple()))
-    return true;
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
+    if (I.isSimple()) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
+    }
+
+    disableSROA(CostIt);
+  }
 
   // The store can potentially clobber loads and prevent repeated loads from
   // being eliminated.
@@ -1481,7 +1250,9 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
     // in this inline context. If not, we've done all we can.
     F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
     if (!F) {
-      onCallArgumentSetup(Call);
+      // Pay the price of the argument setup. We account for the average 1
+      // instruction per call argument setup here.
+      addCost(Call.arg_size() * InlineConstants::InstrCost);
 
       if (!Call.onlyReadsMemory())
         disableLoadElimination();
@@ -1505,7 +1276,8 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
       return Base::visitCallBase(Call);
 
     case Intrinsic::load_relative:
-      onLoadRelativeIntrinsic();
+      // This is normally lowered to 4 LLVM instructions.
+      addCost(3 * InlineConstants::InstrCost);
      return false;
 
     case Intrinsic::memset:
@@ -1532,7 +1304,28 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) {
   }
 
   if (TTI.isLoweredToCall(F)) {
-    onLoweredCall(F, Call, IsIndirectCall);
+    // We account for the average 1 instruction per call argument setup here.
+    addCost(Call.arg_size() * InlineConstants::InstrCost);
+
+    // If we have a constant that we are calling as a function, we can peer
+    // through it and see the function target. This happens not infrequently
+    // during devirtualization and so we want to give it a hefty bonus for
+    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
+    // Pretend to inline the function, with a custom threshold.
+    if (IsIndirectCall && BoostIndirectCalls) {
+      auto IndirectCallParams = Params;
+      IndirectCallParams.DefaultThreshold =
+          InlineConstants::IndirectCallThreshold;
+      CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
+                      IndirectCallParams, false);
+      if (CA.analyze()) {
+        // We were able to inline the indirect call! Subtract the cost from the
+        // threshold to get the bonus we want to apply, but don't go below zero.
+        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+      }
+    } else
+      // Otherwise simply add the cost for merely making the call.
+      addCost(InlineConstants::CallPenalty);
   }
 
   if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
@@ -1588,7 +1381,9 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
   if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
     ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
 
-    if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
+    Value *SROAArg;
+    DenseMap<Value *, int>::iterator CostIt;
+    if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
 
     return true;
   }
@@ -1627,7 +1422,9 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
 
   if (BaseAndOffset.first) {
     ConstantOffsetPtrs[&SI] = BaseAndOffset;
 
-    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
+    Value *SROAArg;
+    DenseMap<Value *, int>::iterator CostIt;
+    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
       SROAArgValues[&SI] = SROAArg;
   }
 
@@ -1655,12 +1452,49 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   // inlining those. It will prevent inlining in cases where the optimization
   // does not (yet) fire.
 
+  // Maximum valid cost increased in this function.
+  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+
   unsigned JumpTableSize = 0;
   BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
   unsigned NumCaseCluster =
       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
 
-  onFinalizeSwitch(JumpTableSize, NumCaseCluster);
+  // If suitable for a jump table, consider the cost for the table size and
+  // branch to destination.
+  if (JumpTableSize) {
+    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+                     4 * InlineConstants::InstrCost;
+
+    addCost(JTCost, (int64_t)CostUpperBound);
+    return false;
+  }
+
+  // Considering forming a binary search, we should find the number of nodes
+  // which is same as the number of comparisons when lowered. For a given
+  // number of clusters, n, we can define a recursive function, f(n), to find
+  // the number of nodes in the tree. The recursion is :
+  // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
+  // and f(n) = n, when n <= 3.
+  // This will lead a binary tree where the leaf should be either f(2) or f(3)
+  // when n > 3. So, the number of comparisons from leaves should be n, while
+  // the number of non-leaf should be :
+  //   2^(log2(n) - 1) - 1
+  //   = 2^log2(n) * 2^-1 - 1
+  //   = n / 2 - 1.
+  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+  // number of comparisons in a simple closed form :
+  //   n + n / 2 - 1 = n * 3 / 2 - 1
+  if (NumCaseCluster <= 3) {
+    // Suppose a comparison includes one compare and one conditional branch.
+    addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
+    return false;
+  }
+
+  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+  int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+  addCost(SwitchCost, (int64_t)CostUpperBound);
   return false;
 }
 
@@ -1753,7 +1587,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
     if (Base::visit(&*I))
       ++NumInstructionsSimplified;
     else
-      onCommonInstructionSimplification();
+      addCost(InlineConstants::InstrCost);
 
     using namespace ore;
     // If the visit this instruction detected an uninlinable pattern, abort.
@@ -1798,7 +1632,9 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
       return IR;
     }
 
-    if (shouldStop())
+    // Check if we've passed the maximum possible threshold so we don't spin in
+    // huge basic blocks that will never inline.
+    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
   }
 
@@ -1892,9 +1728,46 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
 InlineResult CallAnalyzer::analyze() {
   ++NumCallsAnalyzed;
 
-  auto Result = onAnalysisStart();
-  if (!Result)
-    return Result;
+  // Perform some tweaks to the cost and threshold based on the direct
+  // callsite information.
+
+  // We want to more aggressively inline vector-dense kernels, so up the
+  // threshold, and we'll lower it if the % of vector instructions gets too
+  // low. Note that these bonuses are some what arbitrary and evolved over time
+  // by accident as much as because they are principled bonuses.
+  //
+  // FIXME: It would be nice to remove all such bonuses. At least it would be
+  // nice to base the bonus values on something more scientific.
+  assert(NumInstructions == 0);
+  assert(NumVectorInstructions == 0);
+
+  // Update the threshold based on callsite properties
+  updateThreshold(CandidateCall, F);
+
+  // While Threshold depends on commandline options that can take negative
+  // values, we want to enforce the invariant that the computed threshold and
+  // bonuses are non-negative.
+  assert(Threshold >= 0);
+  assert(SingleBBBonus >= 0);
+  assert(VectorBonus >= 0);
+
+  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
+  // this Threshold any time, and cost cannot decrease, we can stop processing
+  // the rest of the function body.
+  Threshold += (SingleBBBonus + VectorBonus);
+
+  // Give out bonuses for the callsite, as the instructions setting them up
+  // will be gone after inlining.
+  addCost(-getCallsiteCost(CandidateCall, DL));
+
+  // If this function uses the coldcc calling convention, prefer not to inline
+  // it.
+  if (F.getCallingConv() == CallingConv::Cold)
+    Cost += InlineConstants::ColdccPenalty;
+
+  // Check if we're done. This can happen due to bonuses and penalties.
+  if (Cost >= Threshold && !ComputeFullInlineCost)
+    return "high cost";
 
   if (F.empty())
     return true;
@@ -1923,9 +1796,9 @@
         ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
 
         // We can SROA any pointer arguments derived from alloca instructions.
-        if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
-          SROAArgValues[&*FAI] = SROAArg;
-          onInitializeSROAArg(SROAArg);
+        if (isa<AllocaInst>(PtrArg)) {
+          SROAArgValues[&*FAI] = PtrArg;
+          SROAArgCosts[PtrArg] = 0;
        }
      }
    }
@@ -1951,10 +1824,12 @@
       BBSetVector;
   BBSetVector BBWorklist;
   BBWorklist.insert(&F.getEntryBlock());
-
+  bool SingleBB = true;
   // Note that we *must not* cache the size, this loop grows the worklist.
   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
-    if (shouldStop())
+    // Bail out the moment we cross the threshold. This means we'll under-count
+    // the cost, but only when undercounting doesn't matter.
+    if (Cost >= Threshold && !ComputeFullInlineCost)
       break;
 
     BasicBlock *BB = BBWorklist[Idx];
@@ -2014,7 +1889,15 @@
          ++TIdx)
       BBWorklist.insert(TI->getSuccessor(TIdx));
 
-    onBlockAnalyzed(BB);
+    // If we had any successors at this point, than post-inlining is likely to
+    // have them as well. Note that we assume any basic blocks which existed
+    // due to branches or switches which folded above will also fold after
+    // inlining.
+    if (SingleBB && TI->getNumSuccessors() > 1) {
+      // Take off the bonus we applied to the threshold.
+      Threshold -= SingleBBBonus;
+      SingleBB = false;
+    }
   }
 
   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
@@ -2025,12 +1908,38 @@ InlineResult CallAnalyzer::analyze() {
   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
     return "noduplicate";
 
-  return finalizeAnalysis();
+  // Loops generally act a lot like calls in that they act like barriers to
+  // movement, require a certain amount of setup, etc. So when optimising for
+  // size, we penalise any call sites that perform loops. We do this after all
+  // other costs here, so will likely only be dealing with relatively small
+  // functions (and hence DT and LI will hopefully be cheap).
+  if (Caller->hasMinSize()) {
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+    int NumLoops = 0;
+    for (Loop *L : LI) {
+      // Ignore loops that will not be executed
+      if (DeadBlocks.count(L->getHeader()))
+        continue;
+      NumLoops++;
+    }
+    addCost(NumLoops * InlineConstants::CallPenalty);
+  }
+
+  // We applied the maximum possible vector bonus at the beginning. Now,
+  // subtract the excess bonus, if any, from the Threshold before
+  // comparing against Cost.
+  if (NumVectorInstructions <= NumInstructions / 10)
+    Threshold -= VectorBonus;
+  else if (NumVectorInstructions <= NumInstructions / 2)
+    Threshold -= VectorBonus / 2;
+
+  return Cost < std::max(1, Threshold);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 /// Dump stats about this call's analysis.
-LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
+LLVM_DUMP_METHOD void CallAnalyzer::dump() {
 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
   DEBUG_PRINT_STAT(NumConstantArgs);
   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
@@ -2164,8 +2073,8 @@ InlineCost llvm::getInlineCost(
   LLVM_DEBUG(llvm::dbgs() << "    Analyzing call of " << Callee->getName()
                          << "... (caller:" << Caller->getName() << ")\n");
 
-  InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
-                            *Callee, Call, Params);
+  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee,
+                  Call, Params);
   InlineResult ShouldInline = CA.analyze();
 
   LLVM_DEBUG(CA.dump());
@@ -2212,16 +2121,15 @@ InlineResult llvm::isInlineViable(Function &F) {
         switch (Call->getCalledFunction()->getIntrinsicID()) {
         default:
          break;
-          // Disallow inlining of @llvm.icall.branch.funnel because current
-          // backend can't separate call targets from call arguments.
+        // Disallow inlining of @llvm.icall.branch.funnel because current
+        // backend can't separate call targets from call arguments.
        case llvm::Intrinsic::icall_branch_funnel:
          return "disallowed inlining of @llvm.icall.branch.funnel";
-          // Disallow inlining functions that call @llvm.localescape. Doing this
-          // correctly would require major changes to the inliner.
+        // Disallow inlining functions that call @llvm.localescape. Doing this
+        // correctly would require major changes to the inliner.
        case llvm::Intrinsic::localescape:
          return "disallowed inlining of @llvm.localescape";
-          // Disallow inlining of functions that initialize VarArgs with
-          // va_start.
+        // Disallow inlining of functions that initialize VarArgs with va_start.
        case llvm::Intrinsic::vastart:
          return "contains VarArgs initialized with va_start";
        }
-- 
2.7.4
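
For readers unfamiliar with the change being backed out: commit 76aab66 split the
inline cost analysis into a traversal engine (CallAnalyzer) that fires virtual
extension points, and a derived InlineCostCallAnalyzer implementing the cost model
behind those hooks. Below is a minimal, self-contained sketch of that shape
(illustrative names only, not LLVM code; InstrCost is stubbed as a constant):

    #include <algorithm>
    #include <climits>
    #include <cstdint>

    // Traversal side: walks "instructions" and fires extension points.
    // All names here are hypothetical, chosen to mirror the reverted design.
    class Analyzer {
    protected:
      virtual void onCommonInstruction() {}       // accounting hook
      virtual bool shouldStop() { return false; } // bail-out policy hook

    public:
      virtual ~Analyzer() = default;

      bool analyze(int NumInstructions) {
        for (int I = 0; I < NumInstructions; ++I) {
          onCommonInstruction();
          if (shouldStop()) // e.g. cost already exceeds threshold
            return false;
        }
        return true;
      }
    };

    // Cost-model side: implements the hooks, including a capped increment in
    // the spirit of CallAnalyzer::addCost() in the code above.
    class CostAnalyzer final : public Analyzer {
      int Cost = 0;
      int Threshold;

      void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
        Cost = (int)std::min(UpperBound, Cost + Inc);
      }
      void onCommonInstruction() override { addCost(5 /* stubbed InstrCost */); }
      bool shouldStop() override { return Cost >= Threshold; }

    public:
      explicit CostAnalyzer(int T) : Threshold(T) {}
      int getCost() const { return Cost; }
    };

This revert folds both halves back into the single CallAnalyzer class, whose
traversal calls addCost() and compares Cost to Threshold directly, as in the
restored hunks above.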