From 9e778e6c730a439229079811674dec6a37792412 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 15 May 2019 10:05:49 +0000 Subject: [PATCH] [LV] Move getScalarizationOverhead and vector call cost computations to CM. (NFC) This reduces the number of parameters we need to pass in and they seem a natural fit in LoopVectorizationCostModel. Also simplifies things for D59995. As a follow up refactoring, we could only expose a expose a shouldUseVectorIntrinsic() helper in LoopVectorizationCostModel, instead of calling getVectorCallCost/getVectorIntrinsicCost in InnerLoopVectorizer/VPRecipeBuilder. Reviewers: Ayal, hsaito, dcaballe, rengolin Reviewed By: rengolin Differential Revision: https://reviews.llvm.org/D61638 llvm-svn: 360758 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 116 ++++++++++++------------ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 7 +- 2 files changed, 61 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 83f1c70..ce03b85 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1169,6 +1169,18 @@ public: return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized + /// with factor VF. Return the cost of the instruction, including + /// scalarization overhead if it's needed. + unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + + /// Estimate cost of a call instruction CI if it were vectorized with factor + /// VF. Return the cost of the instruction, including scalarization overhead + /// if it's needed. The flag NeedToScalarize shows if the call needs to be + /// scalarized - + // i.e. either vector version isn't available, or is too expensive. + unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize); + private: unsigned NumPredStores = 0; @@ -1221,6 +1233,10 @@ private: /// element) unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + /// Estimate the overhead of scalarizing an instruction. This is a + /// convenience wrapper for the type-based getScalarizationOverhead API. + unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); @@ -3057,45 +3073,9 @@ static void cse(BasicBlock *BB) { } } -/// Estimate the overhead of scalarizing an instruction. This is a -/// convenience wrapper for the type-based getScalarizationOverhead API. -static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, - const TargetTransformInfo &TTI) { - if (VF == 1) - return 0; - - unsigned Cost = 0; - Type *RetTy = ToVectorTy(I->getType(), VF); - if (!RetTy->isVoidTy() && - (!isa(I) || - !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead(RetTy, true, false); - - // Some targets keep addresses scalar. - if (isa(I) && !TTI.prefersVectorizedAddressing()) - return Cost; - - if (CallInst *CI = dyn_cast(I)) { - SmallVector Operands(CI->arg_operands()); - Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); - } - else if (!isa(I) || - !TTI.supportsEfficientVectorElementLoadStore()) { - SmallVector Operands(I->operand_values()); - Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); - } - - return Cost; -} - -// Estimate cost of a call instruction CI if it were vectorized with factor VF. -// Return the cost of the instruction, including scalarization overhead if it's -// needed. The flag NeedToScalarize shows if the call needs to be scalarized - -// i.e. either vector version isn't available, or is too expensive. -static unsigned getVectorCallCost(CallInst *CI, unsigned VF, - const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, - bool &NeedToScalarize) { +unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, + unsigned VF, + bool &NeedToScalarize) { Function *F = CI->getCalledFunction(); StringRef FnName = CI->getCalledFunction()->getName(); Type *ScalarRetTy = CI->getType(); @@ -3118,7 +3098,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3137,12 +3117,8 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF, return Cost; } -// Estimate cost of an intrinsic call instruction CI if it were vectorized with -// factor VF. Return the cost of the instruction, including scalarization -// overhead if it's needed. -static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, - const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI) { +unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, + unsigned VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -4126,9 +4102,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); @@ -5522,7 +5498,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, TTI); + Cost += getScalarizationOverhead(I, VF); // If we have a predicated store, it may not be executed for each vector // lane. Scale the cost by the probability of executing the predicated @@ -5674,6 +5650,34 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { return VectorizationCostTy(C, TypeNotScalarized); } +unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, + unsigned VF) { + + if (VF == 1) + return 0; + + unsigned Cost = 0; + Type *RetTy = ToVectorTy(I->getType(), VF); + if (!RetTy->isVoidTy() && + (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) + Cost += TTI.getScalarizationOverhead(RetTy, true, false); + + // Some targets keep addresses scalar. + if (isa(I) && !TTI.prefersVectorizedAddressing()) + return Cost; + + if (CallInst *CI = dyn_cast(I)) { + SmallVector Operands(CI->arg_operands()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } else if (!isa(I) || + !TTI.supportsEfficientVectorElementLoadStore()) { + SmallVector Operands(I->operand_values()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } + + return Cost; +} + void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (VF == 1) return; @@ -5914,7 +5918,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // The cost of insertelement and extractelement instructions needed for // scalarization. - Cost += getScalarizationOverhead(I, VF, TTI); + Cost += getScalarizationOverhead(I, VF); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally @@ -6035,16 +6039,16 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::Call: { bool NeedToScalarize; CallInst *CI = cast(I); - unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); + unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize); if (getVectorIntrinsicIDForCall(CI, TLI)) - return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); + return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); return CallCost; } default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + - getScalarizationOverhead(I, VF, TTI); + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6638,9 +6642,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; return UseVectorIntrinsic || !NeedToScalarize; } if (isa(I) || isa(I)) { @@ -6828,7 +6832,7 @@ LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = llvm::make_unique(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder); + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index bc6b221..0ca6a6b 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -29,9 +29,6 @@ class VPRecipeBuilder { /// Target Library Info. const TargetLibraryInfo *TLI; - /// Target Transform Info. - const TargetTransformInfo *TTI; - /// The legality analysis. LoopVectorizationLegality *Legal; @@ -104,11 +101,9 @@ public: public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, VPBuilder &Builder) - : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - Builder(Builder) {} + : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {} /// Check if a recipe can be create for \p I withing the given VF \p Range. /// If a recipe can be created, it adds it to \p VPBB. -- 2.7.4