/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
- VPTransformState &State);
+ VPTransformState &State,
+ Intrinsic::ID VectorIntrinsicID);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
return Cost->useOrderedReductions(RdxDesc);
}
-void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
- VPUser &ArgOperands,
- VPTransformState &State) {
+void InnerLoopVectorizer::widenCallInstruction(
+ CallInst &CI, VPValue *Def, VPUser &ArgOperands, VPTransformState &State,
+ Intrinsic::ID VectorIntrinsicID) {
assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
State.setDebugLocFromInst(&CI);
for (Value *ArgOperand : CI.args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
-
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
- InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost =
- ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
- bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
- assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
- "Either the intrinsic cost or vector call cost must be valid");
-
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Type *, 2> TysForDecl = {CI.getType()};
SmallVector<Value *, 4> Args;
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
- if (!UseVectorIntrinsic ||
- !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
+ if (!VectorIntrinsicID ||
+ !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
Arg = State.get(I.value(), Part);
else
Arg = State.get(I.value(), VPIteration(0, 0));
- if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
+ if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
TysForDecl.push_back(Arg->getType());
Args.push_back(Arg);
}
Function *VectorF;
- if (UseVectorIntrinsic) {
+ if (VectorIntrinsicID) {
// Use vector version of the intrinsic.
if (VF.isVector())
TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
Module *M = State.Builder.GetInsertBlock()->getModule();
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
// Use vector version of the function call.
ID == Intrinsic::experimental_noalias_scope_decl))
return nullptr;
- auto willWiden = [&](ElementCount VF) -> bool {
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
- InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
- bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- return UseVectorIntrinsic || !NeedToScalarize;
- };
+ ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool ShouldUseVectorIntrinsic =
+ ID && LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) -> bool {
+ bool NeedToScalarize = false;
+ // Is it beneficial to perform intrinsic call compared to lib
+ // call?
+ InstructionCost CallCost =
+ CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost =
+ CM.getVectorIntrinsicCost(CI, VF);
+ return IntrinsicCost <= CallCost;
+ },
+ Range);
+ if (ShouldUseVectorIntrinsic)
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
+
+ // Is it better to call a vectorized version of the function than to
+ // scalarize the call?
+ auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
+ [&](ElementCount VF) -> bool {
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we can use a usual Call for vectorized
+ // version of the instruction.
+ bool NeedToScalarize = false;
+ CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ return !NeedToScalarize;
+ },
+ Range);
+ if (ShouldUseVectorCall)
+ return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+ Intrinsic::not_intrinsic);
- ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
- return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
+ return nullptr;
}
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
VPlanTransforms::VPInstructionsToVPRecipes(
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
- DeadInstructions, *PSE.getSE());
+ DeadInstructions, *PSE.getSE(), *TLI);
// Remove the existing terminator of the exiting block of the top-most region.
// A BranchOnCount will be added instead when adding the canonical IV recipes.
void VPWidenCallRecipe::execute(VPTransformState &State) {
State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
- *this, State);
+ *this, State, VectorIntrinsicID);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
class VPReplicateRecipe;
class VPlanSlp;
+namespace Intrinsic {
+typedef unsigned ID;
+}
+
/// Returns a calculation for the total number of elements for a given \p VF.
/// For fixed width vectors this value is a constant, whereas for scalable
/// vectors it is an expression determined at runtime.
/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
+ /// ID of the vector intrinsic to call when widening the call. If set to
+ /// Intrinsic::not_intrinsic, a library call will be used instead.
+ Intrinsic::ID VectorIntrinsicID;
public:
template <typename IterT>
- VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+ VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
+ Intrinsic::ID VectorIntrinsicID)
: VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
- VPValue(VPValue::VPVWidenCallSC, &I, this) {}
+ VPValue(VPValue::VPVWidenCallSC, &I, this),
+ VectorIntrinsicID(VectorIntrinsicID) {}
~VPWidenCallRecipe() override = default;
O << "call @" << CI->getCalledFunction()->getName() << "(";
printOperands(O, SlotTracker);
O << ")";
+
+ if (VectorIntrinsicID)
+ O << " (using vector intrinsic)";
+ else
+ O << " (using library function)";
}
void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Intrinsics.h"
using namespace llvm;
Loop *OrigLoop, VPlanPtr &Plan,
function_ref<const InductionDescriptor *(PHINode *)>
GetIntOrFpInductionDescriptor,
- SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) {
+ SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE,
+ const TargetLibraryInfo &TLI) {
ReversePostOrderTraversal<VPBlockRecursiveTraversalWrapper<VPBlockBase *>>
RPOT(Plan->getEntry());
GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
} else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
NewRecipe =
- new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()));
+ new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()),
+ getVectorIntrinsicIDForCall(CI, &TLI));
} else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
bool InvariantCond =
SE.isLoopInvariant(SE.getSCEV(SI->getOperand(0)), OrigLoop);
class PHINode;
class ScalarEvolution;
class Loop;
+class TargetLibraryInfo;
struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
function_ref<const InductionDescriptor *(PHINode *)>
GetIntOrFpInductionDescriptor,
SmallPtrSetImpl<Instruction *> &DeadInstructions,
- ScalarEvolution &SE);
+ ScalarEvolution &SE, const TargetLibraryInfo &TLI);
static bool sinkScalarOperands(VPlan &Plan);
target triple = "arm64-apple-ios"
; CHECK-LABEL: LV: Checking a loop in 'test'
-; CHECK: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3>
; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src>
; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l>
-; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>)
+; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function)
; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3>
; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst>
; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2>
; CHECK-NEXT: middle.block:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
+
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION
+; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<0>, ir<1>
+; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3>
+; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src>
+; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l>
+; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using vector intrinsic)
+; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3>
+; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst>
+; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2>
+; CHECK-NEXT: EMIT branch-on-count vp<%10> vp<%1>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
;
define void @test(ptr noalias %src, ptr noalias %dst) {
; CHECK-LABEL: @test(
; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<0\>, ir\<1\>\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, vp\<[[STEPS]]\>\l" +
; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" +
-; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" +
+; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>) (using vector intrinsic)\l" +
; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, vp\<[[STEPS]]\>\l" +
; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" +
; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT:%.+]]\> = VF * UF +(nuw) vp\<[[CAN_IV]]\>\l" +
#include "../lib/Transforms/Vectorize/VPlan.h"
#include "../lib/Transforms/Vectorize/VPlanTransforms.h"
#include "VPlanTestBase.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "gtest/gtest.h"
#include <string>
)";
EXPECT_EQ(ExpectedStr, FullDump);
#endif
-
+ TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
+ TargetLibraryInfo TLI(TLII);
SmallPtrSet<Instruction *, 1> DeadInstructions;
VPlanTransforms::VPInstructionsToVPRecipes(
LI->getLoopFor(LoopHeader), Plan, [](PHINode *P) { return nullptr; },
- DeadInstructions, *SE);
+ DeadInstructions, *SE, TLI);
}
TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
auto Plan = buildHCFG(LoopHeader);
SmallPtrSet<Instruction *, 1> DeadInstructions;
+ TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
+ TargetLibraryInfo TLI(TLII);
VPlanTransforms::VPInstructionsToVPRecipes(
LI->getLoopFor(LoopHeader), Plan, [](PHINode *P) { return nullptr; },
- DeadInstructions, *SE);
+ DeadInstructions, *SE, TLI);
VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
EXPECT_NE(nullptr, Entry->getSingleSuccessor());
SmallVector<VPValue *, 2> Args;
Args.push_back(&Op1);
Args.push_back(&Op2);
- VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()));
+ VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()),
+ Intrinsic::not_intrinsic);
EXPECT_TRUE(isa<VPUser>(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa<VPUser>(BaseR));
SmallVector<VPValue *, 2> Args;
Args.push_back(&Op1);
Args.push_back(&Op2);
- VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()));
+ VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()),
+ Intrinsic::not_intrinsic);
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());