bool Insert, bool Extract) const;
/// Estimate the overhead of scalarizing an instruction's unique
- /// non-constant operands. The types of the arguments are ordinarily
- /// scalar, in which case the costs are multiplied with VF.
+ /// non-constant operands. The (potentially vector) types to use for each
+ /// argument are passed via Tys.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF) const;
+ ArrayRef<Type *> Tys) const;
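As an illustration of the new contract (a worked example, not text from this patch): Tys is expected to run parallel to Args and to carry the type each operand will have after vectorization. A call scalarized at VF 4 with operands {double %x, i32 %n}, for instance, would be costed with Tys = {<4 x double>, <4 x i32>}; metadata and other non-integer/FP/pointer operands are still skipped.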
/// If target has efficient vector element load/store instructions, it can
/// return true here so that insertion/extraction costs are not added to
bool Insert, bool Extract) = 0;
virtual unsigned
getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF) = 0;
+ ArrayRef<Type *> Tys) = 0;
virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
virtual MemCmpExpansionOptions
return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF) override {
- return Impl.getOperandsScalarizationOverhead(Args, VF);
+ ArrayRef<Type *> Tys) override {
+ return Impl.getOperandsScalarizationOverhead(Args, Tys);
}
bool supportsEfficientVectorElementLoadStore() override {
}
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF) const {
+ ArrayRef<Type *> Tys) const {
return 0;
}
return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
- /// Estimate the overhead of scalarizing an instruction's unique
- /// non-constant operands. The types of the arguments are ordinarily
- /// scalar, in which case the costs are multiplied with VF.
+ /// Estimate the overhead of scalarizing an instruction's unique
+ /// non-constant operands. The (potentially vector) types to use for each
+ /// argument are passed via Tys.
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF) {
+ ArrayRef<Type *> Tys) {
+ assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
+
unsigned Cost = 0;
SmallPtrSet<const Value*, 4> UniqueOperands;
- for (const Value *A : Args) {
+ for (int I = 0, E = Args.size(); I != E; I++) {
// Disregard things like metadata arguments.
- Type *Ty = A->getType();
+ const Value *A = Args[I];
+ Type *Ty = Tys[I];
if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
!Ty->isPtrOrPtrVectorTy())
continue;
if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
- auto *VecTy = dyn_cast<VectorType>(Ty);
- if (VecTy) {
- // If A is a vector operand, VF should be 1 or correspond to A.
- assert((VF == 1 ||
- VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
- "Vector argument does not match VF");
- }
- else
- VecTy = FixedVectorType::get(Ty, VF);
-
- Cost += getScalarizationOverhead(VecTy, false, true);
+ if (auto *VecTy = dyn_cast<VectorType>(Ty))
+ Cost += getScalarizationOverhead(VecTy, false, true);
}
}
return Cost;
}
- unsigned getScalarizationOverhead(VectorType *InTy,
- ArrayRef<const Value *> Args) {
- auto *Ty = cast<FixedVectorType>(InTy);
-
+ /// Estimate the overhead of scalarizing the inputs and outputs of an
+ /// instruction, with return type RetTy and arguments Args of types Tys. If
+ /// Args are unknown (empty), then the cost associated with one argument is
+ /// added as a heuristic.
+ unsigned getScalarizationOverhead(VectorType *RetTy,
+ ArrayRef<const Value *> Args,
+ ArrayRef<Type *> Tys) {
unsigned Cost = 0;
- Cost += getScalarizationOverhead(Ty, true, false);
+ Cost += getScalarizationOverhead(RetTy, true, false);
if (!Args.empty())
- Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
+ Cost += getOperandsScalarizationOverhead(Args, Tys);
else
// When no information on arguments is provided, we add the cost
// associated with one argument as a heuristic.
- Cost += getScalarizationOverhead(Ty, false, true);
+ Cost += getScalarizationOverhead(RetTy, false, true);
return Cost;
}
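A minimal caller-side sketch of how the two overloads are meant to be combined (illustrative only; the fixed VF of 4 and the element type ScalarTy are hypothetical, and this is not code taken from the patch):

  // Insert/extract overhead for scalarizing an operation whose result and
  // operands are all <4 x ScalarTy>.
  auto *VecTy = FixedVectorType::get(ScalarTy, 4);
  SmallVector<Type *> Tys(Args.size(), VecTy); // one type per operand
  unsigned Overhead = getScalarizationOverhead(VecTy, Args, Tys);

When Args is empty, the same call instead charges one argument's worth of extraction as the heuristic described above; the arithmetic-cost hooks changed below build Tys with exactly this pattern.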
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return getScalarizationOverhead(VTy, Args) + Num * Cost;
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
}
// We don't know anything about this scalar instruction.
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost +=
- getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
+ getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
}
IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
}
unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
- ArrayRef<const Value *> Args, unsigned VF) const {
- return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
+ ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
+ return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
}
bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return getScalarizationOverhead(VTy, Args) + Num * Cost;
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
}
// We don't know anything about this scalar instruction.
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
- ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
+ ScalarizationCost +=
+ getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
}
IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
CostKind);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
}
return BaseCost;
return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
-unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
- ArrayRef<const Value*> Args, unsigned VF) {
- return BaseT::getOperandsScalarizationOverhead(Args, VF);
+unsigned
+HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+ ArrayRef<Type *> Tys) {
+ return BaseT::getOperandsScalarizationOverhead(Args, Tys);
}
unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
bool Insert, bool Extract);
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF);
+ ArrayRef<Type *> Tys);
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys,
TTI::TargetCostKind CostKind);
unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (DivRemConstPow2)
return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
- if (DivRemConst)
- return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
+ if (DivRemConst) {
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
+ }
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
// division/remainder, which will get scalarized and handled with
// inserting and extracting the values.
unsigned ScalarCost =
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
- unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ unsigned Cost =
+ (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
- unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
+ SmallVector<Type *> Tys(Args.size(), Ty);
+ unsigned Cost =
+ (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
return Cost;
}
+static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+ if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+ return Elt;
+ return VectorType::get(Elt, VF);
+}
+
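For context (a reading of the code above, not text from the patch): MaybeVectorizeType, previously a lambda local to getVectorIntrinsicCost, is hoisted to a static helper so the operand-scalarization path further down can reuse it. It widens integer, pointer, and floating-point element types to <VF x EltTy> (i32 at VF 4 becomes <4 x i32>, for example) and returns every other type, or any type at a scalar VF, unchanged.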
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
- auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
- if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
- return Elt;
- return VectorType::get(Elt, VF);
- };
-
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
// Skip operands that do not require extraction/scalarization and do not incur
// any overhead.
+ SmallVector<Type *> Tys;
+ for (auto *V : filterExtractingOperands(Ops, VF))
+ Tys.push_back(MaybeVectorizeType(V->getType(), VF));
return Cost + TTI.getOperandsScalarizationOverhead(
- filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
+ filterExtractingOperands(Ops, VF), Tys);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks that we don't crash on certain matrix operations, rather
+; than checking the exact costs of the intrinsics per se.
+
+define void @matrix() {
+; CHECK-LABEL: 'matrix'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 452 for instruction: %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+ %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+ %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+ ret void
+}
+
+declare <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #2
+declare <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32>, <1 x i32>, i32 immarg, i32 immarg, i32 immarg) #3
}
define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
+;; FIXME: Should be calling sin_vec, once the cost of scalarizing is handled.
; CHECK-LABEL: @vec_intrinsic
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %[[LOAD]])
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
%0 = load double, double* %arrayidx, align 8
%1 = call fast double @llvm.sin.f64(double %0) #2
%add = fadd fast double %1, 1.000000e+00
+ store double %add, double* %arrayidx, align 8
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, %N
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1