From 5fb3a57ea759fb60c064c084ec9e607fd63dcd75 Mon Sep 17 00:00:00 2001
From: ShihPo Hung
Date: Sat, 21 Jan 2023 05:29:05 -0800
Subject: [PATCH] [Cost] Add CostKind to getVectorInstrCost and its related
 users

LoopUnroll estimates loop size via getInstructionCost(), but
getInstructionCost() cannot pass a CostKind down to getVectorInstrCost().
The same problem exists between getShuffleCost() and
getBroadcastShuffleOverhead(), getPermuteShuffleOverhead(),
getExtractSubvectorOverhead(), and getInsertSubvectorOverhead().

To address this, this patch adds a CostKind argument to these functions.
(A usage sketch of the new signature follows the diff.)

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D142116
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h   |  42 +++--
 .../llvm/Analysis/TargetTransformInfoImpl.h        |  20 +-
 llvm/include/llvm/CodeGen/BasicTTIImpl.h           | 205 ++++++++++++---------
 llvm/lib/Analysis/TargetTransformInfo.cpp          |  29 +--
 llvm/lib/CodeGen/CodeGenPrepare.cpp                |   6 +-
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  |   9 +-
 .../Target/AArch64/AArch64TargetTransformInfo.h    |   6 +-
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp    |   6 +-
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h |   1 +
 llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp |   6 +-
 llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h   |   1 +
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp     |  29 ++-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h       |   5 +-
 .../Target/Hexagon/HexagonTargetTransformInfo.cpp  |  16 +-
 .../Target/Hexagon/HexagonTargetTransformInfo.h    |  14 +-
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp |   7 +-
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h   |   5 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp |   5 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h   |   5 +-
 .../Target/SystemZ/SystemZTargetTransformInfo.cpp  |  25 ++-
 .../Target/SystemZ/SystemZTargetTransformInfo.h    |   5 +-
 .../WebAssembly/WebAssemblyTargetTransformInfo.cpp |  12 +-
 .../WebAssembly/WebAssemblyTargetTransformInfo.h   |   5 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp     |  56 +++---
 llvm/lib/Target/X86/X86TargetTransformInfo.h       |   8 +-
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    |  54 +++---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    |  20 +-
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp    |  39 ++--
 .../X86/shuffle-extract_subvector-latency.ll       |   2 +-
 29 files changed, 383 insertions(+), 260 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a9cb871..0c81f0b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -752,13 +752,16 @@ public:
   /// extracted from vectors.
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) const;
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind) const;
 
   /// Estimate the overhead of scalarizing an instructions unique
   /// non-constant operands. The (potentially vector) types to use for each of
   /// argument are passes via Tys.
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys) const;
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind) const;
 
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
@@ -1193,6 +1196,7 @@ public:
   /// case is to provision the cost of vectorization/scalarization in
   /// vectorizer passes.
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index = -1, Value *Op0 = nullptr,
                                      Value *Op1 = nullptr) const;
 
@@ -1203,6 +1207,7 @@ public:
   /// A typical suitable use case is cost estimation when vector instruction
   /// exists (e.g., from basic blocks during transformation).
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index = -1) const;
 
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
@@ -1675,11 +1680,12 @@ public:
   virtual bool useColdCCForColdCall(Function &F) = 0;
   virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
                                                    const APInt &DemandedElts,
-                                                   bool Insert,
-                                                   bool Extract) = 0;
+                                                   bool Insert, bool Extract,
+                                                   TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys) = 0;
+                                   ArrayRef<Type *> Tys,
+                                   TargetCostKind CostKind) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool supportsTailCalls() = 0;
   virtual bool supportsTailCallFor(const CallBase *CB) = 0;
@@ -1787,9 +1793,11 @@ public:
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) = 0;
   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                             TTI::TargetCostKind CostKind,
                                              unsigned Index, Value *Op0,
                                              Value *Op1) = 0;
   virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                             TTI::TargetCostKind CostKind,
                                              unsigned Index) = 0;
   virtual InstructionCost
@@ -2150,13 +2158,16 @@ public:
 
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) override {
-    return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+                                           bool Insert, bool Extract,
+                                           TargetCostKind CostKind) override {
+    return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                         CostKind);
   }
   InstructionCost
   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys) override {
-    return Impl.getOperandsScalarizationOverhead(Args, Tys);
+                                   ArrayRef<Type *> Tys,
+                                   TargetCostKind CostKind) override {
+    return Impl.getOperandsScalarizationOverhead(Args, Tys, CostKind);
  }
 
   bool supportsEfficientVectorElementLoadStore() override {
@@ -2360,13 +2371,16 @@ public:
                                      const Instruction *I) override {
     return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
   }
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1) override {
-    return Impl.getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0,
+                                     Value *Op1) override {
+    return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
   }
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index) override {
-    return Impl.getVectorInstrCost(I, Val, Index);
+    return Impl.getVectorInstrCost(I, Val, CostKind, Index);
   }
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 0cd68df..21d1048 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -333,12 +333,15 @@ public:
 
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) const {
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind) const {
     return 0;
   }
 
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys) const {
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind) const {
     return 0;
   }
 
@@ -585,12 +588,15 @@ public:
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1) const {
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0,
+                                     Value *Op1) const {
     return 1;
   }
 
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index) const {
     return 1;
   }
 
@@ -1176,7 +1182,7 @@ public:
       if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
        if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
-      return TargetTTI->getVectorInstrCost(*IE, Ty, Idx);
+      return TargetTTI->getVectorInstrCost(*IE, Ty, CostKind, Idx);
     }
     case Instruction::ShuffleVector: {
       auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
@@ -1272,7 +1278,7 @@ public:
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
       Type *DstTy = U->getOperand(0)->getType();
-      return TargetTTI->getVectorInstrCost(*EEI, DstTy, Idx);
+      return TargetTTI->getVectorInstrCost(*EEI, DstTy, CostKind, Idx);
     }
     }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f27c689..77dd315 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -86,23 +86,25 @@ private:
   /// Estimate a cost of Broadcast as an extract and sequence of insert
   /// operations.
-  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
+  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
+                                              TTI::TargetCostKind CostKind) {
     InstructionCost Cost = 0;
     // Broadcast cost is equal to the cost of extracting the zero'th element
     // plus the cost of inserting it into every element of the result vector.
-    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0,
-                                        nullptr, nullptr);
+    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
+                                        CostKind, 0, nullptr, nullptr);
     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
-                                          nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
+                                          CostKind, i, nullptr, nullptr);
     }
     return Cost;
   }
 
   /// Estimate a cost of shuffle as a sequence of extract and insert
   /// operations.
-  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
+  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
+                                            TTI::TargetCostKind CostKind) {
     InstructionCost Cost = 0;
     // Shuffle cost is equal to the cost of extracting element from its argument
     // plus the cost of inserting them onto the result vector.
@@ -112,18 +114,20 @@ private:
     // vector and finally index 3 of second vector and insert them at index
     // <0,1,2,3> of result vector.
     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
-                                          nullptr, nullptr);
-      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i,
-                                          nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
+                                          CostKind, i, nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
+                                          CostKind, i, nullptr, nullptr);
     }
     return Cost;
   }
 
   /// Estimate a cost of subvector extraction as a sequence of extract and
   /// insert operations.
-  InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
-                                              FixedVectorType *SubVTy) {
+  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
+                                              TTI::TargetCostKind CostKind,
+                                              int Index,
+                                              FixedVectorType *SubVTy) {
     assert(VTy && SubVTy &&
            "Can only extract subvectors from vectors");
     int NumSubElts = SubVTy->getNumElements();
@@ -137,18 +141,21 @@ private:
     // the source type plus the cost of inserting them into the result vector
     // type.
     for (int i = 0; i != NumSubElts; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
-                                          i + Index, nullptr, nullptr);
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i,
-                                          nullptr, nullptr);
+      Cost +=
+          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
+                                      CostKind, i + Index, nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
+                                          CostKind, i, nullptr, nullptr);
     }
     return Cost;
   }
 
   /// Estimate a cost of subvector insertion as a sequence of extract and
   /// insert operations.
-  InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
-                                             FixedVectorType *SubVTy) {
+  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
+                                             TTI::TargetCostKind CostKind,
+                                             int Index,
+                                             FixedVectorType *SubVTy) {
     assert(VTy && SubVTy &&
            "Can only insert subvectors into vectors");
     int NumSubElts = SubVTy->getNumElements();
@@ -163,9 +170,10 @@ private:
     // type.
     for (int i = 0; i != NumSubElts; ++i) {
       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
-                                          i, nullptr, nullptr);
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
-                                          i + Index, nullptr, nullptr);
+                                          CostKind, i, nullptr, nullptr);
+      Cost +=
+          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
+                                      i + Index, nullptr, nullptr);
     }
     return Cost;
   }
@@ -216,7 +224,7 @@ private:
                   FixedVectorType::get(
                       PointerType::get(VT->getElementType(), 0),
                       VT->getNumElements()),
-                  -1, nullptr, nullptr)
+                  CostKind, -1, nullptr, nullptr)
             : 0;
     InstructionCost LoadCost =
         VT->getNumElements() *
         (getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
 
     // Next, compute the cost of packing the result in a vector.
-    InstructionCost PackingCost = getScalarizationOverhead(
-        VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
+    InstructionCost PackingCost =
+        getScalarizationOverhead(VT, Opcode != Instruction::Store,
+                                 Opcode == Instruction::Store, CostKind);
 
     InstructionCost ConditionalCost = 0;
     if (VariableMask) {
@@ -241,7 +250,7 @@ private:
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
-              -1, nullptr, nullptr) +
+              CostKind, -1, nullptr, nullptr) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
     }
@@ -710,7 +719,8 @@ public:
   /// extracted from vectors.
   InstructionCost getScalarizationOverhead(VectorType *InTy,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) {
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind) {
     /// FIXME: a bitfield is not a reasonable abstraction for talking about
     /// which elements are needed from a scalable vector
     if (isa<ScalableVectorType>(InTy))
@@ -726,11 +736,11 @@ public:
       if (!DemandedElts[i])
         continue;
       if (Insert)
-        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i,
-                                            nullptr, nullptr);
+        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
+                                            CostKind, i, nullptr, nullptr);
       if (Extract)
-        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i,
-                                            nullptr, nullptr);
+        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                            CostKind, i, nullptr, nullptr);
     }
 
     return Cost;
@@ -738,20 +748,24 @@ public:
 
   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
-                                           bool Extract) {
+                                           bool Extract,
+                                           TTI::TargetCostKind CostKind) {
     if (isa<ScalableVectorType>(InTy))
       return InstructionCost::getInvalid();
     auto *Ty = cast<FixedVectorType>(InTy);
 
     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
-    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                             CostKind);
   }
 
   /// Estimate the overhead of scalarizing an instructions unique
   /// non-constant operands. The (potentially vector) types to use for each of
   /// argument are passes via Tys.
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys) {
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind) {
     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
 
     InstructionCost Cost = 0;
@@ -766,7 +780,8 @@ public:
 
       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
         if (auto *VecTy = dyn_cast<VectorType>(Ty))
-          Cost += getScalarizationOverhead(VecTy, false, true);
+          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
+                                           /*Extract*/ true, CostKind);
       }
     }
 
@@ -779,14 +794,17 @@ public:
   /// added as a heuristic.
   InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                            ArrayRef<const Value *> Args,
-                                           ArrayRef<Type *> Tys) {
-    InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
+                                           ArrayRef<Type *> Tys,
+                                           TTI::TargetCostKind CostKind) {
+    InstructionCost Cost = getScalarizationOverhead(
+        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
     if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Tys);
+      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
     else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
-      Cost += getScalarizationOverhead(RetTy, false, true);
+      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
+                                       /*Extract*/ true, CostKind);
 
     return Cost;
   }
@@ -898,7 +916,7 @@ public:
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
       SmallVector<Type *> Tys(Args.size(), Ty);
-      return getScalarizationOverhead(VTy, Args, Tys) +
+      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
              VTy->getNumElements() * Cost;
     }
 
@@ -951,7 +969,7 @@ public:
     switch (improveShuffleKindFromMask(Kind, Mask)) {
     case TTI::SK_Broadcast:
       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
-        return getBroadcastShuffleOverhead(FVT);
+        return getBroadcastShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_Select:
     case TTI::SK_Splice:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
-        return getPermuteShuffleOverhead(FVT);
+        return getPermuteShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_ExtractSubvector:
-      return getExtractSubvectorOverhead(Tp, Index,
+      return getExtractSubvectorOverhead(Tp, CostKind, Index,
                                          cast<FixedVectorType>(SubTp));
     case TTI::SK_InsertSubvector:
-      return getInsertSubvectorOverhead(Tp, Index,
+      return getInsertSubvectorOverhead(Tp, CostKind, Index,
                                         cast<FixedVectorType>(SubTp));
     }
     llvm_unreachable("Unknown TTI::ShuffleKind");
@@ -1110,7 +1128,9 @@ public:
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
+      return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
+                                      CostKind) +
+             Num * Cost;
     }
 
     // We already handled vector-to-vector and scalar-to-scalar conversions.
@@ -1119,8 +1139,12 @@ public:
     // that the conversion is scalarized in one way or another.
     if (Opcode == Instruction::BitCast) {
       // Illegal bitcasts are done by storing and loading from a stack slot.
-      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
-             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
+      return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
+                                                /*Extract*/ true, CostKind)
+                     : 0) +
+             (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
+                                                /*Extract*/ false, CostKind)
+                     : 0);
     }
 
     llvm_unreachable("Unhandled cast");
@@ -1128,11 +1152,11 @@ public:
 
   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                            VectorType *VecTy, unsigned Index) {
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                       Index, nullptr, nullptr) +
+                                       CostKind, Index, nullptr, nullptr) +
            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
-                                     TTI::CastContextHint::None,
-                                     TTI::TCK_RecipThroughput);
+                                     TTI::CastContextHint::None, CostKind);
   }
 
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
@@ -1183,19 +1207,23 @@ public:
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
+      return getScalarizationOverhead(ValVTy, /*Insert*/ true,
+                                      /*Extract*/ false, CostKind) +
+             Num * Cost;
     }
 
     // Unknown scalar opcode.
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1) {
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1) {
     return getRegUsageForType(Val->getScalarType());
   }
 
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index) {
     Value *Op0 = nullptr;
     Value *Op1 = nullptr;
     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
       Op0 = IE->getOperand(0);
       Op1 = IE->getOperand(1);
     }
-    return thisT()->getVectorInstrCost(I.getOpcode(), Val, Index, Op0, Op1);
+    return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
+                                       Op1);
   }
 
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
@@ -1231,10 +1260,10 @@ public:
     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
                                               /*Insert*/ false,
-                                              /*Extract*/ true);
-    Cost +=
-        thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
-                                          /*Insert*/ true, /*Extract*/ false);
+                                              /*Extract*/ true, CostKind);
+    Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
+                                              /*Insert*/ true,
+                                              /*Extract*/ false, CostKind);
 
     return Cost;
   }
@@ -1275,9 +1304,9 @@ public:
       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
         // This is a vector load/store for some illegal type that is scalarized.
         // We must account for the cost of building or decomposing the vector.
-        Cost += getScalarizationOverhead(cast<FixedVectorType>(Src),
-                                         Opcode != Instruction::Store,
-                                         Opcode == Instruction::Store);
+        Cost += getScalarizationOverhead(
+            cast<FixedVectorType>(Src), Opcode != Instruction::Store,
+            Opcode == Instruction::Store, CostKind);
       }
     }
 
@@ -1389,13 +1418,13 @@ public:
       // %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
       // <8 x i32> vector and insert them into a <4 x i32> vector.
-      InstructionCost InsSubCost =
-          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
-                                            /*Insert*/ true, /*Extract*/ false);
+      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
+          SubVT, DemandedAllSubElts,
+          /*Insert*/ true, /*Extract*/ false, CostKind);
       Cost += Indices.size() * InsSubCost;
-      Cost +=
-          thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
-                                            /*Insert*/ false, /*Extract*/ true);
+      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+                                                /*Insert*/ false,
+                                                /*Extract*/ true, CostKind);
     } else {
       // The interleave cost is extract elements from sub vectors, and
       // insert them into the wide vector.
       // The cost is estimated as extract all elements (of actual members,
       // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
       // i32> vector.
-      InstructionCost ExtSubCost =
-          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
-                                            /*Insert*/ false, /*Extract*/ true);
+      InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
+          SubVT, DemandedAllSubElts,
+          /*Insert*/ false, /*Extract*/ true, CostKind);
       Cost += ExtSubCost * Indices.size();
       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                 /*Insert*/ true,
-                                                /*Extract*/ false);
+                                                /*Extract*/ false, CostKind);
     }
 
     if (!UseMaskForCond)
@@ -1649,10 +1678,11 @@ public:
     if (RetVF.isVector() && !RetVF.isScalable()) {
       ScalarizationCost = 0;
       if (!RetTy->isVoidTy())
-        ScalarizationCost +=
-            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+        ScalarizationCost += getScalarizationOverhead(
+            cast<VectorType>(RetTy),
+            /*Insert*/ true, /*Extract*/ false, CostKind);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
     }
 
     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
@@ -1704,7 +1734,8 @@ public:
     Type *ScalarRetTy = RetTy;
     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
       if (!SkipScalarizationCost)
-        ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
+        ScalarizationCost = getScalarizationOverhead(
+            RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
       ScalarCalls = std::max(ScalarCalls,
                              cast<FixedVectorType>(RetVTy)->getNumElements());
       ScalarRetTy = RetTy->getScalarType();
     }
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       Type *Ty = Tys[i];
       if (auto *VTy = dyn_cast<VectorType>(Ty)) {
         if (!SkipScalarizationCost)
-          ScalarizationCost += getScalarizationOverhead(VTy, false, true);
+          ScalarizationCost += getScalarizationOverhead(
+              VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
         ScalarCalls = std::max(ScalarCalls,
                                cast<FixedVectorType>(VTy)->getNumElements());
         Ty = Ty->getScalarType();
       }
@@ -2124,8 +2156,10 @@ public:
       return InstructionCost::getInvalid();
 
     InstructionCost ScalarizationCost =
-        SkipScalarizationCost ? ScalarizationCostPassed
-                              : getScalarizationOverhead(RetVTy, true, false);
+        SkipScalarizationCost
+            ? ScalarizationCostPassed
+            : getScalarizationOverhead(RetVTy, /*Insert*/ true,
+                                       /*Extract*/ false, CostKind);
 
     unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
     SmallVector<Type *, 4> ScalarTys;
@@ -2141,7 +2175,8 @@ public:
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
         if (!ICA.skipScalarizationCost())
-          ScalarizationCost += getScalarizationOverhead(VTy, false, true);
+          ScalarizationCost += getScalarizationOverhead(
+              VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
         ScalarCalls = std::max(ScalarCalls,
                                cast<FixedVectorType>(VTy)->getNumElements());
       }
@@ -2258,8 +2293,8 @@ public:
     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty,
                                                                   CostKind);
     return ShuffleCost + ArithCost +
-           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                       nullptr, nullptr);
+           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                       CostKind, 0, nullptr, nullptr);
   }
 
   /// Try to calculate the cost of performing strict (in-order) reductions,
@@ -2286,8 +2321,8 @@ public:
       return InstructionCost::getInvalid();
 
     auto *VTy = cast<FixedVectorType>(Ty);
-    InstructionCost ExtractCost =
-        getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
+    InstructionCost ExtractCost = getScalarizationOverhead(
+        VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
         Opcode, VTy->getElementType(), CostKind);
     ArithCost *= VTy->getNumElements();
@@ -2366,8 +2401,8 @@ public:
     // The last min/max should be in vector registers and we counted it above.
     // So just need a single extractelement.
     return ShuffleCost + MinMaxCost +
-           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                       nullptr, nullptr);
+           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                       CostKind, 0, nullptr, nullptr);
   }
 
   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,

diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index d03a8cf..ad7e543 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -513,16 +513,17 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
   return TTIImpl->useColdCCForColdCall(F);
 }
 
-InstructionCost
-TargetTransformInfo::getScalarizationOverhead(VectorType *Ty,
-                                              const APInt &DemandedElts,
-                                              bool Insert, bool Extract) const {
-  return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+InstructionCost TargetTransformInfo::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                           CostKind);
 }
 
 InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
+    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
+    TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys, CostKind);
 }
 
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
@@ -898,23 +899,25 @@ InstructionCost TargetTransformInfo::getCmpSelInstrCost(
 }
 
 InstructionCost TargetTransformInfo::getVectorInstrCost(
-    unsigned Opcode, Type *Val, unsigned Index, Value *Op0, Value *Op1) const {
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    Value *Op0, Value *Op1) const {
   // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
   // This is mentioned in the interface description and respected by all
   // callers, but never asserted upon.
   InstructionCost Cost =
-      TTIImpl->getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+      TTIImpl->getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
-InstructionCost TargetTransformInfo::getVectorInstrCost(const Instruction &I,
-                                                        Type *Val,
-                                                        unsigned Index) const {
+InstructionCost
+TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
+                                        TTI::TargetCostKind CostKind,
+                                        unsigned Index) const {
   // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
   // This is mentioned in the interface description and respected by all
   // callers, but never asserted upon.
-  InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, Index);
+  InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, CostKind, Index);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 87ba74b..fee11ef 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7383,11 +7383,11 @@ class VectorPromoteHelper {
     // The scalar chain of computation has to pay for the transition
     // scalar to vector.
     // The vector chain has to account for the combining cost.
-    InstructionCost ScalarCost =
-        TTI.getVectorInstrCost(*Transition, PromotedType, Index);
-    InstructionCost VectorCost = StoreExtractCombineCost;
     enum TargetTransformInfo::TargetCostKind CostKind =
         TargetTransformInfo::TCK_RecipThroughput;
+    InstructionCost ScalarCost =
+        TTI.getVectorInstrCost(*Transition, PromotedType, CostKind, Index);
+    InstructionCost VectorCost = StoreExtractCombineCost;
     for (const auto &Inst : InstsToBePromoted) {
       // Compute the cost.
       // By construction, all instructions being promoted are arithmetic ones.

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 916eefc..c6e9e05 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2131,14 +2131,14 @@ InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
 
   // Get the cost for the extract. We compute the cost (if any) for the extend
   // below.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                            Index, nullptr, nullptr);
+                                            CostKind, Index, nullptr, nullptr);
 
   // Legalize the types.
   auto VecLT = getTypeLegalizationCost(VecTy);
   auto DstVT = TLI->getValueType(DL, Dst);
   auto SrcVT = TLI->getValueType(DL, Src);
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   // If the resulting type is still a vector and the destination type is legal,
   // we may get the extension for free. If not, get the default cost for the
@@ -2225,13 +2228,16 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
   return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
-                                                   Type *Val, unsigned Index) {
+                                                   Type *Val,
+                                                   TTI::TargetCostKind CostKind,
+                                                   unsigned Index) {
   return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
 }

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 6eaff95..a22ba47 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -169,9 +169,11 @@ public:
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 00e6970..0c3324f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -790,6 +790,7 @@ GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
 }
 
 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   switch (Opcode) {
@@ -800,7 +801,8 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     if (EltSize < 32) {
       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
         return 0;
-      return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
+                                       Op1);
     }
 
     // Extracts are just reads of a subregister, so are free. Inserts are
@@ -811,7 +813,7 @@ InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     return Index == ~0u ?
               2 : 0;
   }
  default:
-    return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
 }

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 4a1137d..7862f21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -162,6 +162,7 @@ public:
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
 
   bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;

diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index c3dd321..c01f9c4 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -108,6 +108,7 @@ InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
 }
 
 InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                                TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
   switch (Opcode) {
@@ -116,7 +117,8 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     unsigned EltSize =
         DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
     if (EltSize < 32) {
-      return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
+                                       Op1);
     }
 
     // Extracts are just reads of a subregister, so are free. Inserts are
@@ -127,7 +129,7 @@ InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     return Index == ~0u ? 2 : 0;
   }
   default:
-    return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
   }
 }

diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index 9045cc7..8dacae0 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -62,6 +62,7 @@ public:
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
 };

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 7d26dde..048790a 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -874,6 +874,7 @@ InstructionCost ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
 }
 
 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   // Penalize inserting into an D-subregister. We end up with a three times
@@ -894,7 +895,8 @@ InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
 
     if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
       return std::max(
-          BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1), 2U);
+          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
+          2U);
   }
 
   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
@@ -907,7 +909,7 @@ InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
   }
 
-  return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
 }
 
 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
@@ -1021,12 +1023,14 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
       // One scalaization insert, one scalarization extract and the cost of the
       // fcmps.
-      return BaseT::getScalarizationOverhead(VecValTy, false, true) +
-             BaseT::getScalarizationOverhead(VecCondTy, true, false) +
+      return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
+                                             /*Extract*/ true, CostKind) +
+             BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
+                                             /*Extract*/ false, CostKind) +
             VecValTy->getNumElements() *
                 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
-                                   VecCondTy->getScalarType(), VecPred, CostKind,
-                                   I);
+                                   VecCondTy->getScalarType(), VecPred,
+                                   CostKind, I);
     }
 
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
@@ -1039,7 +1043,8 @@ InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
       if (LT.first > 1)
         return LT.first * BaseCost +
-               BaseT::getScalarizationOverhead(VecCondTy, true, false);
+               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
+                                               /*Extract*/ false, CostKind);
       return BaseCost;
     }
   }
@@ -1442,7 +1447,8 @@ InstructionCost ARMTTIImpl::getArithmeticInstrCost(
     // Return the cost of multiple scalar invocation plus the cost of
     // inserting and extracting the values.
     SmallVector<Type *> Tys(Args.size(), Ty);
-    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
+    return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
+           Num * Cost;
   }
 
   return BaseCost;
@@ -1581,8 +1587,11 @@ InstructionCost ARMTTIImpl::getGatherScatterOpCost(
   // The scalarization cost should be a lot higher. We use the number of vector
   // elements plus the scalarization overhead.
   InstructionCost ScalarCost =
-      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
-      BaseT::getScalarizationOverhead(VTy, false, true);
+      NumElems * LT.first +
+      BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
+                                      CostKind) +
+      BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
+                                      CostKind);
 
   if (EltSize < 8 || Alignment < EltSize / 8)
     return ScalarCost;

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 6b1e644..69b7a31 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -240,8 +240,9 @@ public:
                                  const Instruction *I = nullptr);
 
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
                                             const SCEV *Ptr);

diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 67e0723..979a436 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -139,14 +139,17 @@ ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth,
 }
 
 InstructionCost HexagonTTIImpl::getScalarizationOverhead(
-    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) {
-  return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                         CostKind);
 }
 
 InstructionCost
 HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                 ArrayRef<Type *> Tys) {
-  return BaseT::getOperandsScalarizationOverhead(Args, Tys);
+                                                 ArrayRef<Type *> Tys,
+                                                 TTI::TargetCostKind CostKind) {
+  return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind);
 }
 
 InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
@@ -329,6 +332,7 @@ InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
 }
 
 InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
   Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
                                    : Val;
@@ -339,8 +343,8 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     if (ElemTy->isIntegerTy(32))
       return Cost;
     // If it's not a 32-bit value, there will need to be an extract.
                                   AddressSpace, TTI::TargetCostKind CostKind,

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index e5b7f40..bb50b5b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1198,13 +1198,14 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 }
 
 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 TTI::TargetCostKind CostKind,
                                                  unsigned Index, Value *Op0,
                                                  Value *Op1) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   if (Opcode != Instruction::ExtractElement &&
       Opcode != Instruction::InsertElement)
-    return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -1218,7 +1219,7 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     return LT.first;
 
   if (!isTypeLegal(Val))
-    return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 
   // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
   // and vslideup + vmv.s.x to insert element to vector.

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index c4cc798..78e035b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -157,8 +157,9 @@ public:
                                  const Instruction *I = nullptr);
 
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 0635150..821efc1 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -532,7 +532,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
       return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
     if (DivRemConst) {
       SmallVector<Type *> Tys(Args.size(), Ty);
-      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
+      return VF * DivMulSeqCost +
+             getScalarizationOverhead(VTy, Args, Tys, CostKind);
     }
     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
       // Temporary hack: disable high vectorization factors with integer
@@ -558,7 +559,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
           getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
       SmallVector<Type *> Tys(Args.size(), Ty);
       InstructionCost Cost =
-          (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
+          (VF * ScalarCost) +
+          getScalarizationOverhead(VTy, Args, Tys, CostKind);
       // FIXME: VF 2 for these FP operations are currently just as
       // expensive as for VF 4.
       if (VF == 2)
@@ -576,8 +578,8 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     // There is no native support for FRem.
     if (Opcode == Instruction::FRem) {
       SmallVector<Type *> Tys(Args.size(), Ty);
-      InstructionCost Cost =
-          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
+      InstructionCost Cost = (VF * LIBCALL_COST) +
+                             getScalarizationOverhead(VTy, Args, Tys, CostKind);
       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
       if (VF == 2 && ScalarBits == 32)
         Cost *= 2;
@@ -865,8 +867,10 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
           (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
         NeedsExtracts = false;
 
-      TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
-      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
+      TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                          NeedsExtracts, CostKind);
+      TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
+                                          /*Extract*/ false, CostKind);
 
       // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
       if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -878,7 +882,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
     if (Opcode == Instruction::FPTrunc) {
       if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
         return VF /*ldxbr/lexbr*/ +
-               getScalarizationOverhead(DstVecTy, true, false);
+               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+                                        /*Extract*/ false, CostKind);
       else // double -> float
         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
     }
@@ -891,7 +896,8 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
         return VF * 2;
       }
       // -> fp128. VF * lxdb/lxeb + extraction of elements.
-      return VF + getScalarizationOverhead(SrcVecTy, false, true);
+      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                           /*Extract*/ true, CostKind);
     }
   }
@@ -996,6 +1002,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 }
 
 InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
   // vlvgp will insert two grs into a vector register, so only count half the
@@ -1013,7 +1020,7 @@ InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
     return Cost;
   }
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 }
 
 // Check if a load may be folded as a memory operand in its user.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 33c3778..1c82e69 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -107,8 +107,9 @@ public:
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment,

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index b94dcd6..9a434d9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -80,12 +80,12 @@ InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
   return Cost;
 }
 
-InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode,
-                                                       Type *Val,
-                                                       unsigned Index,
-                                                       Value *Op0, Value *Op1) {
-  InstructionCost Cost =
-      BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+InstructionCost
+WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                       TTI::TargetCostKind CostKind,
+                                       unsigned Index, Value *Op0, Value *Op1) {
+  InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
+      Opcode, Val, CostKind, Index, Op0, Op1);
 
   // SIMD128's insert/extract currently only take constant indices.
   if (Index == -1u)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 4f54a76..a803fe5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -66,8 +66,9 @@ public:
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   /// @}

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 14f1f83..9366c1b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4257,6 +4257,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 }
 
 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   static const CostTblEntry SLMCostTbl[] = {
@@ -4269,7 +4270,6 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   assert(Val->isVectorTy() && "This must be a vector type");
   Type *ScalarType = Val->getScalarType();
   InstructionCost RegisterFileMoveCost = 0;
-  TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput;
 
   // Non-immediate extraction/insertion can be handled as a sequence of
   // aliased loads+stores via the stack.
@@ -4401,14 +4401,14 @@ InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
     RegisterFileMoveCost += 1;
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1) +
+  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
          RegisterFileMoveCost;
 }
 
-InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
-                                                     const APInt &DemandedElts,
-                                                     bool Insert,
-                                                     bool Extract) {
+InstructionCost
+X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+                                     bool Insert, bool Extract,
+                                     TTI::TargetCostKind CostKind) {
   assert(DemandedElts.getBitWidth() ==
              cast<FixedVectorType>(Ty)->getNumElements() &&
          "Vector size mismatch");
 
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
   MVT MScalarTy = LT.second.getScalarType();
   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
-  TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput;
   InstructionCost Cost = 0;
 
   constexpr unsigned LaneBitWidth = 128;
@@ -4436,8 +4435,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
     // For types we can insert directly, insertion into 128-bit sub vectors is
     // cheap, followed by a cheap chain of concatenations.
     if (LegalVectorBitWidth <= LaneBitWidth) {
-      Cost +=
-          BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
+      Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+                                              /*Extract*/ false, CostKind);
     } else {
       // In each 128-lane, if at least one index is demanded but not all
       // indices are demanded and this 128-lane is not the first 128-lane of
@@ -4477,7 +4476,7 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
         Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                CostKind, I * NumEltsPerLane, LaneTy);
         Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
-                                                false);
+                                                /*Extract*/ false, CostKind);
       }
 
       APInt AffectedLanes =
@@ -4554,8 +4553,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
           continue;
         Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                CostKind, I * NumEltsPerLane, LaneTy);
-        Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, false,
-                                                Extract);
+        Cost += BaseT::getScalarizationOverhead(
+            LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
       }
 
       return Cost;
@@ -4563,7 +4562,8 @@ InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
     }
 
     // Fallback to default extraction.
-    Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+    Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
+                                            Extract, CostKind);
   }
 
   return Cost;
@@ -4815,7 +4815,7 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                    CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
       assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
       Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
-                                       !IsLoad);
+                                       !IsLoad, CostKind);
     }
 
     // This isn't exactly right.
@@ -4856,15 +4856,15 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
       (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
     // Scalarization
     APInt DemandedElts = APInt::getAllOnes(NumElem);
-    InstructionCost MaskSplitCost =
-        getScalarizationOverhead(MaskTy, DemandedElts, false, true);
+    InstructionCost MaskSplitCost = getScalarizationOverhead(
+        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
         CmpInst::BAD_ICMP_PREDICATE, CostKind);
     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
     InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
-    InstructionCost ValueSplitCost =
-        getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
+    InstructionCost ValueSplitCost = getScalarizationOverhead(
+        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
     InstructionCost MemopCost =
         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                          Alignment, AddressSpace, CostKind);
@@ -5174,8 +5174,8 @@ X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   }
 
   // Add the final extract element to the cost.
-  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                            nullptr, nullptr);
+  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                            CostKind, 0, nullptr, nullptr);
 }
 
 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
@@ -5475,8 +5475,8 @@ X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
   }
 
   // Add the final extract element to the cost.
-  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                         nullptr, nullptr);
+  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                         CostKind, 0, nullptr, nullptr);
 }
 
 /// Calculate the cost of materializing a 64-bit value. This helper
@@ -5781,7 +5781,7 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
     auto *MaskTy =
         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
     MaskUnpackCost = getScalarizationOverhead(
-        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
+        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
         Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
         CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -5791,7 +5791,7 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
 
   InstructionCost AddressUnpackCost = getScalarizationOverhead(
       FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
-      /*Insert=*/false, /*Extract=*/true);
+      /*Insert=*/false, /*Extract=*/true, CostKind);
 
   // The cost of the scalar loads/stores.
   InstructionCost MemoryOpCost =
@@ -5800,10 +5800,10 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
 
   // The cost of forming the vector from loaded scalars/
   // scalarizing the vector to perform scalar stores.
-  InstructionCost InsertExtractCost =
-      getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
-                               /*Insert=*/Opcode == Instruction::Load,
-                               /*Extract=*/Opcode == Instruction::Store);
+  InstructionCost InsertExtractCost = getScalarizationOverhead(
+      cast<FixedVectorType>(SrcVTy), DemandedElts,
+      /*Insert=*/Opcode == Instruction::Load,
+      /*Extract=*/Opcode == Instruction::Store, CostKind);
 
   return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost;
 }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index c189e50..d0abfe2 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -147,11 +147,13 @@ public:
                                      TTI::TargetCostKind CostKind,
                                      const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract);
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ca13e79..a28099d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1683,8 +1683,8 @@ private:
 
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Instruction *I,
-                                           ElementCount VF) const;
+  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
+                                           TTI::TargetCostKind CostKind) const;
 
   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
@@ -3443,8 +3443,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
   // to be vectors, so we need to extract individual elements from there,
   // execute VF scalar calls, and then gather the result into the vector return
   // value.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost ScalarCallCost =
-      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
+      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
   if (VF.isScalar())
     return ScalarCallCost;
 
@@ -3455,7 +3456,8 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+  InstructionCost ScalarizationCost =
+      getScalarizationOverhead(CI, VF, CostKind);
 
   InstructionCost Cost =
       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
@@ -3471,7 +3473,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
 
   // If the corresponding vector cost is cheaper, return its cost.
   InstructionCost VectorCallCost =
-      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
+      TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
   if (VectorCallCost < Cost) {
     NeedToScalarize = false;
     Cost = VectorCallCost;
@@ -4478,7 +4480,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
-    ScalarizationCost += getScalarizationOverhead(I, VF);
+    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
 
     // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -6239,13 +6241,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 
       // Compute the scalarization overhead of needed insertelement instructions
      // and phi nodes.
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
      if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
        ScalarCost += TTI.getScalarizationOverhead(
            cast<VectorType>(ToVectorTy(I->getType(), VF)),
-            APInt::getAllOnes(VF.getFixedValue()), true, false);
+            APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
+            /*Extract*/ false, CostKind);
        ScalarCost +=
-            VF.getFixedValue() *
-            TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+            VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
      }
 
       // Compute the scalarization overhead of needed extractelement
@@ -6261,7 +6264,8 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
           else if (needsExtract(J, VF)) {
             ScalarCost += TTI.getScalarizationOverhead(
                 cast<VectorType>(ToVectorTy(J->getType(), VF)),
-                APInt::getAllOnes(VF.getFixedValue()), false, true);
+                APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
+                /*Extract*/ true, CostKind);
           }
       }
 
@@ -6390,14 +6394,15 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() *
-          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
-                              AS, TTI::TCK_RecipThroughput);
+  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                      ValTy->getScalarType(),
+                                                      Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF);
+  Cost += getScalarizationOverhead(I, VF, CostKind);
 
   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
@@ -6410,8 +6415,8 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
     Cost += TTI.getScalarizationOverhead(
         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
-        /*Insert=*/false, /*Extract=*/true);
-    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+        /*Insert=*/false, /*Extract=*/true, CostKind);
+    Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
     if (useEmulatedMaskMemRefHack(I, VF))
       // Artificially setting to a high enough value to practically disable
@@ -6477,7 +6482,7 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
          (isLoopInvariantStoreValue
               ? 0
               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       VF.getKnownMinValue() - 1));
+                                       CostKind, VF.getKnownMinValue() - 1));
 }
 
 InstructionCost
@@ -6772,9 +6777,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
-InstructionCost
-LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
-                                                     ElementCount VF) const {
+InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
+    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
 
   // There is no mechanism yet to create a scalable scalarization loop,
   // so this is currently Invalid.
@@ -6789,8 +6793,9 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
   if (!RetTy->isVoidTy() &&
       (!isa<StoreInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
     Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
-        false);
+        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
+        /*Insert*/ true,
+        /*Extract*/ false, CostKind);
 
   // Some targets keep addresses scalar.
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6810,7 +6815,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
   for (auto *V : filterExtractingOperands(Ops, VF))
     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
   return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), Tys);
+                    filterExtractingOperands(Ops, VF), Tys, CostKind);
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -7067,7 +7072,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
       return (
           TTI.getScalarizationOverhead(
-              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
+              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
       // The back-edge branch will remain, as will all scalar branches.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9bcf73e..e3eb6b1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6664,7 +6664,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
           continue;
         }
       }
-      Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx);
+      Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+                                      Idx);
     }
     // Add a cost for subvector extracts/inserts if required.
     for (const auto &Data : ExtractVectorsTys) {
@@ -6792,7 +6793,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       bool NeedShuffle =
          VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
       InstructionCost InsertCost =
-          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy,
+          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                   /*Index=*/0, PoisonValue::get(VecTy), *It);
       return InsertCost + (NeedShuffle ? TTI->getShuffleCost(
@@ -7047,7 +7048,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
         }
       }
       return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
-                                     *getExtractIndex(I));
+                                     CostKind, *getExtractIndex(I));
     };
     auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
     return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -7116,7 +7117,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
 
     InstructionCost Cost = 0;
     Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
-                                          /*Insert*/ true, /*Extract*/ false);
+                                          /*Insert*/ true, /*Extract*/ false,
+                                          CostKind);
 
     // First cost - resize to actual vector size if not identity shuffle or
     // need to shift the vector.
@@ -7995,6 +7997,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
     auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
     if (MinBWs.count(ScalarRoot)) {
       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
@@ -8004,8 +8007,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
       ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                    VecTy, EU.Lane);
     } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                             CostKind, EU.Lane);
     }
   }
 
@@ -8079,7 +8082,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                                     EstimateShufflesCost);
     InstructionCost InsertCost = TTI->getScalarizationOverhead(
         cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
-        /*Insert*/ true, /*Extract*/ false);
+        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
     Cost -= InsertCost;
   }
 
@@ -8427,9 +8430,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef<Value *> VL,
 InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
                                        const APInt &ShuffledIndices,
                                        bool NeedToShuffle) const {
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost =
       TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
-                                    /*Extract*/ false);
+                                    /*Extract*/ false, CostKind);
   if (NeedToShuffle)
     Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 455fb39..2e48975 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -230,8 +230,10 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
-                                          /* Insert */ true, HasExtract);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  OldCost +=
+      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+                                   /* Insert */ true, HasExtract, CostKind);
 
   // New pattern: load VecPtr
   InstructionCost NewCost =
@@ -346,9 +348,12 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
     return nullptr;
 
   Type *VecTy = Ext0->getVectorOperand()->getType();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
matching types"); - InstructionCost Cost0 = TTI.getVectorInstrCost(*Ext0, VecTy, Index0); - InstructionCost Cost1 = TTI.getVectorInstrCost(*Ext1, VecTy, Index1); + InstructionCost Cost0 = + TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); + InstructionCost Cost1 = + TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); // If both costs are invalid no shuffle is needed if (!Cost0.isValid() && !Cost1.isValid()) @@ -411,11 +416,12 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // both sequences. unsigned Ext0Index = Ext0IndexC->getZExtValue(); unsigned Ext1Index = Ext1IndexC->getZExtValue(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost Extract0Cost = - TTI.getVectorInstrCost(*Ext0, VecTy, Ext0Index); + TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index); InstructionCost Extract1Cost = - TTI.getVectorInstrCost(*Ext1, VecTy, Ext1Index); + TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index); // A more expensive extract will always be replaced by a splat shuffle. // For example, if Ext0 is more expensive: @@ -645,15 +651,16 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) { Mask[Index] = Index + NumElts; Type *ScalarTy = VecTy->getScalarType(); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; InstructionCost OldCost = TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) + - TTI.getVectorInstrCost(I, VecTy, Index); + TTI.getVectorInstrCost(I, VecTy, CostKind, Index); // If the extract has one use, it will be eliminated, so count it in the // original cost. If it has more than one use, ignore the cost because it will // be the same before/after. if (Extract->hasOneUse()) - OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index); + OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index); InstructionCost NewCost = TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) + @@ -801,8 +808,9 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { // Get cost estimate for the insert element. This cost will factor into // both sequences. - InstructionCost InsertCost = - TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost InsertCost = TTI.getVectorInstrCost( + Instruction::InsertElement, VecTy, CostKind, Index); InstructionCost OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; InstructionCost NewCost = ScalarOpCost + InsertCost + @@ -891,8 +899,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { if (!VecTy) return false; - InstructionCost OldCost = TTI.getVectorInstrCost(*Ext0, VecTy, Index0); - OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, Index1); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + InstructionCost OldCost = + TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0); + OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1); OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred) * @@ -912,7 +922,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) { NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, ShufMask); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); - NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CheapIndex); + NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex); // Aggressively form vector ops if the cost is equal because the transform // may enable further optimization. 
@@ -1169,8 +1179,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
     }
 
     auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     OriginalCost +=
-        TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT,
+        TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
                                Index ? Index->getZExtValue() : -1);
     ScalarizedCost +=
         TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
index 5c5ee39..979c3a2 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
@@ -51,7 +51,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
 ; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-- 
2.7.4
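Usage note (illustrative, not part of the patch): with CostKind threaded through, a cost-model client can price the same scalarization question under whichever metric its transform cares about. The sketch below is hypothetical -- the helper name and its setup are invented -- but the two TTI hooks and their post-patch signatures are exactly the ones updated above.

  // Hypothetical sketch: price full extraction of a fixed vector under an
  // explicit cost kind, using the two hooks this patch changes.
  #include "llvm/ADT/APInt.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"

  using namespace llvm;

  static InstructionCost estimateExtractAll(const TargetTransformInfo &TTI,
                                            FixedVectorType *VecTy,
                                            TTI::TargetCostKind CostKind) {
    // Per-element estimate: each extract is priced under the caller's cost
    // kind (e.g. TCK_CodeSize when estimating size) instead of a hard-wired
    // TCK_RecipThroughput inside the hook.
    InstructionCost PerEltCost = 0;
    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I)
      PerEltCost += TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                           CostKind, I, nullptr, nullptr);

    // Bulk estimate: the scalarization helper now takes the same cost kind,
    // so both estimates are computed under one consistent metric.
    InstructionCost BulkCost = TTI.getScalarizationOverhead(
        VecTy, APInt::getAllOnes(VecTy->getNumElements()),
        /*Insert=*/false, /*Extract=*/true, CostKind);

    return PerEltCost.isValid() && PerEltCost < BulkCost ? PerEltCost
                                                         : BulkCost;
  }

A size-oriented caller would pass TTI::TCK_CodeSize while a throughput-oriented one passes TTI::TCK_RecipThroughput; before this change, both queries would have been answered as reciprocal throughput regardless of the caller's intent.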