return Cost;
}
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
+ auto *VT = cast<FixedVectorType>(DataTy);
+ // Assume the target does not have support for gather/scatter operations
+ // and provide a rough estimate.
+ //
+ // First, compute the cost of extracting the individual addresses and the
+ // individual memory operations.
+ int LoadCost =
+ VT->getNumElements() *
+ (getVectorInstrCost(
+ Instruction::ExtractElement,
+ FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
+ VT->getNumElements()),
+ -1) +
+ getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
+
+ // Next, compute the cost of packing the result in a vector.
+ int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
+ Opcode == Instruction::Store);
+
+ int ConditionalCost = 0;
+ if (VariableMask) {
+ // Compute the cost of conditionally executing the memory operations with
+ // variable masks. This includes extracting the individual conditions, a
+ // branches and PHIs to combine the results.
+ // NOTE: Estimating the cost of conditionally executing the memory
+ // operations accurately is quite difficult and the current solution
+ // provides a very rough estimate only.
+ ConditionalCost =
+ VT->getNumElements() *
+ (getVectorInstrCost(
+ Instruction::ExtractElement,
+ FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
+ VT->getNumElements()),
+ -1) +
+ getCFInstrCost(Instruction::Br, CostKind) +
+ getCFInstrCost(Instruction::PHI, CostKind));
+ }
+
+ return LoadCost + PackingCost + ConditionalCost;
+ }
+
unsigned getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32 immarg, <4 x i1>, <4 x i8>)
define <4 x i8> @gather_load_4xi8_constant_mask(<4 x i8*> %ptrs) {
; CHECK: gather_load_4xi8_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
;
%lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
ret <4 x i8> %lv
define <4 x i8> @gather_load_4xi8_variable_mask(<4 x i8*> %ptrs, <4 x i1> %cond) {
; CHECK: gather_load_4xi8_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8
;
%lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef)
ret <4 x i8> %lv
declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32 immarg, <4 x i1>)
define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x i8*> %ptrs) {
; CHECK: scatter_store_4xi8_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
;
call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
ret void
define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x i8*> %ptrs, <4 x i1> %cond) {
; CHECK: scatter_store_4xi8_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(
;
call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %val, <4 x i8*> %ptrs, i32 1, <4 x i1> %cond)
ret void
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32 immarg, <4 x i1>, <4 x i32>)
define <4 x i32> @gather_load_4xi32_constant_mask(<4 x i32*> %ptrs) {
; CHECK: gather_load_4xi32_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
;
%lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
ret <4 x i32> %lv
define <4 x i32> @gather_load_4xi32_variable_mask(<4 x i32*> %ptrs, <4 x i1> %cond) {
; CHECK: gather_load_4xi32_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32
;
%lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef)
ret <4 x i32> %lv
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32 immarg, <4 x i1>)
define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x i32*> %ptrs) {
; CHECK: scatter_store_4xi32_constant_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
;
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
ret void
define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x i32*> %ptrs, <4 x i1> %cond) {
; CHECK: scatter_store_4xi32_variable_mask
-; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
-; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-NEON: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-128: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-256: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
+; CHECK-SVE-512: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(
;
call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %ptrs, i32 1, <4 x i1> %cond)
ret void
; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
; REMARK-NEXT: - Cost: '-7'
;
-; REMARK-LABEL: Function: gather_load
-; REMARK: Args:
-; REMARK-NEXT: - String: 'Stores SLP vectorized with cost
-; REMARK-NEXT: - Cost: '-2'
+; REMARK-NOT: Function: gather_load
define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: @gather_multiple_use(
define void @gather_load(i16* noalias %ptr) {
; CHECK-LABEL: @gather_load(
; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> <i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3)>, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
-; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i16> [[TMP2]], <i16 10, i16 20, i16 30, i16 40>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX182]] to <4 x i16>*
-; CHECK-NEXT: store <4 x i16> [[TMP3]], <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 2
+; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 3
+; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 4
+; CHECK-NEXT: [[L0:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 1, i64 0), align 1
+; CHECK-NEXT: [[CONV150:%.*]] = zext i8 [[L0]] to i16
+; CHECK-NEXT: [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
+; CHECK-NEXT: [[L1:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 2, i64 1), align 1
+; CHECK-NEXT: [[CONV156:%.*]] = zext i8 [[L1]] to i16
+; CHECK-NEXT: [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
+; CHECK-NEXT: [[L2:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 3, i64 2), align 1
+; CHECK-NEXT: [[CONV162:%.*]] = zext i8 [[L2]] to i16
+; CHECK-NEXT: [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
+; CHECK-NEXT: [[L3:%.*]] = load i8, i8* getelementptr inbounds ([6 x [258 x i8]], [6 x [258 x i8]]* @data, i64 0, i64 4, i64 3), align 1
+; CHECK-NEXT: [[CONV168:%.*]] = zext i8 [[L3]] to i16
+; CHECK-NEXT: [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
+; CHECK-NEXT: store i16 [[ADD152]], i16* [[ARRAYIDX182]], align 2
+; CHECK-NEXT: store i16 [[ADD158]], i16* [[ARRAYIDX183]], align 2
+; CHECK-NEXT: store i16 [[ADD164]], i16* [[ARRAYIDX184]], align 2
+; CHECK-NEXT: store i16 [[ADD170]], i16* [[ARRAYIDX185]], align 2
; CHECK-NEXT: ret void
;
%arrayidx182 = getelementptr inbounds i16, i16* %ptr, i64 1