Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) {
if (TTI::requiresOrderedReduction(FMF)) {
- if (!isa<ScalableVectorType>(ValTy))
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+ if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
+ InstructionCost BaseCost =
+ BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+ // Add on extra cost to reflect the extra overhead on some CPUs. We still
+ // end up vectorizing for more computationally intensive loops.
+ return BaseCost + FixedVTy->getNumElements();
+ }
if (Opcode != Instruction::FAdd)
return InstructionCost::getInvalid();
define void @strict_fp_reductions() {
; CHECK-LABEL: strict_fp_reductions
-; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
%fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
%fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
%fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
target triple="aarch64-unknown-linux-gnu"
-; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 21 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 42 for VF 8 For instruction: %add = fadd float %0, %sum.07
define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
entry:
}
-; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %add = fadd double %0, %sum.07
define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
entry: