[Analysis][AArch64] Make fixed-width ordered reductions slightly more expensive

author David Sherwood <david.sherwood@arm.com>

Wed, 18 Aug 2021 08:40:21 +0000 (09:40 +0100)

committer David Sherwood <david.sherwood@arm.com>

Wed, 18 Aug 2021 16:01:56 +0000 (17:01 +0100)
author David Sherwood <david.sherwood@arm.com>
Wed, 18 Aug 2021 08:40:21 +0000 (09:40 +0100)
committer David Sherwood <david.sherwood@arm.com>
Wed, 18 Aug 2021 16:01:56 +0000 (17:01 +0100)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

index 1c20ddd..869c2c1 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1999,8 +1999,13 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                             Optional<FastMathFlags> FMF,
                                             TTI::TargetCostKind CostKind) {
    if (TTI::requiresOrderedReduction(FMF)) {
-    if (!isa<ScalableVectorType>(ValTy))
-      return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+    if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
+      InstructionCost BaseCost =
+          BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+      // Add on extra cost to reflect the extra overhead on some CPUs. We still
+      // end up vectorizing for more computationally intensive loops.
+      return BaseCost + FixedVTy->getNumElements();
+    }
  
      if (Opcode != Instruction::FAdd)
        return InstructionCost::getInvalid();
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll

index 935f245..b1f0c1d 100644 (file)
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll
@@ -2,10 +2,10 @@
  
  define void @strict_fp_reductions() {
  ; CHECK-LABEL: strict_fp_reductions
-; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
    %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
    %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
    %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll

index 1f309ff..20ed971 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -6,8 +6,8 @@
  
  target triple="aarch64-unknown-linux-gnu"
  
-; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction:   %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 21 for VF 4 For instruction:   %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 42 for VF 8 For instruction:   %add = fadd float %0, %sum.07
  
  define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
  entry:
@@ -28,8 +28,8 @@ for.end:
  }
  
  
-; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction:   %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction:   %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction:   %add = fadd double %0, %sum.07
  
  define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
  entry:
author	David Sherwood <david.sherwood@arm.com>
	Wed, 18 Aug 2021 08:40:21 +0000 (09:40 +0100)
committer	David Sherwood <david.sherwood@arm.com>
	Wed, 18 Aug 2021 16:01:56 +0000 (17:01 +0100)
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp		patch \| blob \| history
llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll		patch \| blob \| history