[AArch64][LoopVectorize] Enable tail-folding of simple loops on neoverse-v1

author David Sherwood <david.sherwood@arm.com>

Tue, 25 Apr 2023 08:46:41 +0000 (08:46 +0000)

committer David Sherwood <david.sherwood@arm.com>

Thu, 18 May 2023 10:35:57 +0000 (10:35 +0000)
author David Sherwood <david.sherwood@arm.com>
Tue, 25 Apr 2023 08:46:41 +0000 (08:46 +0000)
committer David Sherwood <david.sherwood@arm.com>
Thu, 18 May 2023 10:35:57 +0000 (10:35 +0000)
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp

index 843c3b7..670f84d 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -226,6 +226,7 @@ void AArch64Subtarget::initializeProperties() {
      PrefLoopAlignment = Align(32);
      MaxBytesForLoopAlignment = 16;
      VScaleForTuning = 2;
+    DefaultSVETFOpts = TailFoldingOpts::Simple;
      break;
    case Neoverse512TVB:
      PrefFunctionAlignment = Align(16);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

index c326c12..385ba9a 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -39,6 +39,9 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
  static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                              cl::init(10), cl::Hidden);
  
+static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
+                                                  cl::init(15), cl::Hidden);
+
  namespace {
  class TailFoldingOption {
    // These bitfields will only ever be set to something non-zero in operator=,
@@ -3558,8 +3561,19 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
    if (Required == TailFoldingOpts::Disabled)
      Required |= TailFoldingOpts::Simple;
  
-  return TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
-                                        Required);
+  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
+                                      Required))
+    return false;
+
+  // Don't tail-fold for tight loops where we would be better off interleaving
+  // with an unpredicated loop.
+  unsigned NumInsns = 0;
+  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
+    NumInsns += BB->sizeWithoutDebug();
+  }
+
+  // We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
+  return NumInsns >= SVETailFoldInsnThreshold;
  }
  
  InstructionCost
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll

index c65b10c..454a978 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,5 @@
  ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
-; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
+; RUN:   -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
  ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
  ; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
  ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll

index c3348fd..01f4d09 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,11 +1,16 @@
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
-; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences+reverse | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -passes=loop-vectorize -sve-tail-folding-insn-threshold=0 -S -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
  
  target triple = "aarch64-unknown-linux-gnu"
  
@@ -58,6 +63,14 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
  ; CHECK-TF-ONLYRED-NOT:     %{{.*}} = phi <vscale x 4 x i1>
  ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[SPLAT]], ptr
  
+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-NEOVERSE-V1:         %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
  entry:
    br label %while.body
  
@@ -129,6 +142,15 @@ define float @fadd_red_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
  ; CHECK-TF-ONLYRED:         %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
  ; CHECK-TF-ONLYRED:       middle.block:
  ; CHECK-TF-ONLYRED-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1:       middle.block:
+; CHECK-NEOVERSE-V1-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
  entry:
    br label %for.body
  
@@ -225,6 +247,19 @@ define void @add_recur(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
  ; CHECK-TF-ONLYRED:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
  ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[ADD]]
  
+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1:       entry:
+; CHECK-NEOVERSE-V1:         %[[PRE:.*]] = load i32, ptr %src, align 4
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1:         %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1:         store <vscale x 4 x i32> %[[ADD]]
+
  entry:
    %.pre = load i32, ptr %src, align 4
    br label %for.body
@@ -276,6 +311,12 @@ define void @interleave(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
  ; CHECK-TF-NOREV:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; CHECK-TF-NOREV:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  
+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <8 x float>, ptr
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
  entry:
    br label %for.body
  
@@ -335,6 +376,12 @@ define void @reverse(ptr noalias %dst, ptr noalias %src) #0 {
  ; CHECK-TF-NOREC:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
  ; CHECK-TF-NOREC:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0({{.*}} <vscale x 2 x i1> %reverse
  
+; CHECK-TF-NEOVERSE-V1-LABEL: @reverse(
+; CHECK-TF-NEOVERSE-V1:       vector.body:
+; CHECK-TF-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-NEOVERSE-V1:         %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NEOVERSE-V1:         %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
  entry:
    br label %for.body
  
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll

index 81ceb0a..45fca16 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll
@@ -1,5 +1,5 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding=all -S < %s | FileCheck %s
+; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S < %s | FileCheck %s
  
  target triple = "aarch64"
author	David Sherwood <david.sherwood@arm.com>
	Tue, 25 Apr 2023 08:46:41 +0000 (08:46 +0000)
committer	David Sherwood <david.sherwood@arm.com>
	Thu, 18 May 2023 10:35:57 +0000 (10:35 +0000)
llvm/lib/Target/AArch64/AArch64Subtarget.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll		patch \| blob \| history