From 636efd2e3508f4a39717cee75c1255a48acb8f4e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Tue, 14 Mar 2023 18:15:03 +0000
Subject: [PATCH] [SVE][LoopVectorize] Add option to disable tail-folding for
 reverse loops

If we use tail-folding for reverse loops that contain loads
and stores then we will need to reverse the loop predicate.
This patch adds a new 'reverse' sve-tail-folding option and
ensures they are not considered 'simple'.

I did this by adding a function called
containsDecreasingPointers to AArch64TargetTransformInfo.cpp
that searches all instructions in the loop for loads or
stores with negative strides.

Differential Revision: https://reviews.llvm.org/D146128
---
 .../Vectorize/LoopVectorizationLegality.h          |  4 +
 .../Target/AArch64/AArch64TargetTransformInfo.cpp  | 37 ++++++++-
 .../AArch64/sve-tail-folding-option.ll             | 89 +++++++++++++++++++++-
 3 files changed, 127 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 7f6430c..6c42327 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -388,6 +388,10 @@ public:
     return ConditionalAssumes;
   }
 
+  PredicatedScalarEvolution *getPredicatedScalarEvolution() const {
+    return &PSE;
+  }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3e0f419..f145ae8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -49,8 +49,9 @@ public:
     TFDisabled = 0x0,
     TFReductions = 0x01,
     TFRecurrences = 0x02,
+    TFReverse = 0x04,
     TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFSimple
+    TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
   };
 
   void operator=(const std::string &Val) {
@@ -71,10 +72,14 @@ public:
         add(TFReductions);
       else if (TailFoldType == "recurrences")
         add(TFRecurrences);
+      else if (TailFoldType == "reverse")
+        add(TFReverse);
       else if (TailFoldType == "noreductions")
         remove(TFReductions);
       else if (TailFoldType == "norecurrences")
         remove(TFRecurrences);
+      else if (TailFoldType == "noreverse")
+        remove(TFReverse);
       else {
         errs()
             << "invalid argument " << TailFoldType.str()
@@ -106,7 +111,9 @@ cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
         "recurrences)"
         "\nreductions  Use tail-folding for loops containing reductions"
         "\nrecurrences Use tail-folding for loops containing fixed order "
-        "recurrences"),
+        "recurrences"
+        "\nreverse     Use tail-folding for loops requiring reversed "
+        "predicates"),
     cl::location(TailFoldingKindLoc));
 
 // Experimental option that will only be fully functional when the
@@ -3383,6 +3390,26 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
 }
 
+static bool containsDecreasingPointers(Loop *TheLoop,
+                                       PredicatedScalarEvolution *PSE) {
+  const ValueToValueMap &Strides = ValueToValueMap();
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // Scan the instructions in the block and look for addresses that are
+    // consecutive and decreasing.
+    for (Instruction &I : *BB) {
+      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
+        Value *Ptr = getLoadStorePointerOperand(&I);
+        Type *AccessTy = getLoadStoreType(&I);
+        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
+                         /*ShouldCheckWrap=*/false)
+                .value_or(0) < 0)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool AArch64TTIImpl::preferPredicateOverEpilogue(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
@@ -3401,6 +3428,12 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(
     Required.add(TailFoldingKind::TFReductions);
   if (LVL->getFixedOrderRecurrences().size())
     Required.add(TailFoldingKind::TFRecurrences);
+
+  // We call this to discover whether any load/store pointers in the loop have
+  // negative strides. This will require extra work to reverse the loop
+  // predicate, which may be expensive.
+  if (containsDecreasingPointers(L, LVL->getPredicatedScalarEvolution()))
+    Required.add(TailFoldingKind::TFReverse);
   if (!Required)
     Required.add(TailFoldingKind::TFSimple);
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
index c805e27..bcc53f0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,9 +1,10 @@
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -33,6 +34,14 @@ define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
 ; CHECK-TF-NOREC:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
 ; CHECK-TF-NOREC:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
 
+; CHECK-TF-NOREV-LABEL: @simple_memset(
+; CHECK-TF-NOREV:       vector.ph:
+; CHECK-TF-NOREV:         %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-TF-NOREV:         %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF-NOREV:       vector.body:
+; CHECK-TF-NOREV:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
 ; CHECK-TF-LABEL: @simple_memset(
 ; CHECK-TF:       vector.ph:
 ; CHECK-TF:         %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
@@ -91,6 +100,16 @@ define float @fadd_red_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-TF-NOREC:       middle.block:
 ; CHECK-TF-NOREC-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
 
+; CHECK-TF-NOREV-LABEL: @fadd_red_fast
+; CHECK-TF-NOREV:       vector.body:
+; CHECK-TF-NOREV:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV:         %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
+; CHECK-TF-NOREV:         %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV:         %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF-NOREV:         %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
+; CHECK-TF-NOREV:       middle.block:
+; CHECK-TF-NOREV-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
 ; CHECK-TF-LABEL: @fadd_red_fast
 ; CHECK-TF:       vector.body:
 ; CHECK-TF:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
@@ -167,6 +186,19 @@ define void @add_recur(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
 ; CHECK-TF-NOREC:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-NOREC:         store <vscale x 4 x i32> %[[ADD]]
 
+; CHECK-TF-NOREV-LABEL: @add_recur
+; CHECK-TF-NOREV:       entry:
+; CHECK-TF-NOREV:         %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF-NOREV:       vector.ph:
+; CHECK-TF-NOREV:         %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF-NOREV:       vector.body:
+; CHECK-TF-NOREV:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF-NOREV:         %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREV:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF-NOREV:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
+
 ; CHECK-TF-LABEL: @add_recur
 ; CHECK-TF:       entry:
 ; CHECK-TF:         %[[PRE:.*]] = load i32, i32* %src, align 4
@@ -238,6 +270,12 @@ define void @interleave(float* noalias %dst, float* noalias %src, i64 %n) #0 {
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 
+; CHECK-TF-NOREV-LABEL: @interleave(
+; CHECK-TF-NOREV:       vector.body:
+; CHECK-TF-NOREV:         %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-TF-NOREV:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-TF-NOREV:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
 entry:
   br label %for.body
 
@@ -266,6 +304,55 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
+define void @reverse(double* noalias %dst, double* noalias %src) #0 {
+; CHECK-NOTF-LABEL: @reverse(
+; CHECK-NOTF:       vector.body:
+; CHECK-NOTF-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NOTF:         %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-NOTF:         %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
+; CHECK-TF-NOREV-LABEL: @reverse(
+; CHECK-TF-NOREV:       vector.body:
+; CHECK-TF-NOREV-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV:         %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NOREV:         %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
+; CHECK-TF-LABEL: @reverse(
+; CHECK-TF:       vector.body:
+; CHECK-TF:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>{{.*}} <vscale x 2 x i1> %reverse
+
+; CHECK-TF-NORED-LABEL: @reverse(
+; CHECK-TF-NORED:       vector.body:
+; CHECK-TF-NORED:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF-NORED:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NORED:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>{{.*}} <vscale x 2 x i1> %reverse
+
+; CHECK-TF-NOREC-LABEL: @reverse(
+; CHECK-TF-NOREC:       vector.body:
+; CHECK-TF-NOREC:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF-NOREC:         %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NOREC:         %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64(<vscale x 2 x double>{{.*}} <vscale x 2 x i1> %reverse
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %src, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %dst, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" }
 
 !0 = distinct !{!0, !1, !2, !3, !4}
-- 
2.7.4