#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);
+class TailFoldingKind {
+private:
+  uint8_t Bits = 0; // Currently defaults to disabled.
+
+public:
+  enum TailFoldingOpts {
+    TFDisabled = 0x0,
+    TFReductions = 0x01,
+    TFRecurrences = 0x02,
+    TFSimple = 0x80,
+    TFAll = TFReductions | TFRecurrences | TFSimple
+  };
+
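+  // Parse a '+'-separated list of options applied left to right: "disabled",
+  // "all" and "default" reset the whole mask, while the remaining options
+  // add or remove individual bits.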
+  void operator=(const std::string &Val) {
+    if (Val.empty())
+      return;
+    SmallVector<StringRef, 6> TailFoldTypes;
+    StringRef(Val).split(TailFoldTypes, '+', -1, false);
+    for (auto TailFoldType : TailFoldTypes) {
+      if (TailFoldType == "disabled")
+        Bits = 0;
+      else if (TailFoldType == "all")
+        Bits = TFAll;
+      else if (TailFoldType == "default")
+        Bits = 0; // Currently defaults to never tail-folding.
+      else if (TailFoldType == "simple")
+        add(TFSimple);
+      else if (TailFoldType == "reductions")
+        add(TFReductions);
+      else if (TailFoldType == "recurrences")
+        add(TFRecurrences);
+      else if (TailFoldType == "noreductions")
+        remove(TFReductions);
+      else if (TailFoldType == "norecurrences")
+        remove(TFRecurrences);
+      else {
+        errs()
+            << "invalid argument " << TailFoldType.str()
+            << " to -sve-tail-folding=; each element must be one of: disabled, "
+               "all, default, simple, reductions, noreductions, recurrences, "
+               "norecurrences\n";
+      }
+    }
+  }
+
+  operator uint8_t() const { return Bits; }
+
+  void add(uint8_t Flag) { Bits |= Flag; }
+  void remove(uint8_t Flag) { Bits &= ~Flag; }
+};
+
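+// Storage for the -sve-tail-folding option; written via cl::location below.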
+TailFoldingKind TailFoldingKindLoc;
+
+cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+    "sve-tail-folding",
+    cl::desc(
+        "Control the use of vectorisation using tail-folding for SVE:"
+        "\ndisabled    No loop types will vectorize using tail-folding"
+        "\ndefault     Uses the default tail-folding settings for the target "
+        "CPU"
+        "\nall         All legal loop types will vectorize using tail-folding"
+        "\nsimple      Use tail-folding for simple loops (not reductions or "
+        "recurrences)"
+        "\nreductions  Use tail-folding for loops containing reductions"
+        "\nrecurrences Use tail-folding for loops containing first order "
+        "recurrences"),
+    cl::location(TailFoldingKindLoc));
+
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
@@ ... @@
  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
+
+bool AArch64TTIImpl::preferPredicateOverEpilogue(
+    Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
+  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+    return false;
+
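+  // Work out which kinds of tail-folding this loop needs: a loop with no
+  // reductions or first-order recurrences only requires the "simple" kind.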
+  TailFoldingKind Required; // Defaults to 0.
+  if (LVL->getReductionVars().size())
+    Required.add(TailFoldingKind::TFReductions);
+  if (LVL->getFirstOrderRecurrences().size())
+    Required.add(TailFoldingKind::TFRecurrences);
+  if (!Required)
+    Required.add(TailFoldingKind::TFSimple);
+
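+  // Fold the tail only if every kind the loop requires has been enabled.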
+  return (TailFoldingKindLoc & Required) == Required;
+}
--- /dev/null
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+
+target triple = "aarch64-unknown-linux-gnu"
+
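+; Tail-folded loops are recognised below by the active lane mask phi in
+; vector.body and by masked loads/stores replacing the unmasked ones.
+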
+define void @simple_memset(i32 %val, i32* %ptr, i64 %n) #0 {
+; CHECK-NOTF-LABEL: @simple_memset(
+; CHECK-NOTF: vector.ph:
+; CHECK-NOTF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-NOTF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NOTF: vector.body:
+; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NOTF: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*
+
+; CHECK-TF-NORED-LABEL: @simple_memset(
+; CHECK-TF-NORED: vector.ph:
+; CHECK-TF-NORED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-TF-NORED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF-NORED: vector.body:
+; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
+; CHECK-TF-NOREC-LABEL: @simple_memset(
+; CHECK-TF-NOREC: vector.ph:
+; CHECK-TF-NOREC: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-TF-NOREC: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF-NOREC: vector.body:
+; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREC: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
+; CHECK-TF-LABEL: @simple_memset(
+; CHECK-TF: vector.ph:
+; CHECK-TF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-TF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
+; CHECK-TF-ONLYRED-LABEL: @simple_memset(
+; CHECK-TF-ONLYRED: vector.ph:
+; CHECK-TF-ONLYRED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-TF-ONLYRED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF-ONLYRED: vector.body:
+; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*
+
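+; A straightforward store loop with no reductions or recurrences; it is only
+; tail-folded when the "simple" kind is enabled.
+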
+entry:
+  br label %while.body
+
+while.body:                                     ; preds = %while.body, %entry
+  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
+  %gep = getelementptr i32, i32* %ptr, i64 %index
+  store i32 %val, i32* %gep
+  %index.next = add nsw i64 %index, 1
+  %cmp10 = icmp ult i64 %index.next, %n
+  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0
+
+while.end.loopexit:                             ; preds = %while.body
+  ret void
+}
+
+define float @fadd_red_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
+; CHECK-NOTF-LABEL: @fadd_red_fast
+; CHECK-NOTF: vector.body:
+; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NOTF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NOTF: middle.block:
+; CHECK-NOTF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
+; CHECK-TF-NORED-LABEL: @fadd_red_fast
+; CHECK-TF-NORED: vector.body:
+; CHECK-TF-NORED-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-NORED: %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-TF-NORED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF-NORED: middle.block:
+; CHECK-TF-NORED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
+; CHECK-TF-NOREC-LABEL: @fadd_red_fast
+; CHECK-TF-NOREC: vector.body:
+; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREC: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
+; CHECK-TF-NOREC: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREC: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF-NOREC: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
+; CHECK-TF-NOREC: middle.block:
+; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-TF-LABEL: @fadd_red_fast
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
+; CHECK-TF: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
+; CHECK-TF: middle.block:
+; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-TF-ONLYRED-LABEL: @fadd_red_fast
+; CHECK-TF-ONLYRED: vector.body:
+; CHECK-TF-ONLYRED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-ONLYRED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
+; CHECK-TF-ONLYRED: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-ONLYRED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
+; CHECK-TF-ONLYRED: middle.block:
+; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
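+
+; A loop whose only loop-carried value besides the induction variable is the
+; fast fadd reduction %sum.07, so it is tail-folded whenever the "reductions"
+; kind is enabled.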
+entry:
+  br label %for.body
+
+for.body:                                       ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd fast float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                        ; preds = %for.body
+  ret float %add
+}
+
+define void @add_recur(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
+; CHECK-NOTF-LABEL: @add_recur
+; CHECK-NOTF: entry:
+; CHECK-NOTF: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-NOTF: vector.ph:
+; CHECK-NOTF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NOTF: vector.body:
+; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NOTF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NOTF: %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NOTF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NOTF: store <vscale x 4 x i32> %[[ADD]]
+
+; CHECK-TF-NORED-LABEL: @add_recur
+; CHECK-TF-NORED: entry:
+; CHECK-TF-NORED: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF-NORED: vector.ph:
+; CHECK-TF-NORED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF-NORED: vector.body:
+; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NORED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF-NORED: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NORED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
+
+; CHECK-TF-NOREC-LABEL: @add_recur
+; CHECK-TF-NOREC: entry:
+; CHECK-TF-NOREC: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF-NOREC: vector.ph:
+; CHECK-TF-NOREC: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF-NOREC: vector.body:
+; CHECK-TF-NOREC-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-NOREC: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF-NOREC: %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]
+
+; CHECK-TF-LABEL: @add_recur
+; CHECK-TF: entry:
+; CHECK-TF: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF: vector.ph:
+; CHECK-TF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>{{.*}} %[[ACTIVE_LANE_MASK]]
+; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
+
+; CHECK-TF-ONLYRED-LABEL: @add_recur
+; CHECK-TF-ONLYRED: entry:
+; CHECK-TF-ONLYRED: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF-ONLYRED: vector.ph:
+; CHECK-TF-ONLYRED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF-ONLYRED: vector.body:
+; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-TF-ONLYRED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF-ONLYRED: %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-TF-ONLYRED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]
+
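+; Each iteration reuses the value loaded by the previous one (%0), forming a
+; first-order recurrence, so the loop is tail-folded whenever the
+; "recurrences" kind is enabled.
+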
+entry:
+  %.pre = load i32, i32* %src, align 4
+  br label %for.body
+
+for.body:                                       ; preds = %entry, %for.body
+  %0 = phi i32 [ %1, %for.body ], [ %.pre, %entry ]
+  %i.010 = phi i64 [ %add, %for.body ], [ 0, %entry ]
+  %add = add nuw nsw i64 %i.010, 1
+  %arrayidx1 = getelementptr inbounds i32, i32* %src, i64 %add
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add2 = add nsw i32 %1, %0
+  %arrayidx3 = getelementptr inbounds i32, i32* %dst, i64 %i.010
+  store i32 %add2, i32* %arrayidx3, align 4
+  %exitcond.not = icmp eq i64 %add, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                        ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
+
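+; Hint the vectorizer towards VF vscale x 4 with no interleaving so the
+; shapes checked above stay stable.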
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}