This patch adds support for reverse loop vectorization.
It is possible to vectorize the following loop:
```
for (int i = n-1; i >= 0; --i)
a[i] = b[i] + 1.0;
```
with either fixed-width or scalable vectors.
The loop-vectorizer will use 'reverse' on the loads/stores to make
sure the lanes themselves are also handled in the right order.
This patch also adds CreateVectorReverse to the IRBuilder interface to
create a reversed vector. It lowers to the experimental.vector.reverse
intrinsic for scalable vectors and keeps the original behaviour for
fixed-width vectors, using a reverse shufflevector.
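As an illustration only, a minimal sketch of how the new interface might be
used; the helper name, the load and its alignment are hypothetical, and only
CreateVectorReverse itself is added by this patch:
```
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: load one vector's worth of elements and reverse the
// lanes. Ptr is assumed to point at VecTy; the alignment is illustrative.
// For fixed-width types CreateVectorReverse emits a shufflevector with a
// reversed mask; for scalable types it emits a call to
// llvm.experimental.vector.reverse.
static Value *emitReversedLoad(IRBuilder<> &Builder, Value *Ptr,
                               VectorType *VecTy) {
  Value *Wide = Builder.CreateAlignedLoad(VecTy, Ptr, Align(8), "wide.load");
  return Builder.CreateVectorReverse(Wide, "reverse");
}
```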
Differential Revision: https://reviews.llvm.org/D95363
/// address space before call and casted back to Ptr type after call.
Value *CreateStripInvariantGroup(Value *Ptr);
+  /// Return a vector value that contains the vector \p V reversed.
+ Value *CreateVectorReverse(Value *V, const Twine &Name = "");
+
/// Return a vector value that contains \arg V broadcasted to \p
/// NumElts elements.
Value *CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name = "");
return Fn;
}
+Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) {
+ auto *Ty = cast<VectorType>(V->getType());
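+  // A shufflevector mask requires a compile-time constant number of
+  // elements, so scalable vectors are reversed with the
+  // experimental.vector.reverse intrinsic instead.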
+ if (isa<ScalableVectorType>(Ty)) {
+ Module *M = BB->getParent()->getParent();
+ Function *F = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reverse, Ty);
+ return Insert(CallInst::Create(F, V), Name);
+ }
+ // Keep the original behaviour for fixed vector
+ SmallVector<int, 8> ShuffleMask;
+ int NumElts = Ty->getElementCount().getKnownMinValue();
+ for (int i = 0; i < NumElts; ++i)
+ ShuffleMask.push_back(NumElts - i - 1);
+ return CreateShuffleVector(V, ShuffleMask, Name);
+}
+
Value *IRBuilderBase::CreateVectorSplat(unsigned NumElts, Value *V,
const Twine &Name) {
auto EC = ElementCount::getFixed(NumElts);
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
- assert(!VF.isScalable() && "Cannot reverse scalable vectors");
- SmallVector<int, 8> ShuffleMask;
- for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
- ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
- return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
+ return Builder.CreateVectorReverse(Vec, "reverse");
}
// Return whether we allow using masked interleave-groups (for dealing with
bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
InBounds = gep->isInBounds();
-
if (Reverse) {
- assert(!VF.isScalable() &&
- "Reversing vectors is not yet supported for scalable vectors.");
-
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
- PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
- ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
+ // RunTimeVF = VScale * VF.getKnownMinValue()
+      // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue().
+ Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
+ // NumElt = -Part * RunTimeVF
+ Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
+ // LastLane = 1 - RunTimeVF
+ Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
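+      // For example, with RunTimeVF = 4: Part 0 accesses elements
+      // Ptr[-3 .. 0] (NumElt = 0, LastLane = -3) and Part 1 accesses
+      // Ptr[-7 .. -4] (NumElt = -4, LastLane = -3).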
+ PartPtr =
+ cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
PartPtr->setIsInBounds(InBounds);
- PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
- ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
+ PartPtr = cast<GetElementPtrInst>(
+ Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
PartPtr->setIsInBounds(InBounds);
if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
--- /dev/null
+; This is the loop in C++ being vectorized in this file with
+; experimental.vector.reverse
+
+; #pragma clang loop vectorize_width(4, scalable)
+; for (long int i = N - 1; i >= 0; i--)
+; {
+; if (cond[i])
+; a[i] += 1;
+; }
+
+; This test checks that the mask is correctly created, reversed and used.
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
+; CHECK-LABEL: vector.body:
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* nonnull %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
+; CHECK-NEXT: %[[REVERSE7:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[WIDEMSKLOAD]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[REVERSE7]]
+; CHECK-NEXT: %[[REVERSE8:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> %[[FADD]])
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[REVERSE8]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
+
+entry:
+ %cmp7 = icmp sgt i64 %N, 0
+ br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup, %entry
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.08.in = phi i64 [ %i.08, %for.inc ], [ %N, %entry ]
+ %i.08 = add nsw i64 %i.08.in, -1
+ %arrayidx = getelementptr inbounds double, double* %cond, i64 %i.08
+ %0 = load double, double* %arrayidx, align 8
+ %tobool = fcmp une double %0, 0.000000e+00
+ br i1 %tobool, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+ %1 = load double, double* %arrayidx1, align 8
+ %add = fadd double %1, 1.000000e+00
+ store double %add, double* %arrayidx1, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %cmp = icmp sgt i64 %i.08.in, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
+
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
--- /dev/null
+; This is the loop in C++ being vectorized in this file with
+; experimental.vector.reverse
+; #pragma clang loop vectorize_width(8, scalable)
+; for (int i = N-1; i >= 0; --i)
+; a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: @vector_reverse_f64
+; CHECK-LABEL: vector.body:
+; CHECK: %[[ADD:.*]] = add i64 %{{.*}}, %N
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds double, double* %b, i64 %[[ADD]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <vscale x 8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[WIDE]])
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 8 x double> %[[REVERSE]], shufflevector
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* %a, i64 %[[ADD]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> %[[FADD]])
+; CHECK-NEXT: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast double* %[[GEP3]] to <vscale x 8 x double>*
+; CHECK-NEXT: store <vscale x 8 x double> %[[REVERSE6]], <vscale x 8 x double>* %[[CAST1]], align 8
+
+entry:
+ %cmp7 = icmp sgt i64 %N, 0
+ br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.08.in = phi i64 [ %i.08, %for.body ], [ %N, %entry ]
+ %i.08 = add nsw i64 %i.08.in, -1
+ %arrayidx = getelementptr inbounds double, double* %b, i64 %i.08
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+ store double %add, double* %arrayidx1, align 8
+ %cmp = icmp sgt i64 %i.08.in, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+
+define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: vector_reverse_i64
+; CHECK-LABEL: vector.body:
+; CHECK: %[[ADD:.*]] = add i64 %{{.*}}, %N
+; CHECK-NEXT: %[[GEP:.*]] = getelementptr inbounds i64, i64* %b, i64 %[[ADD]]
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR:.*]] = or i32 %[[MUL]], 1
+; CHECK-NEXT: %[[SEXT:.*]] = sext i32 %[[OR]] to i64
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i64 %[[SEXT]]
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <vscale x 8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[WIDE]])
+; CHECK-NEXT: %[[ADD1:.*]] = add <vscale x 8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* %a, i64 %[[ADD]]
+; CHECK-NEXT: %[[REVERSE6:.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> %[[ADD1]])
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[MUL1:.*]] = mul i32 %[[VSCALE]], -8
+; CHECK-NEXT: %[[OR1:.*]] = or i32 %[[MUL1]], 1
+; CHECK-NEXT: %[[SEXT1:.*]] = sext i32 %[[OR1]] to i64
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i64 %[[SEXT1]]
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP3]] to <vscale x 8 x i64>*
+; CHECK-NEXT: store <vscale x 8 x i64> %[[REVERSE6]], <vscale x 8 x i64>* %[[CAST1]], align 8
+
+entry:
+ %cmp8 = icmp sgt i64 %N, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.09.in = phi i64 [ %i.09, %for.body ], [ %N, %entry ]
+ %i.09 = add nsw i64 %i.09.in, -1
+ %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.09
+ %0 = load i64, i64* %arrayidx, align 8
+ %add = add i64 %0, 1
+ %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.09
+ store i64 %add, i64* %arrayidx2, align 8
+ %cmp = icmp sgt i64 %i.09.in, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+
--- /dev/null
+; This is the loop in C++ being vectorized in this file with
+; shuffle reverse
+
+; #pragma clang loop vectorize_width(4, fixed)
+; for (long int i = N - 1; i >= 0; i--)
+; {
+; if (cond[i])
+; a[i] += 1;
+; }
+
+; This test checks that the mask is correctly created, reversed and used.
+
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @vector_reverse_mask_v4i1(double* %a, double* %cond, i64 %N) #0 {
+; CHECK-LABEL: vector.body:
+; CHECK: %[[REVERSE6:.*]] = shufflevector <4 x i1> %{{.*}}, <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull %{{.*}}, i32 8, <4 x i1> %[[REVERSE6]], <4 x double> poison)
+; CHECK-NEXT: %[[FADD:.*]] = fadd <4 x double> %[[WIDEMSKLOAD]]
+; CHECK: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %[[FADD]], <4 x double>* %{{.*}}, i32 8, <4 x i1> %[[REVERSE6]])
+
+entry:
+ %cmp7 = icmp sgt i64 %N, 0
+ br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup, %entry
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.08.in = phi i64 [ %i.08, %for.inc ], [ %N, %entry ]
+ %i.08 = add nsw i64 %i.08.in, -1
+ %arrayidx = getelementptr inbounds double, double* %cond, i64 %i.08
+ %0 = load double, double* %arrayidx, align 8
+ %tobool = fcmp une double %0, 0.000000e+00
+ br i1 %tobool, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+ %1 = load double, double* %arrayidx1, align 8
+ %add = fadd double %1, 1.000000e+00
+ store double %add, double* %arrayidx1, align 8
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %cmp = icmp sgt i64 %i.08.in, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = {"target-cpu"="generic" "target-features"="+neon,+sve"}
+
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
--- /dev/null
+; Test reverse access with fixed-width vectors.
+; This is the loop in C++ being vectorized in this file with
+; shuffle reverse
+; #pragma clang loop vectorize_width(8, fixed)
+; for (int i = N-1; i >= 0; --i)
+; a[i] = b[i] + 1.0;
+
+; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
+; CHECK-LABEL: vector_reverse_f64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
+; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]], align 8
+
+entry:
+ %cmp7 = icmp sgt i64 %N, 0
+ br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.08.in = phi i64 [ %i.08, %for.body ], [ %N, %entry ]
+ %i.08 = add nsw i64 %i.08.in, -1
+ %arrayidx = getelementptr inbounds double, double* %b, i64 %i.08
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %0, 1.000000e+00
+ %arrayidx1 = getelementptr inbounds double, double* %a, i64 %i.08
+ store double %add, double* %arrayidx1, align 8
+ %cmp = icmp sgt i64 %i.08.in, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
+; CHECK-LABEL: vector_reverse_i64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
+; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
+
+entry:
+ %cmp8 = icmp sgt i64 %N, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.09.in = phi i64 [ %i.09, %for.body ], [ %N, %entry ]
+ %i.09 = add nsw i64 %i.09.in, -1
+ %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.09
+ %0 = load i64, i64* %arrayidx, align 8
+ %add = add i64 %0, 1
+ %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.09
+ store i64 %add, i64* %arrayidx2, align 8
+ %cmp = icmp sgt i64 %i.09.in, 1
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 8}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}