From 303b6d5e981947cff7e12626669c1fbeef046f18 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Thu, 17 Jun 2021 09:48:30 +0100
Subject: [PATCH] [LoopVectorize] Add support for scalable vectorization of
 invariant stores

Previously in setCostBasedWideningDecision if we encountered an
invariant store we just assumed that we could scalarize the store
and called getUniformMemOpCost to get the associated cost.
However, for scalable vectors this is not an option because it is
not currently possible to scalarize the store. At the moment we
crash in VPReplicateRecipe::execute when trying to do so.

Therefore, I have changed setCostBasedWideningDecision so that if
we are storing a scalable vector out to a uniform address and the
target supports scatter instructions, then we use those instead.

Tests have been added here:

  Transforms/LoopVectorize/AArch64/sve-inv-store.ll

Differential Revision: https://reviews.llvm.org/D104624
---
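Note for reviewers (this sits below the fold, so `git am` ignores it and
it is not part of the commit message): the new code path turns a store to
a loop-invariant address into a scatter whose pointer operand is that
address splatted across every lane. A minimal sketch of the resulting IR,
matching the shape of the CHECK lines in the new test; the function and
value names (@uniform_store_sketch, %data, %mask) are invented purely for
illustration:

  define void @uniform_store_sketch(i16* %dst, <vscale x 4 x i16> %data,
                                    <vscale x 4 x i1> %mask) {
    ; Splat the invariant address %dst into all lanes of a pointer vector.
    %ptr.ins = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
    %ptr.splat = shufflevector <vscale x 4 x i16*> %ptr.ins,
                               <vscale x 4 x i16*> poison,
                               <vscale x 4 x i32> zeroinitializer
    ; Every active lane stores to the same address.
    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(
        <vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptr.splat, i32 2,
        <vscale x 4 x i1> %mask)
    ret void
  }

  declare void @llvm.masked.scatter.nxv4i16.nxv4p0i16(
      <vscale x 4 x i16>, <vscale x 4 x i16*>, i32 immarg,
      <vscale x 4 x i1>)

This is correct for a uniform store because the LangRef guarantees that
scatters to overlapping addresses are ordered from least-significant to
most-significant lane, so memory ends up holding the value from the last
active lane, exactly as the scalar loop's final store would leave it.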
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 13 +++-
 .../LoopVectorize/AArch64/sve-inv-store.ll         | 70 ++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3988d2f..f22e737 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7346,8 +7346,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        InstructionCost Cost = getUniformMemOpCost(&I, VF);
-        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        InstructionCost Cost;
+        if (isa<StoreInst>(&I) && VF.isScalable() &&
+            isLegalGatherOrScatter(&I)) {
+          Cost = getGatherScatterCost(&I, VF);
+          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
+        } else {
+          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
+                 "Cannot yet scalarize uniform stores");
+          Cost = getUniformMemOpCost(&I, VF);
+          setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        }
         continue;
       }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
new file mode 100644
index 0000000..0e02af6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -0,0 +1,70 @@
+; RUN: opt -loop-vectorize -scalable-vectorization=on -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
+; CHECK-LABEL: @inv_store_i16(
+; CHECK: vector.ph:
+; CHECK:      %[[TMP1:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
+; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i16*> %[[TMP1]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK:      %[[VECLOAD:.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* %{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %[[VECLOAD]], <vscale x 4 x i16*> %[[SPLAT_PTRS]], i32 2
+entry:
+  br label %for.body14
+
+for.body14:                                       ; preds = %for.body14, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body14 ]
+  %arrayidx = getelementptr inbounds i16, i16* %src, i64 %indvars.iv
+  %ld = load i16, i16* %arrayidx
+  store i16 %ld, i16* %dst, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %for.inc24, label %for.body14, !llvm.loop !0
+
+for.inc24:                                        ; preds = %for.body14, %for.body
+  ret void
+}
+
+
+define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i64 %N) #0 {
+; CHECK-LABEL: @cond_inv_store_i32(
+; CHECK: vector.ph:
+; CHECK:      %[[TMP1:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %dst, i32 0
+; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i32*> %[[TMP1]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK:      %[[VECLOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %{{.*}}, align 4
+; CHECK-NEXT: %[[MASK:.*]] = icmp sgt <vscale x 4 x i32> %[[VECLOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VECLOAD]], <vscale x 4 x i32*> %[[SPLAT_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.09 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i32 %0, i32* %dst, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.09, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+attributes #0 = { "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+
-- 
2.7.4