From 303b6d5e981947cff7e12626669c1fbeef046f18 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Thu, 17 Jun 2021 09:48:30 +0100
Subject: [PATCH] [LoopVectorize] Add support for scalable vectorization of
 invariant stores

Previously in setCostBasedWideningDecision if we encountered an
invariant store we just assumed that we could scalarize the store
and called getUniformMemOpCost to get the associated cost.
However, for scalable vectors this is not an option because it is
not currently possible to scalarize the store. At the moment we
crash in VPReplicateRecipe::execute when trying to do so.

Therefore, I have changed setCostBasedWideningDecision so that if
we are storing a scalable vector out to a uniform address and the
target supports scatter instructions, then we use those instead.

Tests have been added here:

  Transforms/LoopVectorize/AArch64/sve-inv-store.ll

Differential Revision: https://reviews.llvm.org/D104624
---
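Note for reviewers (this sits below the fold, so `git am` ignores it and
it is not part of the commit message): the new code path turns a store to
a loop-invariant address into a scatter whose pointer operand is that
address splatted across every lane. A minimal sketch of the resulting IR,
matching the shape of the CHECK lines in the new test; the function and
value names (@uniform_store_sketch, %data, %mask) are invented purely for
illustration:

  define void @uniform_store_sketch(i16* %dst, <vscale x 4 x i16> %data,
                                    <vscale x 4 x i1> %mask) {
    ; Splat the invariant address %dst into all lanes of a pointer vector.
    %ptr.ins = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
    %ptr.splat = shufflevector <vscale x 4 x i16*> %ptr.ins,
                               <vscale x 4 x i16*> poison,
                               <vscale x 4 x i32> zeroinitializer
    ; Every active lane stores to the same address.
    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(
        <vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptr.splat, i32 2,
        <vscale x 4 x i1> %mask)
    ret void
  }

  declare void @llvm.masked.scatter.nxv4i16.nxv4p0i16(
      <vscale x 4 x i16>, <vscale x 4 x i16*>, i32 immarg,
      <vscale x 4 x i1>)

This is correct for a uniform store because the LangRef guarantees that
scatters to overlapping addresses are ordered from least-significant to
most-significant lane, so memory ends up holding the value from the last
active lane, exactly as the scalar loop's final store would leave it.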
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 13 +++-
 .../LoopVectorize/AArch64/sve-inv-store.ll         | 70 ++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3988d2f..f22e737 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7346,8 +7346,17 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        InstructionCost Cost = getUniformMemOpCost(&I, VF);
-        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        InstructionCost Cost;
+        if (isa<StoreInst>(&I) && VF.isScalable() &&
+            isLegalGatherOrScatter(&I)) {
+          Cost = getGatherScatterCost(&I, VF);
+          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
+        } else {
+          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
+                 "Cannot yet scalarize uniform stores");
+          Cost = getUniformMemOpCost(&I, VF);
+          setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        }
         continue;
       }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
new file mode 100644
index 0000000..0e02af6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -0,0 +1,70 @@
+; RUN: opt -loop-vectorize -scalable-vectorization=on -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
+; CHECK-LABEL: @inv_store_i16(
+; CHECK: vector.ph:
+; CHECK:      %[[TMP1:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
+; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i16*> %[[TMP1]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK:      %[[VECLOAD:.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* %{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %[[VECLOAD]], <vscale x 4 x i16*> %[[SPLAT_PTRS]], i32 2
+entry:
+  br label %for.body14
+
+for.body14:                                       ; preds = %for.body14, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body14 ]
+  %arrayidx = getelementptr inbounds i16, i16* %src, i64 %indvars.iv
+  %ld = load i16, i16* %arrayidx
+  store i16 %ld, i16* %dst, align 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %for.inc24, label %for.body14, !llvm.loop !0
+
+for.inc24:                                        ; preds = %for.body14, %for.body
+  ret void
+}
+
+
+define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i64 %N) #0 {
+; CHECK-LABEL: @cond_inv_store_i32(
+; CHECK: vector.ph:
+; CHECK:      %[[TMP1:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %dst, i32 0
+; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i32*> %[[TMP1]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK:      %[[VECLOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %{{.*}}, align 4
+; CHECK-NEXT: %[[MASK:.*]] = icmp sgt <vscale x 4 x i32> %[[VECLOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 0, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VECLOAD]], <vscale x 4 x i32*> %[[SPLAT_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.09 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %src, i64 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i32 %0, i32* %dst, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.09, 1
+  %exitcond.not = icmp eq i64 %inc, %N
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.inc, %entry
+  ret void
+}
+
+attributes #0 = { "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+
-- 
2.7.4