[LoopVectorize] Take vscale into account when deciding to create epilogues

author David Sherwood <david.sherwood@arm.com>

Mon, 3 Apr 2023 16:14:09 +0000 (16:14 +0000)

committer David Sherwood <david.sherwood@arm.com>

Mon, 17 Apr 2023 10:49:40 +0000 (10:49 +0000)
author David Sherwood <david.sherwood@arm.com>
Mon, 3 Apr 2023 16:14:09 +0000 (16:14 +0000)
committer David Sherwood <david.sherwood@arm.com>
Mon, 17 Apr 2023 10:49:40 +0000 (10:49 +0000)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 6d2da31..0e29907 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5619,9 +5619,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    // consider interleaving beneficial (eg. MVE).
    if (TTI.getMaxInterleaveFactor(VF) <= 1)
      return false;
-  // FIXME: We should consider changing the threshold for scalable
-  // vectors to take VScaleForTuning into account.
-  if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
+
+  unsigned Multiplier = 1;
+  if (VF.isScalable())
+    Multiplier = getVScaleForTuning().value_or(1);
+  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
      return true;
    return false;
  }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll

index 4dd6522..963cdad 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
@@ -1,5 +1,7 @@
-; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s
-; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \
+; RUN:   -enable-epilogue-vectorization=false -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s
+; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \
+; RUN:   -enable-epilogue-vectorization=false -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s
  
  ; Tests for loops with large numbers of runtime checks. Check that loops are
  ; vectorized, if the loop trip counts are large and the impact of the runtime
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll

new file mode 100644 (file)

index 0000000..c65b10c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
+; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
+; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
+; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
+; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
+; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
+; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
+; RUN:   -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i64 %len) #0 {
+; CHECK-EPILOG:      vec.epilog.ph:
+; CHECK-EPILOG:      vec.epilog.vector.body:
+; CHECK-EPILOG:        load <vscale x 4 x i16>
+
+; CHECK-NO-EPILOG-NOT:  vec.epilog.ph:
+; CHECK-NO-EPILOG-NOT:  vec.epilog.vector.body:
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i16, ptr %p, i64 %indvars.iv
+  %0 = load i16, ptr %arrayidx
+  %add = add nuw nsw i16 %0, 2
+  %arrayidx3 = getelementptr inbounds i16, ptr %q, i64 %indvars.iv
+  store i16 %add, ptr %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %len
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:                                 ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll

index 6b7b49c..995950e 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll
@@ -1,13 +1,12 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-vectorize,dce -mtriple aarch64-linux-gnu -mattr=+sve \
-; RUN:   -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s
+; RUN: opt -passes=loop-vectorize,dce -prefer-predicate-over-epilogue=scalar-epilogue \
+; RUN:   -enable-epilogue-vectorization=false < %s -S | FileCheck %s
  
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
  target triple = "aarch64-unknown-linux-gnu"
  
  ; This should be vscale x 8 vectorized, maybe with some interleaving.
  
-define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef readonly %s, i32 noundef %n) {
+define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef readonly %s, i32 noundef %n) #0 {
  ; CHECK-LABEL: @fneg(
  ; CHECK-NEXT:  entry:
  ; CHECK-NEXT:    [[S2:%.*]] = ptrtoint ptr [[S:%.*]] to i64
@@ -100,3 +99,5 @@ for.body:                                         ; preds = %for.body.preheader,
    %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
  }
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll

index 34de689..f49b371 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -1,7 +1,7 @@
  ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
  ; REQUIRES: asserts
  ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -debug-only=loop-vectorize 2>%t < %s | FileCheck %s
+; RUN:   -enable-epilogue-vectorization=false -debug-only=loop-vectorize 2>%t < %s | FileCheck %s
  ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
  
  target triple = "aarch64-unknown-linux-gnu"
@@ -17,7 +17,6 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
  ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = zext i8 %0 to i32
  ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = zext i8 %0 to i32
  ; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = zext i8 %0 to i32
-
  ; CHECK-LABEL: define void @zext_i8_i16
  ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
  ; CHECK-NEXT:  entry:
@@ -101,7 +100,6 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
  ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction:   %conv = sext i8 %0 to i32
  ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction:   %conv = sext i8 %0 to i32
  ; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction:   %conv = sext i8 %0 to i32
-
  ; CHECK-LABEL: define void @sext_i8_i16
  ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
  ; CHECK-NEXT:  entry:
author	David Sherwood <david.sherwood@arm.com>
	Mon, 3 Apr 2023 16:14:09 +0000 (16:14 +0000)
committer	David Sherwood <david.sherwood@arm.com>
	Mon, 17 Apr 2023 10:49:40 +0000 (10:49 +0000)
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll	[new file with mode: 0644]	patch \| blob
llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll		patch \| blob \| history