From 269bc684e7a0c3f727ea5e74270112585acaf55d Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 25 Oct 2022 14:01:33 -0700
Subject: [PATCH] [LV][RISCV] Disable vectorization of epilogue loops

Epilogue loop vectorization is a feature in the vectorize intended to avoid running fully scalar code when the vector length of the main loop turns out to be either longer than the trip count of the actual loop, or with a huge remainder.

In practice, this feature appears to not have been well tuned. I honestly don't think it should be on by default at all, but it definitely shouldn't be on for RISCV. Note that other targets have also disabled it, but they've done so via disabling interleaving - which is, well, completely unrelated - and we don't want to do that for RISCV.

In the near term, many examples I'm seeing have terrible codegen for epilogue vectorization. We are greatly increasing code size for little value at reasonable VLEN values for small types. In the long term, the cases that epilogue vectorization are intended to handle are likely better handled via tail folding on RISCV.

As an aside, I also don't really trust the correctness of epilogue vectorization. The code structure is such that otherwise straight forward changes sometimes break only epilogue vectorization. The reuse of an existing vplan without careful validation opens significant room for nasty bugs. Given how rarely the code is exercised, that is not a good combination.

As such, this patch introduces a TTI hook, and completely disables epilogue vectorization on RISCV.

Differential Revision: https://reviews.llvm.org/D136695
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h   | 10 ++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h        |  4 +++
 llvm/lib/Analysis/TargetTransformInfo.cpp          |  4 +++
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h   |  7 ++++
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    |  5 +++
 llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll | 39 ++++------------------
 6 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a25061f..b09b1e3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1478,6 +1478,10 @@ public:
   bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
                                        ReductionFlags Flags) const;
 
+  /// Return true if the loop vectorizer should consider vectorizing an
+  /// otherwise scalar epilogue loop.
+  bool preferEpilogueVectorization() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   bool shouldExpandReduction(const IntrinsicInst *II) const;
@@ -1881,6 +1885,8 @@ public:
                                      ReductionFlags) const = 0;
   virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
                                                ReductionFlags) const = 0;
+  virtual bool preferEpilogueVectorization() const = 0;
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
   virtual unsigned getGISelRematGlobalCost() const = 0;
   virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;
@@ -2526,6 +2532,10 @@ public:
                                        ReductionFlags Flags) const override {
     return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags);
   }
+  bool preferEpilogueVectorization() const override {
+    return Impl.preferEpilogueVectorization();
+  }
+
   bool shouldExpandReduction(const IntrinsicInst *II) const override {
     return Impl.shouldExpandReduction(II);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 36e363a..8fcaa52 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -828,6 +828,10 @@ public:
     return false;
   }
 
+  bool preferEpilogueVectorization() const {
+    return true;
+  }
+
   bool shouldExpandReduction(const IntrinsicInst *II) const { return true; }
 
   unsigned getGISelRematGlobalCost() const { return 1; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0d30585..2c33f2d 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1145,6 +1145,10 @@ bool TargetTransformInfo::preferPredicatedReductionSelect(
   return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags);
 }
 
+bool TargetTransformInfo::preferEpilogueVectorization() const {
+  return TTIImpl->preferEpilogueVectorization();
+}
+
 TargetTransformInfo::VPLegalization
 TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
   return TTIImpl->getVPLegalizationStrategy(VPI);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 704d0db..6f9c958 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -84,6 +84,13 @@ public:
 
   unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
 
+  bool preferEpilogueVectorization() const {
+    // Epilogue vectorization is usually unprofitable - tail folding or
+    // a smaller VF would have been better.  This a blunt hammer - we
+    // should re-examine this once vectorization is better tuned.
+    return false;
+  }
+
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
                                         TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 108cf6f..7983165 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5506,6 +5506,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
   // as register pressure, code size increase and cost of extra branches into
   // account. For now we apply a very crude heuristic and only consider loops
   // with vectorization factors larger than a certain value.
+
+  // Allow the target to opt out entirely.
+  if (!TTI.preferEpilogueVectorization())
+    return false;
+
   // We also consider epilogue vectorization unprofitable for targets that don't
   // consider interleaving beneficial (eg. MVE).
   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index 5997b64..5c7f5be 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -1015,10 +1015,8 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @predicated_sdiv_by_minus_one(
-; FIXED-NEXT:  iter.check:
-; FIXED-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; FIXED:       vector.main.loop.iter.check:
-; FIXED-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; FIXED:       vector.ph:
 ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; FIXED:       vector.body:
@@ -1048,35 +1046,12 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; FIXED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; FIXED:       vec.epilog.iter.check:
-; FIXED-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; FIXED:       vec.epilog.ph:
-; FIXED-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; FIXED-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; FIXED:       vec.epilog.vector.body:
-; FIXED-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
-; FIXED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
-; FIXED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; FIXED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i8>, ptr [[TMP17]], align 1
-; FIXED-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i8> [[WIDE_LOAD5]], <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
-; FIXED-NEXT:    [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; FIXED-NEXT:    [[TMP20:%.*]] = sdiv <8 x i8> [[WIDE_LOAD5]], [[TMP19]]
-; FIXED-NEXT:    [[TMP21:%.*]] = xor <8 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT:    [[PREDPHI6:%.*]] = select <8 x i1> [[TMP18]], <8 x i8> [[TMP20]], <8 x i8> [[WIDE_LOAD5]]
-; FIXED-NEXT:    store <8 x i8> [[PREDPHI6]], ptr [[TMP17]], align 1
-; FIXED-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[OFFSET_IDX]], 8
-; FIXED-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 1024
-; FIXED-NEXT:    br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; FIXED:       vec.epilog.middle.block:
-; FIXED-NEXT:    [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
-; FIXED-NEXT:    br i1 [[CMP_N3]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
-; FIXED:       vec.epilog.scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; FIXED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
 ; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
 ; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
 ; FIXED-NEXT:    [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
 ; FIXED-NEXT:    [[C:%.*]] = icmp ne i8 [[ELEM]], -128
@@ -1089,7 +1064,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; FIXED-NEXT:    store i8 [[PHI]], ptr [[ARRAYIDX]], align 1
 ; FIXED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; FIXED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; FIXED:       for.end:
 ; FIXED-NEXT:    ret void
 ;
-- 
2.7.4