[LV][RISCV] Disable vectorization of epilogue loops

author Philip Reames <preames@rivosinc.com>

Tue, 25 Oct 2022 21:01:33 +0000 (14:01 -0700)

committer Philip Reames <listmail@philipreames.com>

Tue, 25 Oct 2022 21:28:02 +0000 (14:28 -0700)
author Philip Reames <preames@rivosinc.com>
Tue, 25 Oct 2022 21:01:33 +0000 (14:01 -0700)
committer Philip Reames <listmail@philipreames.com>
Tue, 25 Oct 2022 21:28:02 +0000 (14:28 -0700)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h

index a25061f06c148d41313559446393c21e39af6c57..b09b1e360c10e6bd5bd7979bfb7588ae36e01527 100644 (file)
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1478,6 +1478,10 @@ public:
    bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
                                         ReductionFlags Flags) const;
  
+  /// Return true if the loop vectorizer should consider vectorizing an
+  /// otherwise scalar epilogue loop.
+  bool preferEpilogueVectorization() const;
+
    /// \returns True if the target wants to expand the given reduction intrinsic
    /// into a shuffle sequence.
    bool shouldExpandReduction(const IntrinsicInst *II) const;
@@ -1881,6 +1885,8 @@ public:
                                       ReductionFlags) const = 0;
    virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
                                                 ReductionFlags) const = 0;
+  virtual bool preferEpilogueVectorization() const = 0;
+
    virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
    virtual unsigned getGISelRematGlobalCost() const = 0;
    virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;
@@ -2526,6 +2532,10 @@ public:
                                         ReductionFlags Flags) const override {
      return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags);
    }
+  bool preferEpilogueVectorization() const override {
+    return Impl.preferEpilogueVectorization();
+  }
+
    bool shouldExpandReduction(const IntrinsicInst *II) const override {
      return Impl.shouldExpandReduction(II);
    }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

index 36e363ad9cb73d878d292bcf72c8e78f69dbe49a..8fcaa52a7bf518dc160a59ede69485f950ba7f94 100644 (file)
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -828,6 +828,10 @@ public:
      return false;
    }
  
+  bool preferEpilogueVectorization() const {
+    return true;
+  }
+
    bool shouldExpandReduction(const IntrinsicInst *II) const { return true; }
  
    unsigned getGISelRematGlobalCost() const { return 1; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp

index 0d3058512e05e7cbe3265a0a241f1678969ea343..2c33f2d59ca12479b4c0b6912fb031658ba0dfed 100644 (file)
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1145,6 +1145,10 @@ bool TargetTransformInfo::preferPredicatedReductionSelect(
    return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags);
  }
  
+bool TargetTransformInfo::preferEpilogueVectorization() const {
+  return TTIImpl->preferEpilogueVectorization();
+}
+
  TargetTransformInfo::VPLegalization
  TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
    return TTIImpl->getVPLegalizationStrategy(VPI);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

index 704d0dbdffaff55d35ceca7e5ed183dab63d6ea1..6f9c958ecef9bf86baafbf8724b7191427d91afd 100644 (file)
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -84,6 +84,13 @@ public:
  
    unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
  
+  bool preferEpilogueVectorization() const {
+    // Epilogue vectorization is usually unprofitable - tail folding or
+    // a smaller VF would have been better.  This is a blunt hammer - we
+    // should re-examine this once vectorization is better tuned.
+    return false;
+  }
+
    InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                          Align Alignment, unsigned AddressSpace,
                                          TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 108cf6f17f52ac5e7c9dff057732ce14bdccde93..7983165f09842de15dad3b149dd90c320b7dcbdf 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5506,6 +5506,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    // as register pressure, code size increase and cost of extra branches into
    // account. For now we apply a very crude heuristic and only consider loops
    // with vectorization factors larger than a certain value.
+
+  // Allow the target to opt out entirely.
+  if (!TTI.preferEpilogueVectorization())
+    return false;
+
    // We also consider epilogue vectorization unprofitable for targets that don't
    // consider interleaving beneficial (eg. MVE).
    if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll

index 5997b64c9f88a2edd352d466e0d45464ac9a9391..5c7f5be57292856b2326e977ed41b8348f9091fb 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -1015,10 +1015,8 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
  ; CHECK-NEXT:    ret void
  ;
  ; FIXED-LABEL: @predicated_sdiv_by_minus_one(
-; FIXED-NEXT:  iter.check:
-; FIXED-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; FIXED:       vector.main.loop.iter.check:
-; FIXED-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-NEXT:  entry:
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
  ; FIXED:       vector.ph:
  ; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
  ; FIXED:       vector.body:
@@ -1048,35 +1046,12 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
  ; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
  ; FIXED:       middle.block:
  ; FIXED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; FIXED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; FIXED:       vec.epilog.iter.check:
-; FIXED-NEXT:    br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; FIXED:       vec.epilog.ph:
-; FIXED-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; FIXED-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; FIXED:       vec.epilog.vector.body:
-; FIXED-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
-; FIXED-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
-; FIXED-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
-; FIXED-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i8>, ptr [[TMP17]], align 1
-; FIXED-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i8> [[WIDE_LOAD5]], <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
-; FIXED-NEXT:    [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; FIXED-NEXT:    [[TMP20:%.*]] = sdiv <8 x i8> [[WIDE_LOAD5]], [[TMP19]]
-; FIXED-NEXT:    [[TMP21:%.*]] = xor <8 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; FIXED-NEXT:    [[PREDPHI6:%.*]] = select <8 x i1> [[TMP18]], <8 x i8> [[TMP20]], <8 x i8> [[WIDE_LOAD5]]
-; FIXED-NEXT:    store <8 x i8> [[PREDPHI6]], ptr [[TMP17]], align 1
-; FIXED-NEXT:    [[INDEX_NEXT7]] = add nuw i64 [[OFFSET_IDX]], 8
-; FIXED-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 1024
-; FIXED-NEXT:    br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; FIXED:       vec.epilog.middle.block:
-; FIXED-NEXT:    [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
-; FIXED-NEXT:    br i1 [[CMP_N3]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
-; FIXED:       vec.epilog.scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; FIXED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; FIXED:       scalar.ph:
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
  ; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
  ; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
  ; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
  ; FIXED-NEXT:    [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
  ; FIXED-NEXT:    [[C:%.*]] = icmp ne i8 [[ELEM]], -128
@@ -1089,7 +1064,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
  ; FIXED-NEXT:    store i8 [[PHI]], ptr [[ARRAYIDX]], align 1
  ; FIXED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
  ; FIXED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
  ; FIXED:       for.end:
  ; FIXED-NEXT:    ret void
  ;
author	Philip Reames <preames@rivosinc.com>
	Tue, 25 Oct 2022 21:01:33 +0000 (14:01 -0700)
committer	Philip Reames <listmail@philipreames.com>
	Tue, 25 Oct 2022 21:28:02 +0000 (14:28 -0700)
llvm/include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h		patch \| blob \| history
llvm/lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll		patch \| blob \| history