[LV] Don't optimize exit cond during epilogue vectorization.

author Florian Hahn <flo@fhahn.com>

Fri, 1 Jul 2022 12:48:38 +0000 (13:48 +0100)

committer Florian Hahn <flo@fhahn.com>

Fri, 1 Jul 2022 12:48:38 +0000 (13:48 +0100)
author Florian Hahn <flo@fhahn.com>
Fri, 1 Jul 2022 12:48:38 +0000 (13:48 +0100)
committer Florian Hahn <flo@fhahn.com>
Fri, 1 Jul 2022 12:48:38 +0000 (13:48 +0100)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

index 342727d..0cb2032 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -303,8 +303,12 @@ public:
  
    /// Generate the IR code for the body of the vectorized loop according to the
    /// best selected \p VF, \p UF and VPlan \p BestPlan.
+  /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
+  /// vectorization re-using plans for both the main and epilogue vector loops.
+  /// It should be removed once the re-use issue has been fixed.
    void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
-                   InnerLoopVectorizer &LB, DominatorTree *DT);
+                   InnerLoopVectorizer &LB, DominatorTree *DT,
+                   bool IsEpilogueVectorization);
  
  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void printPlans(raw_ostream &O);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

index 558ebf7..a99db18 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7520,7 +7520,8 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) {
  void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                             VPlan &BestVPlan,
                                             InnerLoopVectorizer &ILV,
-                                           DominatorTree *DT) {
+                                           DominatorTree *DT,
+                                           bool IsEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                      << '\n');
  
@@ -7564,7 +7565,8 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
    // 2. Copy and widen instructions from the old loop into the new loop.
    BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
                               ILV.getOrCreateVectorTripCount(nullptr),
-                             CanonicalIVStartValue, State);
+                             CanonicalIVStartValue, State,
+                             IsEpilogueVectorization);
  
    BestVPlan.execute(&State);
  
@@ -10155,7 +10157,7 @@ static bool processLoopInVPlanNativePath(
                             &CM, BFI, PSI, Checks);
      LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                        << L->getHeader()->getParent()->getName() << "\"\n");
-    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
+    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
    }
  
    // Mark the loop as already vectorized to avoid vectorizing again.
@@ -10499,7 +10501,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                   &CM, BFI, PSI, Checks);
  
        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
+      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
  
        ORE->emit([&]() {
          return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10524,7 +10526,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  
          VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
          LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
-                        DT);
+                        DT, true);
          ++LoopsVectorized;
  
          // Second pass vectorizes the epilogue and adjusts the control flow
@@ -10553,7 +10555,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
          }
  
          LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
-                        DT);
+                        DT, true);
          ++LoopsEpilogueVectorized;
  
          if (!MainILV.areSafetyChecksAdded())
@@ -10563,7 +10565,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                                 &LVL, &CM, BFI, PSI, Checks);
  
          VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
-        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
+        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
          ++LoopsVectorized;
  
          // Add metadata to disable runtime unrolling a scalar loop when there
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp

index 6e4c8f0..1f1a885 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -546,13 +546,14 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
  
  void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
                               Value *CanonicalIVStartValue,
-                             VPTransformState &State) {
+                             VPTransformState &State,
+                             bool IsEpilogueVectorization) {
  
    VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
    auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
    // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
    // preparing to execute the plan for the main vector loop.
-  if (!CanonicalIVStartValue && Term &&
+  if (!IsEpilogueVectorization && Term &&
        Term->getOpcode() == VPInstruction::BranchOnCount &&
        isa<ConstantInt>(TripCountV)) {
      ConstantInt *C = cast<ConstantInt>(TripCountV);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h

index 16190a4..3497c4b 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2511,7 +2511,8 @@ public:
  
    /// Prepare the plan for execution, setting up the required live-in values.
    void prepareToExecute(Value *TripCount, Value *VectorTripCount,
-                        Value *CanonicalIVStartValue, VPTransformState &State);
+                        Value *CanonicalIVStartValue, VPTransformState &State,
+                        bool IsEpilogueVectorization);
  
    /// Generate the IR code for this VPlan.
    void execute(struct VPTransformState *State);
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll

index 7d469e7..52440ec 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -9,18 +9,26 @@
  define zeroext i8 @sum() {
  ; CHECK-LABEL: @sum(
  ; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 0
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
  ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
  ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
  ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 64
  ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>*
  ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 16
-; CHECK-NEXT:    [[TMP4:%.*]] = add <64 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = add <64 x i8> [[WIDE_LOAD2]], zeroinitializer
-; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw i64 0, 128
+; CHECK-NEXT:    [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
  ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
-; CHECK-NEXT:    ret i8 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
+; CHECK-NEXT:    ret i8 [[TMP7]]
  ;
  entry:
    br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll

index 74b13d3..7340167 100644 (file)
--- a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
@@ -26,7 +26,8 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
  ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
  ; CHECK-NEXT:    store <32 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
  ; CHECK:       middle.block:
  ; CHECK-NEXT:    br label [[VEC_EPILOG_ITER_CHECK:%.*]]
  ; CHECK:       vec.epilog.iter.check:
@@ -36,16 +37,17 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
  ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
  ; CHECK:       vec.epilog.vector.body:
  ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP6]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP8]], align 1
  ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC2]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
  ; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 24
+; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
  ; CHECK:       vec.epilog.middle.block:
  ; CHECK-NEXT:    br label [[VEC_EPILOG_SCALAR_PH]]
  ; CHECK:       vec.epilog.scalar.ph:
author	Florian Hahn <flo@fhahn.com>
	Fri, 1 Jul 2022 12:48:38 +0000 (13:48 +0100)
committer	Florian Hahn <flo@fhahn.com>
	Fri, 1 Jul 2022 12:48:38 +0000 (13:48 +0100)
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h		patch \| blob \| history
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/VPlan.cpp		patch \| blob \| history
llvm/lib/Transforms/Vectorize/VPlan.h		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/X86/pr42674.ll		patch \| blob \| history
llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll		patch \| blob \| history