From bc19b7c3cc169151ef30883c9b313114d7d5a9db Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 7 Jul 2022 08:40:26 -0700
Subject: [PATCH] [LV] Remove collectTriviallyDeadInstructions, already handled
 by VP DCE.

Now that removeDeadRecipes can remove most dead recipes across a whole
VPlan, there is no need to first collect some dead instructions.
Instead removeDeadRecipes can simply clean them up.

Depends D127580.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D128408
---
 .../Vectorize/LoopVectorizationPlanner.h           |  5 --
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 55 +---------------------
 .../LoopVectorize/X86/consecutive-ptr-uniforms.ll  |  6 +--
 .../Transforms/LoopVectorize/pointer-induction.ll  |  3 +-
 4 files changed, 4 insertions(+), 65 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 933ea9e..0845ec5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -332,11 +332,6 @@ public:
   bool requiresTooManyRuntimeChecks() const;
 
 protected:
-  /// Collect the instructions from the original loop that would be trivially
-  /// dead in the vectorized loop if generated.
-  void collectTriviallyDeadInstructions(
-      SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
   /// according to the information gathered by Legal when it checked if it is
   /// legal to vectorize the loop.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6926da5..83c3122 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7645,51 +7645,6 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
 }
 #endif
 
-void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
-    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
-  // We create new control-flow for the vectorized loop, so the original exit
-  // conditions will be dead after vectorization if it's only used by the
-  // terminator
-  SmallVector<BasicBlock*> ExitingBlocks;
-  OrigLoop->getExitingBlocks(ExitingBlocks);
-  for (auto *BB : ExitingBlocks) {
-    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
-    if (!Cmp || !Cmp->hasOneUse())
-      continue;
-
-    // TODO: we should introduce a getUniqueExitingBlocks on Loop
-    if (!DeadInstructions.insert(Cmp).second)
-      continue;
-
-    // The operands of the icmp is often a dead trunc, used by IndUpdate.
-    // TODO: can recurse through operands in general
-    for (Value *Op : Cmp->operands()) {
-      if (isa<TruncInst>(Op) && Op->hasOneUse())
-          DeadInstructions.insert(cast<Instruction>(Op));
-    }
-  }
-
-  // We create new "steps" for induction variable updates to which the original
-  // induction variables map. An original update instruction will be dead if
-  // all its users except the induction variable are dead.
-  auto *Latch = OrigLoop->getLoopLatch();
-  for (auto &Induction : Legal->getInductionVars()) {
-    PHINode *Ind = Induction.first;
-    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
-    // If the tail is to be folded by masking, the primary induction variable,
-    // if exists, isn't dead: it will be used for masking. Don't kill it.
-    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
-      continue;
-
-    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
-          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
-        }))
-      DeadInstructions.insert(IndUpdate);
-  }
-}
-
 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
 
 //===--------------------------------------------------------------------===//
@@ -8577,19 +8532,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
 
-  // Collect instructions from the original loop that will become trivially dead
-  // in the vectorized loop. We don't need to vectorize these instructions. For
-  // example, original induction update instructions can become dead because we
-  // separately emit induction "steps" when generating code for the new loop.
-  // Similarly, we create a new latch condition when setting up the structure
-  // of the new loop, so the old one can become dead.
-  SmallPtrSet<Instruction *, 4> DeadInstructions;
-  collectTriviallyDeadInstructions(DeadInstructions);
-
   // Add assume instructions we need to drop to DeadInstructions, to prevent
   // them from being added to the VPlan.
   // TODO: We only need to drop assumes in blocks that get flattend. If the
   // control flow is preserved, we should keep them.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
   auto &ConditionalAssumes = Legal->getConditionalAssumes();
   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 53a95d9..6088fbe 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -98,10 +98,9 @@ attributes #0 = { "target-cpu"="knl" }
 ; FORCE-NEXT:    store i32 [[TMP0]], i32* @b, align 1
 ; FORCE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
 ; FORCE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
-; FORCE-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
 ; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE]]
 ; FORCE:       pred.load.continue:
-; FORCE-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
+; FORCE-NEXT:    [[TMP9:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; FORCE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
 ; FORCE-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
 ; FORCE:       pred.load.if1:
@@ -109,10 +108,9 @@ attributes #0 = { "target-cpu"="knl" }
 ; FORCE-NEXT:    store i32 [[TMP1]], i32* @b, align 1
 ; FORCE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
 ; FORCE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
-; FORCE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
 ; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
 ; FORCE:       pred.load.continue2:
-; FORCE-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
+; FORCE-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP12]], %pred.load.if1 ]
 ; FORCE-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; FORCE-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; FORCE-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index f10ac3f..7859688 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -36,8 +36,7 @@ define void @a(i8* readnone %b) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i64 -1
-; CHECK-NEXT:    store i8 95, i8* [[TMP11]], align 1
+; CHECK-NEXT:    store i8 95, i8* [[TMP4]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP9]], i32 1
-- 
2.7.4