From 300870a95c22fde840862cf57d82adba3e5bd633 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 22 Sep 2021 09:30:53 +0100
Subject: [PATCH] [VectorCombine] Switch to using a worklist.

This patch updates VectorCombine to use a worklist to allow iterative
simplifications where a combine enables other combines. Suggested in
D100302. The main use case at the moment is foldSingleElementStore and
scalarizeLoadExtract working together to improve scalarization.

Note that we now also do not run SimplifyInstructionsInBlock on the
whole function if there have been changes. This means we fail to
remove/simplify instructions not related to any of the vector combines.
IMO this is fine, as simplifying the whole function seems more like a
workaround for not tracking the changed instructions.

Compile-time impact looks neutral:
NewPM-O3: +0.02%
NewPM-ReleaseThinLTO: -0.00%
NewPM-ReleaseLTO-g: -0.02%

http://llvm-compile-time-tracker.com/compare.php?from=52832cd917af00e2b9c6a9d1476ba79754dcabff&to=e66520a4637290550a945d528e3e59573485dd40&stat=instructions

Reviewed By: spatel, lebedev.ri

Differential Revision: https://reviews.llvm.org/D110171
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp    | 72 +++++++++++++++-------
 .../PhaseOrdering/AArch64/matrix-extract-insert.ll | 11 ++--
 .../load-extract-insert-store-scalarization.ll     | 33 +++++-----
 .../X86/extract-binop-inseltpoison.ll              |  4 +-
 .../Transforms/VectorCombine/X86/extract-binop.ll  |  4 +-
 .../Transforms/VectorCombine/load-insert-store.ll  |  4 +-
 6 files changed, 82 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 8d01da9..dcf7b68 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -31,10 +31,12 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
 
+#define DEBUG_TYPE "vector-combine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-#define DEBUG_TYPE "vector-combine"
 STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
@@ -73,6 +75,7 @@ private:
   const DominatorTree &DT;
   AAResults &AA;
   AssumptionCache &AC;
+  InstructionWorklist Worklist;
 
   bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
@@ -92,14 +95,26 @@ private:
   bool foldExtractedCmps(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
+
+  void replaceValue(Value &Old, Value &New) {
+    Old.replaceAllUsesWith(&New);
+    New.takeName(&Old);
+    if (auto *NewI = dyn_cast<Instruction>(&New)) {
+      Worklist.pushUsersToWorkList(*NewI);
+      Worklist.pushValue(NewI);
+    }
+    Worklist.pushValue(&Old);
+  }
+
+  void eraseInstruction(Instruction &I) {
+    for (Value *Op : I.operands())
+      Worklist.pushValue(Op);
+    Worklist.remove(&I);
+    I.eraseFromParent();
+  }
 };
 } // namespace
 
-static void replaceValue(Value &Old, Value &New) {
-  Old.replaceAllUsesWith(&New);
-  New.takeName(&Old);
-}
-
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
   // TODO: Handle non-zero insert index.
@@ -501,6 +516,8 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
   else
     foldExtExtBinop(Ext0, Ext1, I);
 
+  Worklist.push(Ext0);
+  Worklist.push(Ext1);
   return true;
 }
 
@@ -928,8 +945,7 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
         DL);
     NSI->setAlignment(ScalarOpAlignment);
     replaceValue(I, *NSI);
-    // Need erasing the store manually.
-    I.eraseFromParent();
+    eraseInstruction(I);
     return true;
   }
 
@@ -939,11 +955,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
 /// Try to scalarize vector loads feeding extractelement instructions.
 bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   Value *Ptr;
-  Value *Idx;
-  if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx))))
+  if (!match(&I, m_Load(m_Value(Ptr))))
     return false;
 
-  auto *LI = cast<LoadInst>(I.getOperand(0));
+  auto *LI = cast<LoadInst>(&I);
   const DataLayout &DL = I.getModule()->getDataLayout();
   if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
     return false;
@@ -1039,6 +1054,16 @@ bool VectorCombine::run() {
     return false;
 
   bool MadeChange = false;
+  auto FoldInst = [this, &MadeChange](Instruction &I) {
+    Builder.SetInsertPoint(&I);
+    MadeChange |= vectorizeLoadInsert(I);
+    MadeChange |= foldExtractExtract(I);
+    MadeChange |= foldBitcastShuf(I);
+    MadeChange |= scalarizeBinopOrCmp(I);
+    MadeChange |= foldExtractedCmps(I);
+    MadeChange |= scalarizeLoadExtract(I);
+    MadeChange |= foldSingleElementStore(I);
+  };
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
@@ -1047,21 +1072,22 @@ bool VectorCombine::run() {
     for (Instruction &I : make_early_inc_range(BB)) {
       if (isa<DbgInfoIntrinsic>(I))
         continue;
-      Builder.SetInsertPoint(&I);
-      MadeChange |= vectorizeLoadInsert(I);
-      MadeChange |= foldExtractExtract(I);
-      MadeChange |= foldBitcastShuf(I);
-      MadeChange |= scalarizeBinopOrCmp(I);
-      MadeChange |= foldExtractedCmps(I);
-      MadeChange |= scalarizeLoadExtract(I);
-      MadeChange |= foldSingleElementStore(I);
+      FoldInst(I);
     }
   }
 
-  // We're done with transforms, so remove dead instructions.
-  if (MadeChange)
-    for (BasicBlock &BB : F)
-      SimplifyInstructionsInBlock(&BB);
+  while (!Worklist.isEmpty()) {
+    Instruction *I = Worklist.removeOne();
+    if (!I)
+      continue;
+
+    if (isInstructionTriviallyDead(I)) {
+      eraseInstruction(*I);
+      continue;
+    }
+
+    FoldInst(*I);
+  }
 
   return MadeChange;
 }
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 1089f54..8ba940f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -20,13 +20,14 @@ define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, [225 x double]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP6]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
-; CHECK-NEXT:    [[TMP8:%.*]] = load <225 x double>, <225 x double>* [[TMP7]], align 8
-; CHECK-NEXT:    [[MATRIXEXT4:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
-; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP9]], align 8
+; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
index f06c3f3..a1586c6 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
@@ -6,13 +6,14 @@ target triple = "arm64-apple-darwin"
 define void @load_extract_insert_store_const_idx(<225 x double>* %A) {
 ; CHECK-LABEL: @load_extract_insert_store_const_idx(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 0
+; CHECK-NEXT:    [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT:    [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 1
+; CHECK-NEXT:    [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -33,13 +34,14 @@ define void @load_extract_insert_store_var_idx_assume_valid(i64 %idx.1, i64 %idx
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_1]])
 ; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ult i64 [[IDX_2:%.*]], 225
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_2]])
-; CHECK-NEXT:    [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
+; CHECK-NEXT:    [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT:    [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
+; CHECK-NEXT:    [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -69,13 +71,14 @@ define void @load_extract_insert_store_var_idx_assume_valid_in_dominating_block(
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_2]])
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[LOOP:%.*]], label [[EXIT:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
+; CHECK-NEXT:    [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT:    [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
+; CHECK-NEXT:    [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT:    [[C_2:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP]], label [[EXIT]]
 ; CHECK:       exit:
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 472c78a..0e8d477 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -453,7 +453,9 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
 ; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32>
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index abf5ab5..a08b2c0 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -453,7 +453,9 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
 ; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32>
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 636ab23..c56c13e 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -465,7 +465,9 @@ define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store_ptr_strip(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3
+; CHECK-NEXT:    [[ADDR1:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[ADDR0]], i64 0
+; CHECK-NEXT:    [[ADDR2:%.*]] = bitcast <2 x i64>* [[ADDR1]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[ADDR2]], i32 0, i32 3
 ; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
-- 
2.7.4
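
Appendix (illustrative sketch, not part of the patch): the core of this
change is a deduplicating worklist driven to a fixpoint. Each fold pushes
the values it creates (and the users of replaced values), so a combine
that enables another combine is revisited without rescanning the whole
function. Below is a standalone C++ analogue of that pattern; the
SimpleWorklist class and the integer "instructions" are hypothetical
stand-ins, not LLVM's InstructionWorklist API.

#include <deque>
#include <iostream>
#include <unordered_set>

// A minimal deduplicating worklist: a value is pending at most once.
class SimpleWorklist {
  std::deque<int> Queue;
  std::unordered_set<int> InQueue;

public:
  void push(int V) {
    if (InQueue.insert(V).second) // skip values already pending
      Queue.push_back(V);
  }
  bool empty() const { return Queue.empty(); }
  int removeOne() {
    int V = Queue.front();
    Queue.pop_front();
    InQueue.erase(V);
    return V;
  }
};

int main() {
  SimpleWorklist WL;
  // Seed the worklist, like the initial pass over every instruction.
  WL.push(24);
  WL.push(7);
  // Drain to a fixpoint. The "fold" here halves even values and
  // re-queues the result, the way replaceValue() re-queues the new
  // instruction and its users so follow-on folds get a chance to run.
  while (!WL.empty()) {
    int V = WL.removeOne();
    std::cout << "visiting " << V << '\n';
    if (V % 2 == 0)
      WL.push(V / 2);
  }
  return 0;
}

The InQueue set keeps a pending value from being queued twice, matching
InstructionWorklist's deduplication, so the total work is bounded by the
number of folds that actually fire rather than by repeated full scans.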