From 300870a95c22fde840862cf57d82adba3e5bd633 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 22 Sep 2021 09:30:53 +0100
Subject: [PATCH] [VectorCombine] Switch to using a worklist.

This patch updates VectorCombine to use a worklist to allow iterative
simplifications where a combine enables other combines. Suggested in
D100302. The main use case at the moment is foldSingleElementStore and
scalarizeLoadExtract working together to improve scalarization.

Note that we now also do not run SimplifyInstructionsInBlock on the
whole function if there have been changes. This means we fail to
remove/simplify instructions not related to any of the vector combines.
IMO this is fine, as simplifying the whole function seems more like a
workaround for not tracking the changed instructions.

Compile-time impact looks neutral:
NewPM-O3: +0.02%
NewPM-ReleaseThinLTO: -0.00%
NewPM-ReleaseLTO-g: -0.02%

http://llvm-compile-time-tracker.com/compare.php?from=52832cd917af00e2b9c6a9d1476ba79754dcabff&to=e66520a4637290550a945d528e3e59573485dd40&stat=instructions

Reviewed By: spatel, lebedev.ri

Differential Revision: https://reviews.llvm.org/D110171
---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp    | 72 +++++++++++++++-------
 .../PhaseOrdering/AArch64/matrix-extract-insert.ll | 11 ++--
 .../load-extract-insert-store-scalarization.ll     | 33 +++++-----
 .../X86/extract-binop-inseltpoison.ll              |  4 +-
 .../Transforms/VectorCombine/X86/extract-binop.ll  |  4 +-
 .../Transforms/VectorCombine/load-insert-store.ll  |  4 +-
 6 files changed, 82 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 8d01da9..dcf7b68 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -31,10 +31,12 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Vectorize.h"
 
+#define DEBUG_TYPE "vector-combine"
+#include "llvm/Transforms/Utils/InstructionWorklist.h"
+
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-#define DEBUG_TYPE "vector-combine"
 STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
@@ -73,6 +75,7 @@ private:
   const DominatorTree &DT;
   AAResults &AA;
   AssumptionCache &AC;
+  InstructionWorklist Worklist;
 
   bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
@@ -92,14 +95,26 @@ private:
   bool foldExtractedCmps(Instruction &I);
   bool foldSingleElementStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
+
+  void replaceValue(Value &Old, Value &New) {
+    Old.replaceAllUsesWith(&New);
+    New.takeName(&Old);
+    if (auto *NewI = dyn_cast<Instruction>(&New)) {
+      Worklist.pushUsersToWorkList(*NewI);
+      Worklist.pushValue(NewI);
+    }
+    Worklist.pushValue(&Old);
+  }
+
+  void eraseInstruction(Instruction &I) {
+    for (Value *Op : I.operands())
+      Worklist.pushValue(Op);
+    Worklist.remove(&I);
+    I.eraseFromParent();
+  }
 };
 } // namespace
 
-static void replaceValue(Value &Old, Value &New) {
-  Old.replaceAllUsesWith(&New);
-  New.takeName(&Old);
-}
-
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
   // TODO: Handle non-zero insert index.
@@ -501,6 +516,8 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
   else
     foldExtExtBinop(Ext0, Ext1, I);
 
+  Worklist.push(Ext0);
+  Worklist.push(Ext1);
   return true;
 }
 
@@ -928,8 +945,7 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
         DL);
     NSI->setAlignment(ScalarOpAlignment);
     replaceValue(I, *NSI);
-    // Need erasing the store manually.
-    I.eraseFromParent();
+    eraseInstruction(I);
     return true;
   }
 
@@ -939,11 +955,10 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
 /// Try to scalarize vector loads feeding extractelement instructions.
 bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
   Value *Ptr;
-  Value *Idx;
-  if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx))))
+  if (!match(&I, m_Load(m_Value(Ptr))))
     return false;
 
-  auto *LI = cast<LoadInst>(I.getOperand(0));
+  auto *LI = cast<LoadInst>(&I);
   const DataLayout &DL = I.getModule()->getDataLayout();
   if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
     return false;
@@ -1039,6 +1054,16 @@ bool VectorCombine::run() {
     return false;
 
   bool MadeChange = false;
+  auto FoldInst = [this, &MadeChange](Instruction &I) {
+    Builder.SetInsertPoint(&I);
+    MadeChange |= vectorizeLoadInsert(I);
+    MadeChange |= foldExtractExtract(I);
+    MadeChange |= foldBitcastShuf(I);
+    MadeChange |= scalarizeBinopOrCmp(I);
+    MadeChange |= foldExtractedCmps(I);
+    MadeChange |= scalarizeLoadExtract(I);
+    MadeChange |= foldSingleElementStore(I);
+  };
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
     if (!DT.isReachableFromEntry(&BB))
@@ -1047,21 +1072,22 @@ bool VectorCombine::run() {
     for (Instruction &I : make_early_inc_range(BB)) {
       if (isa<DbgInfoIntrinsic>(I))
         continue;
-      Builder.SetInsertPoint(&I);
-      MadeChange |= vectorizeLoadInsert(I);
-      MadeChange |= foldExtractExtract(I);
-      MadeChange |= foldBitcastShuf(I);
-      MadeChange |= scalarizeBinopOrCmp(I);
-      MadeChange |= foldExtractedCmps(I);
-      MadeChange |= scalarizeLoadExtract(I);
-      MadeChange |= foldSingleElementStore(I);
+      FoldInst(I);
     }
   }
 
-  // We're done with transforms, so remove dead instructions.
-  if (MadeChange)
-    for (BasicBlock &BB : F)
-      SimplifyInstructionsInBlock(&BB);
+  while (!Worklist.isEmpty()) {
+    Instruction *I = Worklist.removeOne();
+    if (!I)
+      continue;
+
+    if (isInstructionTriviallyDead(I)) {
+      eraseInstruction(*I);
+      continue;
+    }
+
+    FoldInst(*I);
+  }
 
   return MadeChange;
 }
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 1089f54..8ba940f 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -20,13 +20,14 @@ define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, [225 x double]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP6]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
-; CHECK-NEXT:    [[TMP8:%.*]] = load <225 x double>, <225 x double>* [[TMP7]], align 8
-; CHECK-NEXT:    [[MATRIXEXT4:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
-; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP1]]
-; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP9]], align 8
+; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
index f06c3f3..a1586c6 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
@@ -6,13 +6,14 @@ target triple = "arm64-apple-darwin"
 define void @load_extract_insert_store_const_idx(<225 x double>* %A) {
 ; CHECK-LABEL: @load_extract_insert_store_const_idx(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 0
+; CHECK-NEXT:    [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT:    [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 1
+; CHECK-NEXT:    [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -33,13 +34,14 @@ define void @load_extract_insert_store_var_idx_assume_valid(i64 %idx.1, i64 %idx
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_1]])
 ; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ult i64 [[IDX_2:%.*]], 225
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_2]])
-; CHECK-NEXT:    [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
+; CHECK-NEXT:    [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT:    [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
+; CHECK-NEXT:    [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -69,13 +71,14 @@ define void @load_extract_insert_store_var_idx_assume_valid_in_dominating_block(
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_2]])
 ; CHECK-NEXT:    br i1 [[C_1:%.*]], label [[LOOP:%.*]], label [[EXIT:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT:    [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
+; CHECK-NEXT:    [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT:    [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
+; CHECK-NEXT:    [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
-; CHECK-NEXT:    store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
+; CHECK-NEXT:    store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT:    [[C_2:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C_2]], label [[LOOP]], label [[EXIT]]
 ; CHECK:       exit:
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 472c78a..0e8d477 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -453,7 +453,9 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
 ; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32>
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index abf5ab5..a08b2c0 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -453,7 +453,9 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
 ; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32>
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 636ab23..c56c13e 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -465,7 +465,9 @@ define void @insert_store_ptr_strip(<16 x i8>* %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store_ptr_strip(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADDR0:%.*]] = bitcast <16 x i8>* [[Q:%.*]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i32 0, i32 3
+; CHECK-NEXT:    [[ADDR1:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[ADDR0]], i64 0
+; CHECK-NEXT:    [[ADDR2:%.*]] = bitcast <2 x i64>* [[ADDR1]] to <16 x i8>*
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[ADDR2]], i32 0, i32 3
 ; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
-- 
2.7.4
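
Appendix (illustrative sketch, not part of the patch): the core of this
change is a deduplicating worklist driven to a fixpoint. Each fold pushes
the values it creates (and the users of replaced values), so a combine
that enables another combine is revisited without rescanning the whole
function. Below is a standalone C++ analogue of that pattern; the
SimpleWorklist class and the integer "instructions" are hypothetical
stand-ins, not LLVM's InstructionWorklist API.

#include <deque>
#include <iostream>
#include <unordered_set>

// A minimal deduplicating worklist: a value is pending at most once.
class SimpleWorklist {
  std::deque<int> Queue;
  std::unordered_set<int> InQueue;

public:
  void push(int V) {
    if (InQueue.insert(V).second) // skip values already pending
      Queue.push_back(V);
  }
  bool empty() const { return Queue.empty(); }
  int removeOne() {
    int V = Queue.front();
    Queue.pop_front();
    InQueue.erase(V);
    return V;
  }
};

int main() {
  SimpleWorklist WL;
  // Seed the worklist, like the initial pass over every instruction.
  WL.push(24);
  WL.push(7);
  // Drain to a fixpoint. The "fold" here halves even values and
  // re-queues the result, the way replaceValue() re-queues the new
  // instruction and its users so follow-on folds get a chance to run.
  while (!WL.empty()) {
    int V = WL.removeOne();
    std::cout << "visiting " << V << '\n';
    if (V % 2 == 0)
      WL.push(V / 2);
  }
  return 0;
}

The InQueue set keeps a pending value from being queued twice, matching
InstructionWorklist's deduplication, so the total work is bounded by the
number of folds that actually fire rather than by repeated full scans.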