From ca2396e6732dfab06cf96a475520b9266852277a Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 12 Feb 2018 14:54:48 +0000 Subject: [PATCH] [SLP] Take user instructions cost into consideration in insertelement vectorization. Summary: For better vectorization result we should take into consideration the cost of the user insertelement instructions when we try to vectorize sequences that build the whole vector. I.e. if we have the following scalar code: ``` insertelement , ... ``` we should consider the cost of the last `insertelement ` instructions as the cost of the scalar code. Reviewers: RKSimon, spatel, hfinkel, mkuper Subscribers: javed.absar, llvm-commits Differential Revision: https://reviews.llvm.org/D42657 llvm-svn: 324893 --- .../llvm/Transforms/Vectorize/SLPVectorizer.h | 4 +- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 28 +++- .../SLPVectorizer/AArch64/gather-root.ll | 176 +++++++++++---------- .../X86/insert-element-build-vector.ll | 16 +- .../SLPVectorizer/X86/jumbled-load-multiuse.ll | 27 ++-- .../Transforms/SLPVectorizer/X86/sign-extend.ll | 21 ++- .../test/Transforms/SLPVectorizer/X86/value-bug.ll | 30 ++-- 7 files changed, 160 insertions(+), 142 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 781a628..979d5ef 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -95,9 +95,11 @@ private: bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \param UserCost Cost of the user operations of \p VL if they may affect + /// the cost of the vectorization. /// \returns true if a value was vectorized. bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R, - bool AllowReorder = false); + int UserCost = 0, bool AllowReorder = false); /// \brief Try to vectorize a chain that may start at the operands of \p I. bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8b63fe1..0f0b82b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4702,11 +4702,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R, true); + return tryToVectorizeList(VL, R, /*UserCost=*/0, true); } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, - bool AllowReorder) { + int UserCost, bool AllowReorder) { if (VL.size() < 2) return false; @@ -4815,7 +4815,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, continue; R.computeMinimumValueSizes(); - int Cost = R.getTreeCost(); + int Cost = R.getTreeCost() - UserCost; CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -5748,9 +5748,17 @@ private: /// /// Returns true if it matches static bool findBuildVector(InsertElementInst *LastInsertElem, - SmallVectorImpl &BuildVectorOpds) { + TargetTransformInfo *TTI, + SmallVectorImpl &BuildVectorOpds, + int &UserCost) { + UserCost = 0; Value *V = nullptr; do { + if (auto *CI = dyn_cast(LastInsertElem->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + LastInsertElem->getType(), + CI->getZExtValue()); + } BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); V = LastInsertElem->getOperand(0); if (isa(V)) @@ -5965,13 +5973,17 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { + int UserCost; SmallVector BuildVectorOpds; - if (!findBuildVector(IEI, BuildVectorOpds)) + if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + (llvm::all_of(BuildVectorOpds, + [](Value *V) { return isa(V); }) && + isShuffle(BuildVectorOpds))) return false; // Vectorize starting with the build vector operands ignoring the BuildVector // instructions for the purpose of scheduling and user extraction. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, @@ -6049,8 +6061,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. bool AllowReorder = NumElts == 2; - if (NumElts > 1 && - tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) { + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, + /*UserCost=*/0, AllowReorder)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll index dda8a0c..76f4164 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -31,50 +31,54 @@ define void @PR28330(i32 %n) { ; ; GATHER-LABEL: @PR28330( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 -; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 -; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 -; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0 -; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 -; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> -; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP3]] -; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] -; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] -; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] -; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] -; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] -; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] -; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 -; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], [[TMP17]] -; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: [[TMPP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3 +; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4 +; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5 +; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6 +; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7 +; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> , <8 x i32> +; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0 +; GATHER-NEXT: [[TMPP20:%.*]] = add i32 [[TMPP17]], [[TMP19]] +; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1 +; GATHER-NEXT: [[TMPP22:%.*]] = add i32 [[TMPP20]], [[TMP20]] +; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2 +; GATHER-NEXT: [[TMPP24:%.*]] = add i32 [[TMPP22]], [[TMP21]] +; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3 +; GATHER-NEXT: [[TMPP26:%.*]] = add i32 [[TMPP24]], [[TMP22]] +; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4 +; GATHER-NEXT: [[TMPP28:%.*]] = add i32 [[TMPP26]], [[TMP23]] +; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5 +; GATHER-NEXT: [[TMPP30:%.*]] = add i32 [[TMPP28]], [[TMP24]] +; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6 +; GATHER-NEXT: [[TMPP32:%.*]] = add i32 [[TMPP30]], [[TMP25]] +; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 +; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1 +; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2 +; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3 +; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4 +; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5 +; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 +; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 +; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[TMPP17]] +; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMPP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR28330( @@ -179,50 +183,54 @@ define void @PR32038(i32 %n) { ; ; GATHER-LABEL: @PR32038( ; GATHER-NEXT: entry: -; GATHER-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; GATHER-NEXT: [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; GATHER-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0 -; GATHER-NEXT: [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; GATHER-NEXT: [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0 -; GATHER-NEXT: [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; GATHER-NEXT: [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0 -; GATHER-NEXT: [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; GATHER-NEXT: [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0 -; GATHER-NEXT: [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 -; GATHER-NEXT: [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0 -; GATHER-NEXT: [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 -; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 +; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer ; GATHER-NEXT: br label [[FOR_BODY:%.*]] ; GATHER: for.body: -; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; GATHER-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> , <2 x i32> -; GATHER-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP3]] -; GATHER-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; GATHER-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP4]] -; GATHER-NEXT: [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]] -; GATHER-NEXT: [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]] -; GATHER-NEXT: [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]] -; GATHER-NEXT: [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]] -; GATHER-NEXT: [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]] -; GATHER-NEXT: [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80 -; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 -; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP4]], i32 1 -; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP23]], i32 2 -; GATHER-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP25]], i32 3 -; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[TMP27]], i32 4 -; GATHER-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[TMP29]], i32 5 -; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP31]], i32 6 -; GATHER-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP33]], i32 7 -; GATHER-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP12]]) -; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP13]], -5 -; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] +; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0 +; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1 +; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3 +; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4 +; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5 +; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6 +; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7 +; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> , <8 x i32> +; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0 +; GATHER-NEXT: [[TMPP20:%.*]] = add i32 -5, [[TMP19]] +; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1 +; GATHER-NEXT: [[TMPP22:%.*]] = add i32 [[TMPP20]], [[TMP20]] +; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2 +; GATHER-NEXT: [[TMPP24:%.*]] = add i32 [[TMPP22]], [[TMP21]] +; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3 +; GATHER-NEXT: [[TMPP26:%.*]] = add i32 [[TMPP24]], [[TMP22]] +; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4 +; GATHER-NEXT: [[TMPP28:%.*]] = add i32 [[TMPP26]], [[TMP23]] +; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5 +; GATHER-NEXT: [[TMPP30:%.*]] = add i32 [[TMPP28]], [[TMP24]] +; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6 +; GATHER-NEXT: [[TMPP32:%.*]] = add i32 [[TMPP30]], [[TMP25]] +; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0 +; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1 +; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2 +; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3 +; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4 +; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5 +; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6 +; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7 +; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7 +; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]]) +; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5 +; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMPP32]], [[TMP33]] ; GATHER-NEXT: br label [[FOR_BODY]] ; ; MAX-COST-LABEL: @PR32038( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll index 750a447..e26eeec 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -387,14 +387,14 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x ; to do this backwards this backwards define <4 x i32> @reconstruct(<4 x i32> %c) #0 { ; CHECK-LABEL: @reconstruct( -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[C]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[C]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP3]], i32 1 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 +; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 +; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[RD]] ; ; ZEROTHRESH-LABEL: @reconstruct( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll index 557a83a..ee28750 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -12,19 +12,20 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3 -; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 8, i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP12]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll index 21f17f3..360ce6a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll @@ -34,18 +34,15 @@ entry: define <4 x i16> @truncate_v_v(<4 x i32> %lhs) { ; CHECK-LABEL: @truncate_v_v( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[LHS:%.*]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[VECEXT]] to i16 -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[CONV]], i32 0 -; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[LHS]], i32 1 -; CHECK-NEXT: [[CONV2:%.*]] = trunc i32 [[VECEXT1]] to i16 -; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[CONV2]], i32 1 -; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[LHS]], i32 2 -; CHECK-NEXT: [[CONV5:%.*]] = trunc i32 [[VECEXT4]] to i16 -; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[CONV5]], i32 2 -; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[LHS]], i32 3 -; CHECK-NEXT: [[CONV8:%.*]] = trunc i32 [[VECEXT7]] to i16 -; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[CONV8]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 +; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3 ; CHECK-NEXT: ret <4 x i16> [[VECINIT9]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll index 7558c72..c2f4b98 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll @@ -11,34 +11,32 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: bb279: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> undef, float undef, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1 ; CHECK-NEXT: br label [[BB283:%.*]] ; CHECK: bb283: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP13:%.*]], [[EXIT]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ] ; CHECK-NEXT: br label [[BB284:%.*]] ; CHECK: bb284: -; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef +; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef ; CHECK-NEXT: br label [[BB21_I:%.*]] ; CHECK: bb21.i: ; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]] ; CHECK: bb22.i: -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]] ; CHECK-NEXT: br label [[BB32_I:%.*]] ; CHECK: bb32.i: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ] ; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]] ; CHECK: exit: -; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> , [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> undef, [[TMP9]] -; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float> -; CHECK-NEXT: [[TMP317:%.*]] = fptrunc double undef to float -; CHECK-NEXT: [[TMP319:%.*]] = fptrunc double undef to float -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP317]], i32 0 -; CHECK-NEXT: [[TMP13]] = insertelement <2 x float> [[TMP12]], float [[TMP319]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double> +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> , [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> undef, [[TMP11]] +; CHECK-NEXT: [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float> ; CHECK-NEXT: br label [[BB283]] ; bb279: -- 2.7.4