From 2e972ea0567a5d64b011cb3476e4d18f6195f82a Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 13 Dec 2022 12:45:16 -0800
Subject: [PATCH] [SLP]Integrate looking through shuffles logic into
 ShuffleInstructionBuilder.

Added BaseShuffleAnalysis as a base class for ShuffleInstructionBuilder
and moved the shuffle logic for externally used scalars into this class.
BaseShuffleAnalysis is the main container for the smart
shuffle-instruction-builder logic, which ShuffleInstructionBuilder
reuses. ShuffleInstructionBuilder is now also used to build the shuffles
for externally used scalars, replacing the lambdas whose logic became
part of BaseShuffleAnalysis.

Differential Revision: https://reviews.llvm.org/D140100
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    | 465 ++++++++++++++-------
 .../Transforms/SLPVectorizer/AArch64/tsc-s116.ll   |  10 +-
 .../X86/buildvector-same-lane-insert.ll            |   5 +-
 .../SLPVectorizer/X86/jumbled_store_crash.ll       |  41 +-
 .../X86/reduced-gathered-vectorized.ll             |  57 +--
 .../X86/reorder-reused-masked-gather.ll            |  21 +-
 6 files changed, 378 insertions(+), 221 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5baa7e7..db078b2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6286,6 +6286,276 @@ TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
   return {VK, VP};
 }
 
+namespace {
+/// The base class for shuffle instruction emission and shuffle cost
+/// estimation.
+class BaseShuffleAnalysis {
+protected:
+  /// Checks if the mask is an identity mask.
+  /// \param IsStrict if true, the function returns false if the mask size
+  /// does not match the vector size.
+  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
+                             bool IsStrict) {
+    int Limit = Mask.size();
+    int VF = VecTy->getNumElements();
+    return (VF == Limit || !IsStrict) &&
+           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
+           ShuffleVectorInst::isIdentityMask(Mask);
+  }
+
+  /// Tries to combine 2 different masks into a single one.
+  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
+  /// change the size of the vector; \p LocalVF is the original size of the
+  /// shuffled vector.
+  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
+                           ArrayRef<int> ExtMask) {
+    unsigned VF = Mask.size();
+    SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
+    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
+      if (ExtMask[I] == UndefMaskElem)
+        continue;
+      int MaskedIdx = Mask[ExtMask[I] % VF];
+      NewMask[I] =
+          MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
+    }
+    Mask.swap(NewMask);
+  }
+
+  /// Looks through shuffles trying to reduce the final number of shuffles in
+  /// the code. The function looks through the previously emitted shuffle
+  /// instructions and properly marks indices in the mask as undef.
+  /// For example, given the code
+  /// \code
+  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
+  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
+  /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
+  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
+  /// <0, 1, 2, 3> for the shuffle.
+  /// If the 2 operands are of different sizes, the smaller one will be
+  /// resized and the mask recalculated properly.
+  /// For example, given the code
+  /// \code
+  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
+  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
+  /// \endcode
+  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
+  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
+  /// <0, 1, 2, 3> for the shuffle.
+  /// So, it tries to transform permutations into a simple vector merge, if
+  /// possible.
+  /// \param V The input vector which must be shuffled using the given \p
+  /// Mask. If a better candidate is found, \p V is set to this best
+  /// candidate vector.
+  /// \param Mask The input mask for the shuffle. If the best candidate is
+  /// found during the look-through-shuffles attempt, it is updated
+  /// accordingly.
+  /// \param SinglePermute true if the shuffle operation is originally a
+  /// single-value-permutation. In this case the look-through-shuffles
+  /// procedure may look for resizing shuffles as the best candidates.
+  /// \return true if the shuffle results in the non-resizing identity
+  /// shuffle (and thus can be ignored), false otherwise.
+  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
+                                  bool SinglePermute) {
+    Value *Op = V;
+    ShuffleVectorInst *IdentityOp = nullptr;
+    SmallVector<int> IdentityMask;
+    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
+      // Exit if not a fixed vector type or changing size shuffle.
+      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
+      if (!SVTy)
+        break;
+      // Remember the identity or broadcast mask, if it is not a resizing
+      // shuffle. If no better candidates are found, this Op and Mask will be
+      // used in the final shuffle.
+      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
+        if (!IdentityOp || !SinglePermute ||
+            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
+             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask))) {
+          IdentityOp = SV;
+          // Store the current mask in IdentityMask so that we do not lose
+          // this info later if IdentityOp is selected as the best candidate
+          // for the permutation.
+          IdentityMask.assign(Mask);
+        }
+      }
+      // Remember the broadcast mask. If no better candidates are found, this
+      // Op and Mask will be used in the final shuffle.
+      // Zero splat can be used as identity too, since it might be used with
+      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
+      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>,
+      // which is expensive, and the analysis finds out that the source
+      // vector is just a broadcast, the original mask can be transformed to
+      // the identity mask <0, 1, 2, 3>.
+      // \code
+      // %0 = shuffle %v, poison, zeroinitializer
+      // %res = shuffle %0, poison, <3, 1, 2, 0>
+      // \endcode
+      // may be transformed to
+      // \code
+      // %0 = shuffle %v, poison, zeroinitializer
+      // %res = shuffle %0, poison, <0, 1, 2, 3>
+      // \endcode
+      if (SV->isZeroEltSplat()) {
+        IdentityOp = SV;
+        IdentityMask.assign(Mask);
+      }
+      int LocalVF = Mask.size();
+      if (auto *SVOpTy =
+              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
+        LocalVF = SVOpTy->getNumElements();
+      bool IsOp1Undef =
+          isUndefVector(SV->getOperand(0),
+                        buildUseMask(LocalVF, Mask, UseMask::FirstArg))
+              .all();
+      bool IsOp2Undef =
+          isUndefVector(SV->getOperand(1),
+                        buildUseMask(LocalVF, Mask, UseMask::SecondArg))
+              .all();
+      if (!IsOp1Undef && !IsOp2Undef) {
+        // Update the mask and mark undef elements.
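+        // Both shuffle operands are live here, so we cannot look through
+        // this shuffle any further; only fold the shuffle's own undef lanes
+        // into the mask before stopping the traversal.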
+        for (auto [Idx, I] : enumerate(Mask)) {
+          if (I == UndefMaskElem)
+            continue;
+          if (SV->getShuffleMask()[I % SV->getShuffleMask().size()] ==
+              UndefMaskElem)
+            I = UndefMaskElem;
+        }
+        break;
+      }
+      SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
+                                   SV->getShuffleMask().end());
+      combineMasks(LocalVF, ShuffleMask, Mask);
+      Mask.swap(ShuffleMask);
+      if (IsOp2Undef)
+        Op = SV->getOperand(0);
+      else
+        Op = SV->getOperand(1);
+    }
+    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
+        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute)) {
+      if (IdentityOp) {
+        V = IdentityOp;
+        assert(Mask.size() == IdentityMask.size() &&
+               "Expected masks of same sizes.");
+        // Clear known poison elements.
+        for (auto [I, Idx] : enumerate(Mask))
+          if (Idx == UndefMaskElem)
+            IdentityMask[I] = UndefMaskElem;
+        Mask.swap(IdentityMask);
+        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
+        return SinglePermute &&
+               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
+                               /*IsStrict=*/true) ||
+                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
+                 Shuffle->isZeroEltSplat() &&
+                 ShuffleVectorInst::isZeroEltSplatMask(Mask)));
+      }
+      V = Op;
+      return false;
+    }
+    V = Op;
+    return true;
+  }
+
+  /// Smart shuffle instruction emission, walks through shuffles trees and
+  /// tries to find the best matching vector for the actual shuffle
+  /// instruction.
+  template <typename ShuffleBuilderTy>
+  static Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
+                              ShuffleBuilderTy &Builder) {
+    assert(V1 && "Expected at least one vector value.");
+    int VF = Mask.size();
+    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
+      VF = FTy->getNumElements();
+    if (V2 &&
+        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
+      // Peek through shuffles.
+      Value *Op1 = V1;
+      Value *Op2 = V2;
+      int VF =
+          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
+      SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+      SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+      for (int I = 0, E = Mask.size(); I < E; ++I) {
+        if (Mask[I] < VF)
+          CombinedMask1[I] = Mask[I];
+        else
+          CombinedMask2[I] = Mask[I] - VF;
+      }
+      Value *PrevOp1;
+      Value *PrevOp2;
+      do {
+        PrevOp1 = Op1;
+        PrevOp2 = Op2;
+        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
+        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
+        // Check if we have 2 resizing shuffles - need to peek through operands
+        // again.
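+        // If both operands are just widening shuffles of same-typed sources
+        // (their second operand is undef in all used lanes), strip the
+        // resizing shuffles, fold their masks into the combined masks and
+        // retry the look-through on the original sources.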
+        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
+          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
+            SmallBitVector UseMask1 = buildUseMask(
+                cast<FixedVectorType>(SV1->getOperand(1)->getType())
+                    ->getNumElements(),
+                CombinedMask1, UseMask::FirstArg);
+            SmallBitVector UseMask2 = buildUseMask(
+                cast<FixedVectorType>(SV2->getOperand(1)->getType())
+                    ->getNumElements(),
+                CombinedMask2, UseMask::FirstArg);
+            if (SV1->getOperand(0)->getType() ==
+                    SV2->getOperand(0)->getType() &&
+                SV1->getOperand(0)->getType() != SV1->getType() &&
+                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
+                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
+              Op1 = SV1->getOperand(0);
+              Op2 = SV2->getOperand(0);
+              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
+                                            SV1->getShuffleMask().end());
+              int LocalVF = ShuffleMask1.size();
+              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
+                LocalVF = FTy->getNumElements();
+              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
+              CombinedMask1.swap(ShuffleMask1);
+              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
+                                            SV2->getShuffleMask().end());
+              LocalVF = ShuffleMask2.size();
+              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
+                LocalVF = FTy->getNumElements();
+              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
+              CombinedMask2.swap(ShuffleMask2);
+            }
+          }
+      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
+      Builder.resizeToMatch(Op1, Op2);
+      VF = std::max(cast<VectorType>(Op1->getType())
+                        ->getElementCount()
+                        .getKnownMinValue(),
+                    cast<VectorType>(Op2->getType())
+                        ->getElementCount()
+                        .getKnownMinValue());
+      for (int I = 0, E = Mask.size(); I < E; ++I) {
+        if (CombinedMask2[I] != UndefMaskElem) {
+          assert(CombinedMask1[I] == UndefMaskElem &&
+                 "Expected undefined mask element");
+          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
+        }
+      }
+      return Builder.createShuffleVector(
+          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
+          CombinedMask1);
+    }
+    if (isa<PoisonValue>(V1))
+      return PoisonValue::get(FixedVectorType::get(
+          cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
+    SmallVector<int> NewMask(Mask.begin(), Mask.end());
+    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
+    assert(V1 && "Expected non-null value after looking through shuffles.");
+
+    if (!IsIdentity)
+      return Builder.createShuffleVector(V1, NewMask);
+    return V1;
+  }
+};
+} // namespace
+
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
   ArrayRef<Value *> VL = E->Scalars;
@@ -8146,7 +8416,7 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
 /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
 /// \endcode
 /// instead.
-class BoUpSLP::ShuffleInstructionBuilder {
+class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   bool IsFinalized = false;
   /// Combined mask for all applied operands and masks. It is built during
   /// analysis and actual emission of shuffle vector instructions.
@@ -8155,7 +8425,7 @@ class BoUpSLP::ShuffleInstructionBuilder {
   /// operands, if the 3rd is going to be added, the first 2 are combined into
   /// shuffle with \p CommonMask mask, the first operand sets to be the
   /// resulting shuffle and the second operand sets to be the newly added
-  /// operand. The \p CombinedMask is transformed in the proper way after that.
+  /// operand. The \p CommonMask is transformed in the proper way after that.
   SmallVector<Value *, 2> InVectors;
   IRBuilderBase &Builder;
   BoUpSLP &R;
@@ -8199,6 +8469,29 @@ class BoUpSLP::ShuffleInstructionBuilder {
       }
       return Vec;
     }
+    /// Resizes 2 input vectors to match their sizes, if they are not equal
+    /// yet. The smaller vector is resized to the size of the larger vector.
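+    /// E.g. when a <2 x float> operand must be shuffled together with a
+    /// <4 x float> operand, the <2 x float> one is first widened to
+    /// <4 x float> using an identity mask padded with undefs.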
+    void resizeToMatch(Value *&V1, Value *&V2) {
+      if (V1->getType() == V2->getType())
+        return;
+      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
+      int VF = std::max(V1VF, V2VF);
+      int MinVF = std::min(V1VF, V2VF);
+      SmallVector<int> IdentityMask(VF, UndefMaskElem);
+      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
+                0);
+      Value *&Op = MinVF == V1VF ? V1 : V2;
+      Op = Builder.CreateShuffleVector(Op, IdentityMask);
+      if (auto *I = dyn_cast<Instruction>(Op)) {
+        GatherShuffleExtractSeq.insert(I);
+        CSEBlocks.insert(I->getParent());
+      }
+      if (MinVF == V1VF)
+        V1 = Op;
+      else
+        V2 = Op;
+    }
   };
 
   /// Smart shuffle instruction emission, walks through shuffles trees and
@@ -8208,9 +8501,7 @@ class BoUpSLP::ShuffleInstructionBuilder {
     assert(V1 && "Expected at least one vector value.");
     ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                     R.CSEBlocks);
-    if (V2)
-      return ShuffleBuilder.createShuffleVector(V1, V2, Mask);
-    return ShuffleBuilder.createShuffleVector(V1, Mask);
+    return BaseShuffleAnalysis::createShuffle(V1, V2, Mask, ShuffleBuilder);
   }
 
   /// Transforms mask \p CommonMask per given \p Mask to make proper set after
@@ -9433,157 +9724,21 @@ Value *BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues,
       LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
     }
 
-  // Checks if the mask is an identity mask.
-  auto &&IsIdentityMask = [](ArrayRef<int> Mask, FixedVectorType *VecTy) {
-    int Limit = Mask.size();
-    return VecTy->getNumElements() == Mask.size() &&
-           all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) &&
-           ShuffleVectorInst::isIdentityMask(Mask);
-  };
-  // Tries to combine 2 different masks into single one.
-  auto &&CombineMasks = [](SmallVectorImpl<int> &Mask, ArrayRef<int> ExtMask) {
-    SmallVector<int> NewMask(ExtMask.size(), UndefMaskElem);
-    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
-      if (ExtMask[I] == UndefMaskElem)
-        continue;
-      NewMask[I] = Mask[ExtMask[I]];
-    }
-    Mask.swap(NewMask);
-  };
-  // Peek through shuffles, trying to simplify the final shuffle code.
-  auto &&PeekThroughShuffles =
-      [&IsIdentityMask, &CombineMasks](Value *&V, SmallVectorImpl<int> &Mask,
-                                       bool CheckForLengthChange = false) {
-        while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
-          // Exit if not a fixed vector type or changing size shuffle.
-          if (!isa<FixedVectorType>(SV->getType()) ||
-              (CheckForLengthChange && SV->changesLength()))
-            break;
-          // Exit if the identity or broadcast mask is found.
-          if (IsIdentityMask(Mask, cast<FixedVectorType>(SV->getType())) ||
-              SV->isZeroEltSplat())
-            break;
-          int LocalVF = Mask.size();
-          if (auto *SVOpTy =
-                  dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
-            LocalVF = SVOpTy->getNumElements();
-          bool IsOp1Undef =
-              isUndefVector(SV->getOperand(0),
-                            buildUseMask(LocalVF, Mask, UseMask::FirstArg))
-                  .all();
-          bool IsOp2Undef =
-              isUndefVector(SV->getOperand(1),
-                            buildUseMask(LocalVF, Mask, UseMask::SecondArg))
-                  .all();
-          if (!IsOp1Undef && !IsOp2Undef)
-            break;
-          SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
-                                       SV->getShuffleMask().end());
-          CombineMasks(ShuffleMask, Mask);
-          Mask.swap(ShuffleMask);
-          if (IsOp2Undef)
-            V = SV->getOperand(0);
-          else
-            V = SV->getOperand(1);
-        }
-      };
-  // Smart shuffle instruction emission, walks through shuffles trees and
-  // tries to find the best matching vector for the actual shuffle
-  // instruction.
-  auto &&CreateShuffle = [this, &IsIdentityMask, &PeekThroughShuffles,
-                          &CombineMasks](Value *V1, Value *V2,
-                                         ArrayRef<int> Mask) -> Value * {
-    assert(V1 && "Expected at least one vector value.");
-    int VF = Mask.size();
-    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
-      VF = FTy->getNumElements();
-    if (V2 &&
-        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
-      // Peek through shuffles.
-      Value *Op1 = V1;
-      Value *Op2 = V2;
-      int VF =
-          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
-      SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
-      SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
-      for (int I = 0, E = Mask.size(); I < E; ++I) {
-        if (Mask[I] < VF)
-          CombinedMask1[I] = Mask[I];
-        else
-          CombinedMask2[I] = Mask[I] - VF;
-      }
-      Value *PrevOp1;
-      Value *PrevOp2;
-      do {
-        PrevOp1 = Op1;
-        PrevOp2 = Op2;
-        PeekThroughShuffles(Op1, CombinedMask1, /*CheckForLengthChange=*/true);
-        PeekThroughShuffles(Op2, CombinedMask2, /*CheckForLengthChange=*/true);
-        // Check if we have 2 resizing shuffles - need to peek through operands
-        // again.
-        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
-          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
-            SmallBitVector UseMask1 = buildUseMask(
-                cast<FixedVectorType>(SV1->getOperand(1)->getType())
-                    ->getNumElements(),
-                CombinedMask1, UseMask::FirstArg);
-            SmallBitVector UseMask2 = buildUseMask(
-                cast<FixedVectorType>(SV2->getOperand(1)->getType())
-                    ->getNumElements(),
-                CombinedMask2, UseMask::FirstArg);
-            if (SV1->getOperand(0)->getType() ==
-                    SV2->getOperand(0)->getType() &&
-                SV1->getOperand(0)->getType() != SV1->getType() &&
-                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
-                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
-              Op1 = SV1->getOperand(0);
-              Op2 = SV2->getOperand(0);
-              SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
-                                            SV1->getShuffleMask().end());
-              CombineMasks(ShuffleMask1, CombinedMask1);
-              CombinedMask1.swap(ShuffleMask1);
-              SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
-                                            SV2->getShuffleMask().end());
-              CombineMasks(ShuffleMask2, CombinedMask2);
-              CombinedMask2.swap(ShuffleMask2);
-            }
-          }
-      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
-      VF = cast<VectorType>(Op1->getType())
-               ->getElementCount()
-               .getKnownMinValue();
-      for (int I = 0, E = Mask.size(); I < E; ++I) {
-        if (CombinedMask2[I] != UndefMaskElem) {
-          assert(CombinedMask1[I] == UndefMaskElem &&
-                 "Expected undefined mask element");
-          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
-        }
-      }
-      Value *Vec = Builder.CreateShuffleVector(
-          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
-          CombinedMask1);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    }
-    if (isa<PoisonValue>(V1))
-      return PoisonValue::get(FixedVectorType::get(
-          cast<VectorType>(V1->getType())->getElementType(), Mask.size()));
-    Value *Op = V1;
-    SmallVector<int> CombinedMask(Mask);
-    PeekThroughShuffles(Op, CombinedMask);
-    if (!isa<FixedVectorType>(Op->getType()) ||
-        !IsIdentityMask(CombinedMask, cast<FixedVectorType>(Op->getType()))) {
-      Value *Vec = Builder.CreateShuffleVector(Op, CombinedMask);
-      if (auto *I = dyn_cast<Instruction>(Vec)) {
-        GatherShuffleExtractSeq.insert(I);
-        CSEBlocks.insert(I->getParent());
-      }
-      return Vec;
-    }
-    return Op;
+  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
+    SmallVector<int> CombinedMask1(Mask.size(), UndefMaskElem);
+    SmallVector<int> CombinedMask2(Mask.size(), UndefMaskElem);
+    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
+    for (int I = 0, E = Mask.size(); I < E; ++I) {
+      if (Mask[I] < VF)
+        CombinedMask1[I] = Mask[I];
+      else
+        CombinedMask2[I] = Mask[I] - VF;
+    }
+    ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
+    ShuffleBuilder.add(V1, CombinedMask1);
+    if (V2)
+      ShuffleBuilder.add(V2, CombinedMask2);
+    return ShuffleBuilder.finalize(std::nullopt);
   };
 
   auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
index b96862f..e25022c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -30,11 +30,11 @@ define void @s116_modified(float* %a) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP4]], <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast <4 x float> [[TMP9]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[GEP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP12]], <4 x float>* [[TMP13]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep0 = getelementptr inbounds float, float* %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
index 3e49201..752c35f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-same-lane-insert.ll
@@ -43,10 +43,9 @@ define void @test1() {
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp olt float [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32>
+; CHECK-NEXT:    store <2 x float> [[TMP3]], ptr null, align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32>
 ; CHECK-NEXT:    store <2 x float> [[TMP8]], ptr null, align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    store <2 x float> [[TMP9]], ptr null, align 4
 ; CHECK-NEXT:    ret void
 ;
   %1 = getelementptr inbounds float, ptr undef, i32 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
index 1f7ea2e..439c0b4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
@@ -26,26 +26,27 @@ define dso_local void @j() local_unnamed_addr {
 ; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP7]],
 ; CHECK-NEXT:    [[TMP9:%.*]] = fsub <2 x float> , [[TMP8]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    store float [[TMP10]], float* @g, align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
-; CHECK-NEXT:    store float [[TMP12]], float* @c, align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
-; CHECK-NEXT:    store float [[TMP13]], float* @d, align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
-; CHECK-NEXT:    store float [[TMP14]], float* @e, align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
-; CHECK-NEXT:    store float [[TMP15]], float* @f, align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[SHUFFLE]], <4 x i32>
-; CHECK-NEXT:    [[TMP18:%.*]] = fsub <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32>
-; CHECK-NEXT:    [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP21]], <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; CHECK-NEXT:    store float [[TMP11]], float* @g, align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <4 x float> [[TMP10]],
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP12]], i32 2
+; CHECK-NEXT:    store float [[TMP13]], float* @c, align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0
+; CHECK-NEXT:    store float [[TMP14]], float* @d, align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 3
+; CHECK-NEXT:    store float [[TMP15]], float* @e, align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 1
+; CHECK-NEXT:    store float [[TMP16]], float* @f, align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = fsub <4 x float> [[TMP12]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd <4 x float> [[TMP12]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <4 x i32>
+; CHECK-NEXT:    [[TMP23:%.*]] = fptosi <4 x float> [[TMP22]] to <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP23]], <4 x i32>* [[TMP24]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 7e324a8..cb4e3bd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -13,34 +13,35 @@ define i16 @test() {
 ; CHECK-NEXT:    [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX12:%.*]], [[WHILE]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[A]], align 8
-; CHECK-NEXT:    [[SHUFFLE13:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x i64> poison, i64 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP4]], <16 x i64> [[TMP5]], <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[SHUFFLE]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP7]], <16 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i64> [[TMP8]], <16 x i64> [[TMP9]], <16 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x i64> [[TMP10]], i64 [[TMP0]], i32 9
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <16 x i64> [[TMP11]], i64 [[TMP0]], i32 10
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x i64> [[TMP12]], i64 [[TMP0]], i32 11
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[SHUFFLE13]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> [[TMP14]], <16 x i32>
-; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[A2]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[A3]], align 16
-; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP15]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP19]], [[TMP2]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i64 [[TMP2]], [[TMP2]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[SHUFFLE13]], i32 3
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i64 [[TMP2]], [[TMP20]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i64 [[TMP20]], [[TMP16]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = xor i64 [[TMP16]], [[TMP16]]
-; CHECK-NEXT:    [[OP_RDX5:%.*]] = xor i64 [[TMP17]], [[TMP17]]
-; CHECK-NEXT:    [[OP_RDX6:%.*]] = xor i64 [[TMP18]], [[TMP18]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x i64> poison, i64 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> [[TMP8]], <16 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x i64> [[TMP9]], <16 x i64> [[TMP10]], <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x i64> [[TMP11]], <16 x i64> [[TMP8]], <16 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <16 x i64> [[TMP13]], i64 [[TMP0]], i32 9
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <16 x i64> [[TMP14]], i64 [[TMP0]], i32 10
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x i64> [[TMP15]], i64 [[TMP0]], i32 11
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i64> [[TMP16]], <16 x i64> [[TMP17]], <16 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[A1]], align 16
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[A3]], align 16
+; CHECK-NEXT:    [[TMP22:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[TMP18]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i64 [[TMP22]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = xor i64 [[TMP3]], [[TMP3]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = xor i64 [[TMP3]], [[TMP23]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = xor i64 [[TMP23]], [[TMP19]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = xor i64 [[TMP19]], [[TMP19]]
+; CHECK-NEXT:    [[OP_RDX5:%.*]] = xor i64 [[TMP20]], [[TMP20]]
+; CHECK-NEXT:    [[OP_RDX6:%.*]] = xor i64 [[TMP21]], [[TMP21]]
 ; CHECK-NEXT:    [[OP_RDX7:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX1]]
 ; CHECK-NEXT:    [[OP_RDX8:%.*]] = xor i64 [[OP_RDX2]], [[OP_RDX3]]
 ; CHECK-NEXT:    [[OP_RDX9:%.*]] = xor i64 [[OP_RDX4]], [[OP_RDX5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
index b4eea63..3850cf0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -4,16 +4,17 @@ define void @test(float* noalias %0, float* %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x float*> poison, float* [[P:%.*]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP3]], i32 4, <8 x i1> , <8 x float> poison)
-; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> , <16 x float> [[SHUFFLE1]], <16 x i32>
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[SHUFFLE1]], [[TMP6]]
-; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> poison, <16 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP4]] to <16 x float>*
-; CHECK-NEXT:    store <16 x float> [[SHUFFLE2]], <16 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float*> [[TMP2]], <8 x float*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr float, <8 x float*> [[TMP3]], <8 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP8]], <16 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP5]] to <16 x float>*
+; CHECK-NEXT:    store <16 x float> [[TMP11]], <16 x float>* [[TMP12]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %2 = getelementptr inbounds float, float* %p, i64 2
-- 
2.7.4
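
The mask-combination rule at the heart of BaseShuffleAnalysis::combineMasks can be replayed outside of LLVM. The following standalone sketch (not part of the patch; plain C++ with UndefMaskElem modeled as -1, as in LLVM's shuffle masks) chases the outer mask of the peekThroughShuffles doc-comment example through the inner shuffle mask and recovers the identity mask:

// Standalone illustration of the index-chasing done by combineMasks above.
#include <cstddef>
#include <cstdio>
#include <vector>

// Sentinel for an undef lane, mirroring LLVM's UndefMaskElem.
constexpr int UndefMaskElem = -1;

// Chase each lane of the outer mask ExtMask through the inner mask Mask;
// LocalVF is the original length of the vector permuted by Mask.
std::vector<int> combineMasks(unsigned LocalVF, const std::vector<int> &Mask,
                              const std::vector<int> &ExtMask) {
  unsigned VF = Mask.size();
  std::vector<int> NewMask(ExtMask.size(), UndefMaskElem);
  for (std::size_t I = 0; I < ExtMask.size(); ++I) {
    if (ExtMask[I] == UndefMaskElem)
      continue;
    int MaskedIdx = Mask[ExtMask[I] % VF];
    NewMask[I] =
        MaskedIdx == UndefMaskElem ? UndefMaskElem : MaskedIdx % LocalVF;
  }
  return NewMask;
}

int main() {
  // %s1 = shufflevector <2 x ty> %0, poison, <1, 0>; asking %s1 for lanes
  // <1, 0> resolves back to lanes <0, 1> of %0, i.e. the identity mask.
  std::vector<int> Inner = {1, 0}; // mask of the already emitted shuffle
  std::vector<int> Outer = {1, 0}; // lanes the new shuffle wants from %s1
  for (int Idx : combineMasks(/*LocalVF=*/2, Inner, Outer))
    std::printf("%d ", Idx); // prints: 0 1
  std::printf("\n");
  return 0;
}

This is why the <1, 0, 3, 2> shuffle of %s1 and %s2 in the doc comment collapses to a <0, 1, 2, 3> merge of %0 and %1: the composed per-operand masks are identities, so the resulting shuffle is a plain two-vector merge.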