From 5097b6e35291c23b50002c651b80ae8f770a015c Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Mon, 30 Aug 2021 19:16:44 +0200 Subject: [PATCH] Revert "[SLP]Improve graph reordering." This reverts commit 84cbd71c95923f9912512f3051c6ab548a99e016. This commit breaks one of the internal tests. As agreed with Alexey I will provide the reproducer later. --- .../llvm/Transforms/Vectorize/SLPVectorizer.h | 3 +- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1362 ++++++-------------- .../AArch64/transpose-inseltpoison.ll | 84 +- .../Transforms/SLPVectorizer/AArch64/transpose.ll | 84 +- llvm/test/Transforms/SLPVectorizer/X86/addsub.ll | 42 +- .../Transforms/SLPVectorizer/X86/crash_cmpop.ll | 6 +- llvm/test/Transforms/SLPVectorizer/X86/extract.ll | 6 +- .../SLPVectorizer/X86/jumbled-load-multiuse.ll | 12 +- .../Transforms/SLPVectorizer/X86/jumbled-load.ll | 22 +- .../SLPVectorizer/X86/jumbled_store_crash.ll | 29 +- .../SLPVectorizer/X86/reorder_repeated_ops.ll | 4 +- .../SLPVectorizer/X86/split-load8_2-unord.ll | 4 +- .../SLPVectorizer/X86/vectorize-reorder-reuse.ll | 52 +- 13 files changed, 597 insertions(+), 1113 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 5e8c299..f416a59 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -95,7 +95,8 @@ private: /// Try to vectorize a list of operands. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R); + bool tryToVectorizeList(ArrayRef VL, slpvectorizer::BoUpSLP &R, + bool AllowReorder = false); /// Try to vectorize a chain that may start at the operands of \p I. bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f6f9d82..f68c677 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -21,7 +21,6 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" @@ -557,68 +556,13 @@ static bool isSimple(Instruction *I) { return true; } -/// Shuffles \p Mask in accordance with the given \p SubMask. -static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask) { - if (SubMask.empty()) - return; - if (Mask.empty()) { - Mask.append(SubMask.begin(), SubMask.end()); - return; - } - SmallVector NewMask(SubMask.size(), UndefMaskElem); - int TermValue = std::min(Mask.size(), SubMask.size()); - for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || - Mask[SubMask[I]] >= TermValue) - continue; - NewMask[I] = Mask[SubMask[I]]; - } - Mask.swap(NewMask); -} - -/// Order may have elements assigned special value (size) which is out of -/// bounds. Such indices only appear on places which correspond to undef values -/// (see canReuseExtract for details) and used in order to avoid undef values -/// have effect on operands ordering. -/// The first loop below simply finds all unused indices and then the next loop -/// nest assigns these indices for undef values positions. 
-/// As an example below Order has two undef positions and they have assigned -/// values 3 and 7 respectively: -/// before: 6 9 5 4 9 2 1 0 -/// after: 6 3 5 4 7 2 1 0 -/// \returns Fixed ordering. -static void fixupOrderingIndices(SmallVectorImpl &Order) { - const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector MaskedIndices; - for (unsigned I = 0; I < Sz; ++I) { - if (Order[I] < Sz) - UsedIndices.set(Order[I]); - else - MaskedIndices.push_back(I); - } - if (MaskedIndices.empty()) - return; - SmallVector AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; -} - namespace llvm { static void inversePermutation(ArrayRef Indices, SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E, UndefMaskElem); + Mask.resize(E, E + 1); for (unsigned I = 0; I < E; ++I) Mask[Indices[I]] = I; } @@ -658,22 +602,6 @@ static Optional getInsertIndex(Value *InsertInst, unsigned Offset) { return Index; } -/// Reorders the list of scalars in accordance with the given \p Order and then -/// the \p Mask. \p Order - is the original order of the scalars, need to -/// reorder scalars into an unordered state at first according to the given -/// order. Then the ordered scalars are shuffled once again in accordance with -/// the provided mask. -static void reorderScalars(SmallVectorImpl &Scalars, - ArrayRef Mask) { - assert(!Mask.empty() && "Expected non-empty mask."); - SmallVector Prev(Scalars.size(), - UndefValue::get(Scalars.front()->getType())); - Prev.swap(Scalars); - for (unsigned I = 0, E = Prev.size(); I < E; ++I) - if (Mask[I] != UndefMaskElem) - Scalars[Mask[I]] = Prev[I]; -} - namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -738,12 +666,13 @@ public: void buildTree(ArrayRef Roots, ArrayRef UserIgnoreLst = None); - /// Builds external uses of the vectorized scalars, i.e. the list of - /// vectorized scalars to be extracted, their lanes and their scalar users. \p - /// ExternallyUsedValues contains additional list of external uses to handle - /// vectorization of reductions. - void - buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking + /// into account (and updating it, if required) list of externally used + /// values stored in \p ExternallyUsedValues. + void buildTree(ArrayRef Roots, + ExtraValueToDebugLocsMap &ExternallyUsedValues, + ArrayRef UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { @@ -751,6 +680,8 @@ public: ScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); + NumOpsWantToKeepOrder.clear(); + NumOpsWantToKeepOriginalOrder = 0; for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -764,22 +695,103 @@ public: /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); - /// Reorders the current graph to the most profitable order starting from the - /// root node to the leaf nodes. 
The best order is chosen only from the nodes - /// of the same size (vectorization factor). Smaller nodes are considered - /// parts of subgraph with smaller VF and they are reordered independently. We - /// can make it because we still need to extend smaller nodes to the wider VF - /// and we can merge reordering shuffles with the widening shuffles. - void reorderTopToBottom(); - - /// Reorders the current graph to the most profitable order starting from - /// leaves to the root. It allows to rotate small subgraphs and reduce the - /// number of reshuffles if the leaf nodes use the same order. In this case we - /// can merge the orders and just shuffle user node instead of shuffling its - /// operands. Plus, even the leaf nodes have different orders, it allows to - /// sink reordering in the graph closer to the root node and merge it later - /// during analysis. - void reorderBottomToTop(); + /// \returns The best order of instructions for vectorization. + Optional> bestOrder() const { + assert(llvm::all_of( + NumOpsWantToKeepOrder, + [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) { + return D.getFirst().size() == + VectorizableTree[0]->Scalars.size(); + }) && + "All orders must have the same size as number of instructions in " + "tree node."); + auto I = std::max_element( + NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), + [](const decltype(NumOpsWantToKeepOrder)::value_type &D1, + const decltype(NumOpsWantToKeepOrder)::value_type &D2) { + return D1.second < D2.second; + }); + if (I == NumOpsWantToKeepOrder.end() || + I->getSecond() <= NumOpsWantToKeepOriginalOrder) + return None; + + return makeArrayRef(I->getFirst()); + } + + /// Builds the correct order for root instructions. + /// If some leaves have the same instructions to be vectorized, we may + /// incorrectly evaluate the best order for the root node (it is built for the + /// vector of instructions without repeated instructions and, thus, has less + /// elements than the root node). This function builds the correct order for + /// the root node. + /// For example, if the root node is \, then the leaves + /// are \ and \. When we try to vectorize the first + /// leaf, it will be shrink to \. If instructions in this leaf should + /// be reordered, the best order will be \<1, 0\>. We need to extend this + /// order for the root node. For the root node this order should look like + /// \<3, 0, 1, 2\>. This function extends the order for the reused + /// instructions. + void findRootOrder(OrdersType &Order) { + // If the leaf has the same number of instructions to vectorize as the root + // - order must be set already. + unsigned RootSize = VectorizableTree[0]->Scalars.size(); + if (Order.size() == RootSize) + return; + SmallVector RealOrder(Order.size()); + std::swap(Order, RealOrder); + SmallVector Mask; + inversePermutation(RealOrder, Mask); + Order.assign(Mask.begin(), Mask.end()); + // The leaf has less number of instructions - need to find the true order of + // the root. + // Scan the nodes starting from the leaf back to the root. 
+ const TreeEntry *PNode = VectorizableTree.back().get(); + SmallVector Nodes(1, PNode); + SmallPtrSet Visited; + while (!Nodes.empty() && Order.size() != RootSize) { + const TreeEntry *PNode = Nodes.pop_back_val(); + if (!Visited.insert(PNode).second) + continue; + const TreeEntry &Node = *PNode; + for (const EdgeInfo &EI : Node.UserTreeIndices) + if (EI.UserTE) + Nodes.push_back(EI.UserTE); + if (Node.ReuseShuffleIndices.empty()) + continue; + // Build the order for the parent node. + OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize); + SmallVector OrderCounter(Order.size(), 0); + // The algorithm of the order extension is: + // 1. Calculate the number of the same instructions for the order. + // 2. Calculate the index of the new order: total number of instructions + // with order less than the order of the current instruction + reuse + // number of the current instruction. + // 3. The new order is just the index of the instruction in the original + // vector of the instructions. + for (unsigned I : Node.ReuseShuffleIndices) + ++OrderCounter[Order[I]]; + SmallVector CurrentCounter(Order.size(), 0); + for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) { + unsigned ReusedIdx = Node.ReuseShuffleIndices[I]; + unsigned OrderIdx = Order[ReusedIdx]; + unsigned NewIdx = 0; + for (unsigned J = 0; J < OrderIdx; ++J) + NewIdx += OrderCounter[J]; + NewIdx += CurrentCounter[OrderIdx]; + ++CurrentCounter[OrderIdx]; + assert(NewOrder[NewIdx] == RootSize && + "The order index should not be written already."); + NewOrder[NewIdx] = I; + } + std::swap(Order, NewOrder); + } + assert(Order.size() == RootSize && + "Root node is expected or the size of the order must be the same as " + "the number of elements in the root node."); + assert(llvm::all_of(Order, + [RootSize](unsigned Val) { return Val != RootSize; }) && + "All indices must be initialized"); + } /// \return The vector element size in bits to use when vectorizing the /// expression tree ending at \p V. If V is a store, the size is the width of @@ -802,10 +814,6 @@ public: return MinVecRegSize; } - unsigned getMinVF(unsigned Sz) const { - return std::max(2U, getMinVecRegSize() / Sz); - } - unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { unsigned MaxVF = MaxVFOption.getNumOccurrences() ? MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); @@ -1634,28 +1642,12 @@ private: /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { - auto &&IsSame = [VL](ArrayRef Scalars, ArrayRef Mask) { - if (Mask.size() != VL.size() && VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == Mask.size() && std::equal( - VL.begin(), VL.end(), Mask.begin(), - [Scalars](Value *V, int Idx) { return V == Scalars[Idx]; }); - }; - if (!ReorderIndices.empty()) { - // TODO: implement matching if the nodes are just reordered, still can - // treat the vector as the same if the list of scalars matches VL - // directly, without reordering. 
- SmallVector Mask; - inversePermutation(ReorderIndices, Mask); - if (VL.size() == Scalars.size()) - return IsSame(Scalars, Mask); - if (VL.size() == ReuseShuffleIndices.size()) { - ::addMask(Mask, ReuseShuffleIndices); - return IsSame(Scalars, Mask); - } - return false; - } - return IsSame(Scalars, ReuseShuffleIndices); + if (VL.size() == Scalars.size()) + return std::equal(VL.begin(), VL.end(), Scalars.begin()); + return VL.size() == ReuseShuffleIndices.size() && + std::equal( + VL.begin(), VL.end(), ReuseShuffleIndices.begin(), + [this](Value *V, int Idx) { return V == Scalars[Idx]; }); } /// A vector of scalars. @@ -1730,12 +1722,6 @@ private: } } - /// Reorders operands of the node to the given mask \p Mask. - void reorderOperands(ArrayRef Mask) { - for (ValueList &Operand : Operands) - reorderScalars(Operand, Mask); - } - /// \returns the \p OpIdx operand of this TreeEntry. ValueList &getOperand(unsigned OpIdx) { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1795,14 +1781,19 @@ private: return AltOp ? AltOp->getOpcode() : 0; } - /// When ReuseReorderShuffleIndices is empty it just returns position of \p - /// V within vector of Scalars. Otherwise, try to remap on its reuse index. + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + /// When ReuseShuffleIndices is empty it just returns position of \p V + /// within vector of Scalars. Otherwise, try to remap on its reuse index. int findLaneForValue(Value *V) const { unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); - if (!ReorderIndices.empty()) - FoundLane = ReorderIndices[FoundLane]; - assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); if (!ReuseShuffleIndices.empty()) { FoundLane = std::distance(ReuseShuffleIndices.begin(), find(ReuseShuffleIndices, FoundLane)); @@ -1886,7 +1877,7 @@ private: TreeEntry *newTreeEntry(ArrayRef VL, Optional Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef ReuseShuffleIndices = None, + ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; @@ -1899,7 +1890,7 @@ private: Optional Bundle, const InstructionsState &S, const EdgeInfo &UserTreeIdx, - ArrayRef ReuseShuffleIndices = None, + ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && @@ -1907,25 +1898,12 @@ private: VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; + Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->State = EntryState; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); - if (ReorderIndices.empty()) { - Last->Scalars.assign(VL.begin(), VL.end()); - Last->setOperations(S); - } else { - // Reorder scalars and build final mask. 
- Last->Scalars.assign(VL.size(), nullptr); - transform(ReorderIndices, Last->Scalars.begin(), - [VL](unsigned Idx) -> Value * { - if (Idx >= VL.size()) - return UndefValue::get(VL.front()->getType()); - return VL[Idx]; - }); - InstructionsState S = getSameOpcode(Last->Scalars); - Last->setOperations(S); - Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); - } + Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); + Last->setOperations(S); if (Last->State != TreeEntry::NeedToGather) { for (Value *V : VL) { assert(!getTreeEntry(V) && "Scalar already in tree!"); @@ -2477,6 +2455,14 @@ private: } }; + /// Contains orders of operations along with the number of bundles that have + /// operations in this order. It stores only those orders that require + /// reordering, if reordering is not required it is counted using \a + /// NumOpsWantToKeepOriginalOrder. + DenseMap NumOpsWantToKeepOrder; + /// Number of bundles that do not require reordering. + unsigned NumOpsWantToKeepOriginalOrder = 0; + // Analysis and block reference. Function *F; ScalarEvolution *SE; @@ -2629,438 +2615,21 @@ void BoUpSLP::eraseInstructions(ArrayRef AV) { }; } -/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses -/// contains original mask for the scalars reused in the node. Procedure -/// transform this mask in accordance with the given \p Mask. -static void reorderReuses(SmallVectorImpl &Reuses, ArrayRef Mask) { - assert(!Mask.empty() && Reuses.size() == Mask.size() && - "Expected non-empty mask."); - SmallVector Prev(Reuses.begin(), Reuses.end()); - Prev.swap(Reuses); - for (unsigned I = 0, E = Prev.size(); I < E; ++I) - if (Mask[I] != UndefMaskElem) - Reuses[Mask[I]] = Prev[I]; +void BoUpSLP::buildTree(ArrayRef Roots, + ArrayRef UserIgnoreLst) { + ExtraValueToDebugLocsMap ExternallyUsedValues; + buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); } -/// Reorders the given \p Order according to the given \p Mask. \p Order - is -/// the original order of the scalars. Procedure transforms the provided order -/// in accordance with the given \p Mask. If the resulting \p Order is just an -/// identity order, \p Order is cleared. -static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask) { - assert(!Mask.empty() && "Expected non-empty mask."); - SmallVector MaskOrder; - if (Order.empty()) { - MaskOrder.resize(Mask.size()); - std::iota(MaskOrder.begin(), MaskOrder.end(), 0); - } else { - inversePermutation(Order, MaskOrder); - } - reorderReuses(MaskOrder, Mask); - if (ShuffleVectorInst::isIdentityMask(MaskOrder)) { - Order.clear(); +void BoUpSLP::buildTree(ArrayRef Roots, + ExtraValueToDebugLocsMap &ExternallyUsedValues, + ArrayRef UserIgnoreLst) { + deleteTree(); + UserIgnoreList = UserIgnoreLst; + if (!allSameType(Roots)) return; - } - Order.assign(Mask.size(), Mask.size()); - for (unsigned I = 0, E = Mask.size(); I < E; ++I) - if (MaskOrder[I] != UndefMaskElem) - Order[MaskOrder[I]] = I; - fixupOrderingIndices(Order); -} - -void BoUpSLP::reorderTopToBottom() { - // Maps VF to the graph nodes. - DenseMap> VFToOrderedEntries; - // ExtractElement gather nodes which can be vectorized and need to handle - // their ordering. - DenseMap GathersToOrders; - // Find all reorderable nodes with the given VF. - // Currently the are vectorized loads,extracts + some gathering of extracts. 
- for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( - const std::unique_ptr &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. - if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa(TE->getMainOp()) && - !TE->isAltShuffle()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - } else if (TE->State == TreeEntry::NeedToGather && - TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa(cast(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector. - OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if (Reuse || !CurrentOrder.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - } - } - }); - - // Reorder the graph nodes according to their vectorization factor. - for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; - VF /= 2) { - auto It = VFToOrderedEntries.find(VF); - if (It == VFToOrderedEntries.end()) - continue; - // Try to find the most profitable order. We just are looking for the most - // used order and reorder scalar elements in the nodes according to this - // mostly used order. - const SmallPtrSetImpl &OrderedEntries = It->getSecond(); - // All operands are reordered and used only in this node - propagate the - // most used order to the user node. - DenseMap OrdersUses; - SmallPtrSet VisitedOps; - for (const TreeEntry *OpTE : OrderedEntries) { - // No need to reorder this nodes, still need to extend and to use shuffle, - // just need to merge reordering shuffle and the reuse shuffle. - if (!OpTE->ReuseShuffleIndices.empty()) - continue; - // Count number of orders uses. - const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) - return GathersToOrders.find(OpTE)->second; - return OpTE->ReorderIndices; - }(); - // Stores actually store the mask, not the order, need to invert. - if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && - OpTE->getOpcode() == Instruction::Store && !Order.empty()) { - SmallVector Mask; - inversePermutation(Order, Mask); - unsigned E = Order.size(); - OrdersType CurrentOrder(E, E); - transform(Mask, CurrentOrder.begin(), [E](int Idx) { - return Idx == UndefMaskElem ? E : static_cast(Idx); - }); - fixupOrderingIndices(CurrentOrder); - ++OrdersUses.try_emplace(CurrentOrder).first->getSecond(); - } else { - ++OrdersUses.try_emplace(Order).first->getSecond(); - } - } - // Set order of the user node. - if (OrdersUses.empty()) - continue; - // Choose the most used order. - ArrayRef BestOrder = OrdersUses.begin()->first; - unsigned Cnt = OrdersUses.begin()->second; - for (const auto &Pair : llvm::drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { - BestOrder = Pair.first; - Cnt = Pair.second; - } - } - // Set order of the user node. - if (BestOrder.empty()) - continue; - SmallVector Mask; - inversePermutation(BestOrder, Mask); - SmallVector MaskOrder(BestOrder.size(), UndefMaskElem); - unsigned E = BestOrder.size(); - transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { - return I < E ? static_cast(I) : UndefMaskElem; - }); - // Do an actual reordering, if profitable. 
- for (std::unique_ptr &TE : VectorizableTree) { - // Just do the reordering for the nodes with the given VF. - if (TE->Scalars.size() != VF) { - if (TE->ReuseShuffleIndices.size() == VF) { - // Need to reorder the reuses masks of the operands with smaller VF to - // be able to find the match between the graph nodes and scalar - // operands of the given node during vectorization/cost estimation. - assert(all_of(TE->UserTreeIndices, - [VF, &TE](const EdgeInfo &EI) { - return EI.UserTE->Scalars.size() == VF || - EI.UserTE->Scalars.size() == - TE->Scalars.size(); - }) && - "All users must be of VF size."); - // Update ordering of the operands with the smaller VF than the given - // one. - reorderReuses(TE->ReuseShuffleIndices, Mask); - } - continue; - } - if (TE->State == TreeEntry::Vectorize && - isa(TE->getMainOp()) && - !TE->isAltShuffle()) { - // Build correct orders for extract{element,value}, loads and - // stores. - reorderOrder(TE->ReorderIndices, Mask); - if (isa(TE->getMainOp())) - TE->reorderOperands(Mask); - } else { - // Reorder the node and its operands. - TE->reorderOperands(Mask); - assert(TE->ReorderIndices.empty() && - "Expected empty reorder sequence."); - reorderScalars(TE->Scalars, Mask); - } - if (!TE->ReuseShuffleIndices.empty()) { - // Apply reversed order to keep the original ordering of the reused - // elements to avoid extra reorder indices shuffling. - OrdersType CurrentOrder; - reorderOrder(CurrentOrder, MaskOrder); - SmallVector NewReuses; - inversePermutation(CurrentOrder, NewReuses); - addMask(NewReuses, TE->ReuseShuffleIndices); - TE->ReuseShuffleIndices.swap(NewReuses); - } - } - } -} - -void BoUpSLP::reorderBottomToTop() { - SetVector OrderedEntries; - DenseMap GathersToOrders; - // Find all reorderable leaf nodes with the given VF. - // Currently the are vectorized loads,extracts without alternate operands + - // some gathering of extracts. - SmallVector NonVectorized; - for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders, - &NonVectorized]( - const std::unique_ptr &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. - if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa(TE->getMainOp()) && - !TE->isAltShuffle()) { - OrderedEntries.insert(TE.get()); - } else if (TE->State == TreeEntry::NeedToGather && - TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa(cast(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector with a single user only. - OrdersType CurrentOrder; - bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if ((Reuse || !CurrentOrder.empty()) && - !any_of( - VectorizableTree, [&TE](const std::unique_ptr &Entry) { - return Entry->State == TreeEntry::NeedToGather && - Entry.get() != TE.get() && Entry->isSame(TE->Scalars); - })) { - OrderedEntries.insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - } - } - if (TE->State != TreeEntry::Vectorize) - NonVectorized.push_back(TE.get()); - }); - - // Checks if the operands of the users are reordarable and have only single - // use. 
- auto &&CheckOperands = - [this, &NonVectorized](const auto &Data, - SmallVectorImpl &GatherOps) { - for (unsigned I = 0, E = Data.first->getNumOperands(); I < E; ++I) { - if (any_of(Data.second, - [I](const std::pair &OpData) { - return OpData.first == I && - OpData.second->State == TreeEntry::Vectorize; - })) - continue; - ArrayRef VL = Data.first->getOperand(I); - const TreeEntry *TE = nullptr; - const auto *It = find_if(VL, [this, &TE](Value *V) { - TE = getTreeEntry(V); - return TE; - }); - if (It != VL.end() && TE->isSame(VL)) - return false; - TreeEntry *Gather = nullptr; - if (count_if(NonVectorized, [VL, &Gather](TreeEntry *TE) { - assert(TE->State != TreeEntry::Vectorize && - "Only non-vectorized nodes are expected."); - if (TE->isSame(VL)) { - Gather = TE; - return true; - } - return false; - }) > 1) - return false; - if (Gather) - GatherOps.push_back(Gather); - } - return true; - }; - // 1. Propagate order to the graph nodes, which use only reordered nodes. - // I.e., if the node has operands, that are reordered, try to make at least - // one operand order in the natural order and reorder others + reorder the - // user node itself. - SmallPtrSet Visited; - while (!OrderedEntries.empty()) { - // 1. Filter out only reordered nodes. - // 2. If the entry has multiple uses - skip it and jump to the next node. - MapVector>> Users; - SmallVector Filtered; - for (TreeEntry *TE : OrderedEntries) { - if (!(TE->State == TreeEntry::Vectorize || - (TE->State == TreeEntry::NeedToGather && - TE->getOpcode() == Instruction::ExtractElement)) || - TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || - !all_of(drop_begin(TE->UserTreeIndices), - [TE](const EdgeInfo &EI) { - return EI.UserTE == TE->UserTreeIndices.front().UserTE; - }) || - !Visited.insert(TE).second) { - Filtered.push_back(TE); - continue; - } - // Build a map between user nodes and their operands order to speedup - // search. The graph currently does not provide this dependency directly. - for (EdgeInfo &EI : TE->UserTreeIndices) { - TreeEntry *UserTE = EI.UserTE; - auto It = Users.find(UserTE); - if (It == Users.end()) - It = Users.insert({UserTE, {}}).first; - It->second.emplace_back(EI.EdgeIdx, TE); - } - } - // Erase filtered entries. - for_each(Filtered, - [&OrderedEntries](TreeEntry *TE) { OrderedEntries.remove(TE); }); - for (const auto &Data : Users) { - // Check that operands are used only in the User node. - SmallVector GatherOps; - if (!CheckOperands(Data, GatherOps)) { - for_each(Data.second, - [&OrderedEntries](const std::pair &Op) { - OrderedEntries.remove(Op.second); - }); - continue; - } - // All operands are reordered and used only in this node - propagate the - // most used order to the user node. - DenseMap OrdersUses; - SmallPtrSet VisitedOps; - for (const auto &Op : Data.second) { - TreeEntry *OpTE = Op.second; - if (!OpTE->ReuseShuffleIndices.empty()) - continue; - const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & { - if (OpTE->State == TreeEntry::NeedToGather) - return GathersToOrders.find(OpTE)->second; - return OpTE->ReorderIndices; - }(); - // Stores actually store the mask, not the order, need to invert. - if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() && - OpTE->getOpcode() == Instruction::Store && !Order.empty()) { - SmallVector Mask; - inversePermutation(Order, Mask); - unsigned E = Order.size(); - OrdersType CurrentOrder(E, E); - transform(Mask, CurrentOrder.begin(), [E](int Idx) { - return Idx == UndefMaskElem ? 
E : static_cast(Idx); - }); - fixupOrderingIndices(CurrentOrder); - ++OrdersUses.try_emplace(CurrentOrder).first->getSecond(); - } else { - ++OrdersUses.try_emplace(Order).first->getSecond(); - } - if (VisitedOps.insert(OpTE).second) - OrdersUses.try_emplace({}, 0).first->getSecond() += - OpTE->UserTreeIndices.size(); - --OrdersUses[{}]; - } - // If no orders - skip current nodes and jump to the next one, if any. - if (OrdersUses.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair &Op) { - OrderedEntries.remove(Op.second); - }); - continue; - } - // Choose the best order. - ArrayRef BestOrder = OrdersUses.begin()->first; - unsigned Cnt = OrdersUses.begin()->second; - for (const auto &Pair : llvm::drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { - BestOrder = Pair.first; - Cnt = Pair.second; - } - } - // Set order of the user node (reordering of operands and user nodes). - if (BestOrder.empty()) { - for_each(Data.second, - [&OrderedEntries](const std::pair &Op) { - OrderedEntries.remove(Op.second); - }); - continue; - } - // Erase operands from OrderedEntries list and adjust their orders. - VisitedOps.clear(); - SmallVector Mask; - inversePermutation(BestOrder, Mask); - SmallVector MaskOrder(BestOrder.size(), UndefMaskElem); - unsigned E = BestOrder.size(); - transform(BestOrder, MaskOrder.begin(), [E](unsigned I) { - return I < E ? static_cast(I) : UndefMaskElem; - }); - for (const std::pair &Op : Data.second) { - TreeEntry *TE = Op.second; - OrderedEntries.remove(TE); - if (!VisitedOps.insert(TE).second) - continue; - if (!TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) { - // Just reorder reuses indices. - reorderReuses(TE->ReuseShuffleIndices, Mask); - continue; - } - // Gathers are processed separately. - if (TE->State != TreeEntry::Vectorize) - continue; - assert((BestOrder.size() == TE->ReorderIndices.size() || - TE->ReorderIndices.empty()) && - "Non-matching sizes of user/operand entries."); - reorderOrder(TE->ReorderIndices, Mask); - } - // For gathers just need to reorder its scalars. - for (TreeEntry *Gather : GatherOps) { - if (!Gather->ReuseShuffleIndices.empty()) - continue; - assert(Gather->ReorderIndices.empty() && - "Unexpected reordering of gathers."); - reorderScalars(Gather->Scalars, Mask); - OrderedEntries.remove(Gather); - } - // Reorder operands of the user node and set the ordering for the user - // node itself. - if (Data.first->State != TreeEntry::Vectorize || - !isa( - Data.first->getMainOp()) || - Data.first->isAltShuffle()) - Data.first->reorderOperands(Mask); - if (!isa(Data.first->getMainOp()) || - Data.first->isAltShuffle()) { - reorderScalars(Data.first->Scalars, Mask); - reorderOrder(Data.first->ReorderIndices, MaskOrder); - if (Data.first->ReuseShuffleIndices.empty() && - !Data.first->ReorderIndices.empty()) { - // Insert user node to the list to try to sink reordering deeper in - // the graph. - OrderedEntries.insert(Data.first); - } - } else { - reorderOrder(Data.first->ReorderIndices, Mask); - } - } - } -} + buildTree_rec(Roots, 0, EdgeInfo()); -void BoUpSLP::buildExternalUses( - const ExtraValueToDebugLocsMap &ExternallyUsedValues) { // Collect the values that we need to extract from the tree. 
for (auto &TEPtr : VectorizableTree) { TreeEntry *Entry = TEPtr.get(); @@ -3119,80 +2688,6 @@ void BoUpSLP::buildExternalUses( } } -void BoUpSLP::buildTree(ArrayRef Roots, - ArrayRef UserIgnoreLst) { - deleteTree(); - UserIgnoreList = UserIgnoreLst; - if (!allSameType(Roots)) - return; - buildTree_rec(Roots, 0, EdgeInfo()); -} - -namespace { -/// Tracks the state we can represent the loads in the given sequence. -enum class LoadsState { Gather, Vectorize, ScatterVectorize }; -} // anonymous namespace - -/// Checks if the given array of loads can be represented as a vectorized, -/// scatter or just simple gather. -static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl &Order, - SmallVectorImpl &PointerOps) { - // Check that a vectorized load would load the same memory as a scalar - // load. For example, we don't want to vectorize loads that are smaller - // than 8-bit. Even though we have a packed struct {} LLVM - // treats loading/storing it as an i8 struct. If we vectorize loads/stores - // from such a struct, we read/write packed bits disagreeing with the - // unvectorized version. - Type *ScalarTy = VL0->getType(); - - if (DL.getTypeSizeInBits(ScalarTy) != DL.getTypeAllocSizeInBits(ScalarTy)) - return LoadsState::Gather; - - // Make sure all loads in the bundle are simple - we can't vectorize - // atomic or volatile loads. - PointerOps.clear(); - PointerOps.resize(VL.size()); - auto *POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); - if (!L->isSimple()) - return LoadsState::Gather; - *POIter = L->getPointerOperand(); - ++POIter; - } - - Order.clear(); - // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order)) { - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; - } - Optional Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - // Check that the sorted loads are consecutive. - if (static_cast(*Diff) == VL.size() - 1) - return LoadsState::Vectorize; - Align CommonAlignment = cast(VL0)->getAlign(); - for (Value *V : VL) - CommonAlignment = - commonAlignment(CommonAlignment, cast(V)->getAlign()); - if (TTI.isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), - CommonAlignment)) - return LoadsState::ScatterVectorize; - } - - return LoadsState::Gather; -} - void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -3302,7 +2797,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Check that every instruction appears once in this bundle. 
- SmallVector ReuseShuffleIndicies; + SmallVector ReuseShuffleIndicies; SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { @@ -3396,6 +2891,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); + ++NumOpsWantToKeepOriginalOrder; newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time @@ -3413,11 +2909,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, dbgs() << " " << Idx; dbgs() << "\n"; }); - fixupOrderingIndices(CurrentOrder); // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -3437,14 +2934,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check that we have a buildvector and not a shuffle of 2 or more // different vectors. ValueSet SourceVectors; - int MinIdx = std::numeric_limits::max(); - for (Value *V : VL) { + for (Value *V : VL) SourceVectors.insert(cast(V)->getOperand(0)); - Optional Idx = *getInsertIndex(V, 0); - if (!Idx || *Idx == UndefMaskElem) - continue; - MinIdx = std::min(MinIdx, *Idx); - } if (count_if(VL, [&SourceVectors](Value *V) { return !SourceVectors.contains(V); @@ -3452,35 +2943,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Found 2nd source vector - cancel. LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " "different source vectors.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } - auto OrdCompare = [](const std::pair &P1, - const std::pair &P2) { - return P1.first > P2.first; - }; - PriorityQueue, SmallVector>, - decltype(OrdCompare)> - Indices(OrdCompare); - for (int I = 0, E = VL.size(); I < E; ++I) { - Optional Idx = *getInsertIndex(VL[I], 0); - if (!Idx || *Idx == UndefMaskElem) - continue; - Indices.emplace(*Idx, I); - } - OrdersType CurrentOrder(VL.size(), VL.size()); - bool IsIdentity = true; - for (int I = 0, E = VL.size(); I < E; ++I) { - CurrentOrder[Indices.top().second] = I; - IsIdentity &= Indices.top().second == I; - Indices.pop(); - } - if (IsIdentity) - CurrentOrder.clear(); - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - None, CurrentOrder); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); constexpr int NumOps = 2; @@ -3491,7 +2960,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, TE->setOperand(I, VectorOperands[I]); } - buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1}); + buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0}); return; } case Instruction::Load: { @@ -3501,52 +2970,90 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. 
- SmallVector PointerOps; - OrdersType CurrentOrder; - TreeEntry *TE = nullptr; - switch (canVectorizeLoads(VL, VL0, *TTI, *DL, *SE, CurrentOrder, - PointerOps)) { - case LoadsState::Vectorize: - if (CurrentOrder.empty()) { - // Original loads are consecutive and does not require reordering. - TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); - } else { - fixupOrderingIndices(CurrentOrder); - // Need to reorder. - TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); - } - TE->setOperandsInOrder(); - break; - case LoadsState::ScatterVectorize: - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); - break; - case LoadsState::Gather: + Type *ScalarTy = VL0->getType(); + + if (DL->getTypeSizeInBits(ScalarTy) != + DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); -#ifndef NDEBUG - Type *ScalarTy = VL0->getType(); - if (DL->getTypeSizeInBits(ScalarTy) != - DL->getTypeAllocSizeInBits(ScalarTy)) - LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); - else if (any_of(VL, [](Value *V) { - return !cast(V)->isSimple(); - })) + LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); + return; + } + + // Make sure all loads in the bundle are simple - we can't vectorize + // atomic or volatile loads. + SmallVector PointerOps(VL.size()); + auto POIter = PointerOps.begin(); + for (Value *V : VL) { + auto *L = cast(V); + if (!L->isSimple()) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); - else - LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); -#endif // NDEBUG - break; + return; + } + *POIter = L->getPointerOperand(); + ++POIter; + } + + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + Optional Diff = getPointersDiff( + ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); + // Check that the sorted loads are consecutive. + if (static_cast(*Diff) == VL.size() - 1) { + if (CurrentOrder.empty()) { + // Original loads are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); + } else { + // Need to reorder. 
+ TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + TE->setOperandsInOrder(); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; + } + return; + } + Align CommonAlignment = cast(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast(V)->getAlign()); + if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), + CommonAlignment)) { + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, + S, UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() + << "SLP: added a vector of non-consecutive loads.\n"); + return; + } } + + LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -3793,19 +3300,21 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (static_cast(*Dist) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { - fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } return; } @@ -4136,42 +3645,25 @@ computeExtractCost(ArrayRef VL, FixedVectorType *VecTy, return Cost; } -/// Build shuffle mask for shuffle graph entries and lists of main and alternate -/// operations operands. -static void -buildSuffleEntryMask(ArrayRef VL, ArrayRef ReorderIndices, - ArrayRef ReusesIndices, - const function_ref IsAltOp, - SmallVectorImpl &Mask, - SmallVectorImpl *OpScalars = nullptr, - SmallVectorImpl *AltScalars = nullptr) { - unsigned Sz = VL.size(); - Mask.assign(Sz, UndefMaskElem); - SmallVector OrderMask; - if (!ReorderIndices.empty()) - inversePermutation(ReorderIndices, OrderMask); - for (unsigned I = 0; I < Sz; ++I) { - unsigned Idx = I; - if (!ReorderIndices.empty()) - Idx = OrderMask[I]; - auto *OpInst = cast(VL[I]); - if (IsAltOp(OpInst)) { - Mask[Idx] = Sz + I; - if (AltScalars) - AltScalars->push_back(OpInst); - } else { - Mask[Idx] = I; - if (OpScalars) - OpScalars->push_back(OpInst); - } +/// Shuffles \p Mask in accordance with the given \p SubMask. +static void addMask(SmallVectorImpl &Mask, ArrayRef SubMask) { + if (SubMask.empty()) + return; + if (Mask.empty()) { + Mask.append(SubMask.begin(), SubMask.end()); + return; } - if (!ReusesIndices.empty()) { - SmallVector NewMask(ReusesIndices.size(), UndefMaskElem); - transform(ReusesIndices, NewMask.begin(), [&Mask](int Idx) { - return Idx != UndefMaskElem ? 
Mask[Idx] : UndefMaskElem; - }); - Mask.swap(NewMask); + SmallVector NewMask(SubMask.size(), SubMask.size()); + int TermValue = std::min(Mask.size(), SubMask.size()); + for (int I = 0, E = SubMask.size(); I < E; ++I) { + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || + Mask[SubMask[I]] >= TermValue) { + NewMask[I] = UndefMaskElem; + continue; + } + NewMask[I] = Mask[SubMask[I]]; } + Mask.swap(NewMask); } InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, @@ -4335,86 +3827,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); - // Improve gather cost for gather of loads, if we can group some of the - // loads into vector loads. - if (VL.size() > 2 && E->getOpcode() == Instruction::Load && - !E->isAltShuffle()) { - BoUpSLP::ValueSet VectorizedLoads; - unsigned StartIdx = 0; - unsigned VF = VL.size() / 2; - unsigned VectorizedCnt = 0; - unsigned ScatterVectorizeCnt = 0; - const unsigned Sz = DL->getTypeSizeInBits(E->getMainOp()->getType()); - for (unsigned MinVF = getMinVF(2 * Sz); VF >= MinVF; VF /= 2) { - for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End; - Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); - if (!VectorizedLoads.count(Slice.front()) && - !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState LS = canVectorizeLoads(Slice, Slice.front(), *TTI, *DL, - *SE, CurrentOrder, PointerOps); - switch (LS) { - case LoadsState::Vectorize: - case LoadsState::ScatterVectorize: - // Mark the vectorized loads so that we don't vectorize them - // again. - if (LS == LoadsState::Vectorize) - ++VectorizedCnt; - else - ++ScatterVectorizeCnt; - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += VF; - break; - case LoadsState::Gather: - break; - } - } - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= VL.size()) - break; - // Found vectorizable parts - exit. - if (!VectorizedLoads.empty()) - break; - } - if (!VectorizedLoads.empty()) { - InstructionCost GatherCost = 0; - // Get the cost for gathered loads. - for (unsigned I = 0, End = VL.size(); I < End; I += VF) { - if (VectorizedLoads.contains(VL[I])) - continue; - GatherCost += getGatherCost(VL.slice(I, VF)); - } - // The cost for vectorized loads. - InstructionCost ScalarsCost = 0; - for (Value *V : VectorizedLoads) { - auto *LI = cast(V); - ScalarsCost += TTI->getMemoryOpCost( - Instruction::Load, LI->getType(), LI->getAlign(), - LI->getPointerAddressSpace(), CostKind, LI); - } - auto *LI = cast(E->getMainOp()); - auto *LoadTy = FixedVectorType::get(LI->getType(), VF); - Align Alignment = LI->getAlign(); - GatherCost += - VectorizedCnt * - TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment, - LI->getPointerAddressSpace(), CostKind, LI); - GatherCost += ScatterVectorizeCnt * - TTI->getGatherScatterOpCost( - Instruction::Load, LoadTy, LI->getPointerOperand(), - /*VariableMask=*/false, Alignment, CostKind, LI); - // Add the cost for the subvectors shuffling. 
- GatherCost += ((VL.size() - VF) / VF) * - TTI->getShuffleCost(TTI::SK_Select, VecTy); - return ReuseShuffleCost + GatherCost - ScalarsCost; - } - } return ReuseShuffleCost + getGatherCost(VL); } InstructionCost CommonCost = 0; @@ -4507,33 +3919,29 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, return CommonCost; } case Instruction::InsertElement: { - assert(E->ReuseShuffleIndices.empty() && - "Unique insertelements only are expected."); auto *SrcVecTy = cast(VL0->getType()); unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); APInt DemandedElts = APInt::getNullValue(NumElts); // TODO: Add support for Instruction::InsertValue. - SmallVector Mask; - if (!E->ReorderIndices.empty()) { - inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); - } else { - Mask.assign(NumElts, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); - } - unsigned Offset = *getInsertIndex(VL0, 0); + unsigned Offset = UINT_MAX; bool IsIdentity = true; - SmallVector PrevMask(NumElts, UndefMaskElem); - Mask.swap(PrevMask); + SmallVector ShuffleMask(NumElts, UndefMaskElem); for (unsigned I = 0; I < NumScalars; ++I) { - Optional InsertIdx = getInsertIndex(VL[PrevMask[I]], 0); + Optional InsertIdx = getInsertIndex(VL[I], 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - DemandedElts.setBit(*InsertIdx); - IsIdentity &= *InsertIdx - Offset == I; - Mask[*InsertIdx - Offset] = I; + unsigned Idx = *InsertIdx; + DemandedElts.setBit(Idx); + if (Idx < Offset) { + Offset = Idx; + IsIdentity &= I == 0; + } else { + assert(Idx >= Offset && "Failed to find vector index offset"); + IsIdentity &= Idx - Offset == I; + } + ShuffleMask[Idx] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -4548,23 +3956,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TargetTransformInfo::SK_PermuteSingleSrc, FixedVectorType::get(SrcVecTy->getElementType(), Sz)); } else if (!IsIdentity) { - auto *FirstInsert = - cast(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, - cast(V)->getOperand(0)); - })); - if (isa(FirstInsert->getOperand(0))) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); - } else { - SmallVector InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { - if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - Cost += - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); - } + Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, + ShuffleMask); } return Cost; @@ -4846,16 +4239,14 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI::CastContextHint::None, CostKind); } - SmallVector Mask; - buildSuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask); - CommonCost = - TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy, Mask); + SmallVector Mask(E->Scalars.size()); + for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) { + auto *OpInst = cast(E->Scalars[I]); + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? 
End : 0); + } + VecCost += + TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return CommonCost + VecCost - ScalarCost; } @@ -5778,17 +5169,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { - assert( - (E->ReorderIndices.empty() || E != VectorizableTree.front().get()) && - "PHI reordering is free."); auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; - ShuffleBuilder.addInversedMask(E->ReorderIndices); - ShuffleBuilder.addMask(E->ReuseShuffleIndices); - V = ShuffleBuilder.finalize(V); + if (NeedToShuffleReuses) + V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; @@ -5839,41 +5226,48 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return NewV; } case Instruction::InsertElement: { - assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); - Builder.SetInsertPoint(cast(E->Scalars.back())); + Builder.SetInsertPoint(VL0); Value *V = vectorizeTree(E->getOperand(1)); - // Create InsertVector shuffle if necessary - auto *FirstInsert = cast(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, cast(V)->getOperand(0)); - })); const unsigned NumElts = - cast(FirstInsert->getType())->getNumElements(); + cast(VL0->getType())->getNumElements(); const unsigned NumScalars = E->Scalars.size(); - unsigned Offset = *getInsertIndex(VL0, 0); - assert(Offset < NumElts && "Failed to find vector index offset"); - - // Create shuffle to resize vector - SmallVector Mask; - if (!E->ReorderIndices.empty()) { - inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); - } else { - Mask.assign(NumElts, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); - } // Create InsertVector shuffle if necessary + Instruction *FirstInsert = nullptr; bool IsIdentity = true; - SmallVector PrevMask(NumElts, UndefMaskElem); - Mask.swap(PrevMask); + unsigned Offset = UINT_MAX; for (unsigned I = 0; I < NumScalars; ++I) { - Value *Scalar = E->Scalars[PrevMask[I]]; + Value *Scalar = E->Scalars[I]; + if (!FirstInsert && + !is_contained(E->Scalars, cast(Scalar)->getOperand(0))) + FirstInsert = cast(Scalar); Optional InsertIdx = getInsertIndex(Scalar, 0); if (!InsertIdx || *InsertIdx == UndefMaskElem) continue; - IsIdentity &= *InsertIdx - Offset == I; - Mask[*InsertIdx - Offset] = I; + unsigned Idx = *InsertIdx; + if (Idx < Offset) { + Offset = Idx; + IsIdentity &= I == 0; + } else { + assert(Idx >= Offset && "Failed to find vector index offset"); + IsIdentity &= Idx - Offset == I; + } + } + assert(Offset < NumElts && "Failed to find vector index offset"); + + // Create shuffle to resize vector + SmallVector Mask(NumElts, UndefMaskElem); + if (!IsIdentity) { + for (unsigned I = 0; I < NumScalars; ++I) { + Value *Scalar = E->Scalars[I]; + Optional InsertIdx = getInsertIndex(Scalar, 0); + if (!InsertIdx || *InsertIdx == UndefMaskElem) + continue; + Mask[*InsertIdx - Offset] = I; + } + } else { + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); } if (!IsIdentity || NumElts != NumScalars) V = Builder.CreateShuffleVector(V, Mask); @@ -5920,7 +5314,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = 
cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5943,7 +5336,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5964,7 +5356,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -5988,7 +5379,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6032,7 +5422,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6044,6 +5433,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); setInsertPointAfterBundle(E); LoadInst *LI = cast(VL0); @@ -6081,7 +5473,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - auto *SI = cast(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast( + IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); @@ -6140,7 +5534,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6207,7 +5600,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); - ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -6255,14 +5647,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. 
ValueList OpScalars, AltScalars; - SmallVector Mask; - buildSuffleEntryMask( - E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices, - [E](Instruction *I) { - assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); - return I->getOpcode() == E->getAltOpcode(); - }, - Mask, &OpScalars, &AltScalars); + unsigned Sz = E->Scalars.size(); + SmallVector Mask(Sz); + for (unsigned I = 0; I < Sz; ++I) { + auto *OpInst = cast(E->Scalars[I]); + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { + Mask[I] = Sz + I; + AltScalars.push_back(E->Scalars[I]); + } else { + Mask[I] = I; + OpScalars.push_back(E->Scalars[I]); + } + } propagateIRFlags(V0, OpScalars); propagateIRFlags(V1, AltScalars); @@ -6270,6 +5667,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; @@ -7445,6 +6843,44 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } +/// Order may have elements assigned special value (size) which is out of +/// bounds. Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and used in order to avoid undef values +/// have effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indices for undef values positions. +/// As an example below Order has two undef positions and they have assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +/// \returns Fixed ordering. +static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef Order) { + BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); + const unsigned Sz = NewOrder.size(); + SmallBitVector UsedIndices(Sz); + SmallVector MaskedIndices; + for (int I = 0, E = NewOrder.size(); I < E; ++I) { + if (NewOrder[I] < Sz) + UsedIndices.set(NewOrder[I]); + else + MaskedIndices.push_back(I); + } + if (MaskedIndices.empty()) + return NewOrder; + SmallVector AvailableIndices(MaskedIndices.size()); + unsigned Cnt = 0; + int Idx = UsedIndices.find_first(); + do { + AvailableIndices[Cnt] = Idx; + Idx = UsedIndices.find_next(Idx); + ++Cnt; + } while (Idx > 0); + assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); + for (int I = 0, E = MaskedIndices.size(); I < E; ++I) + NewOrder[MaskedIndices[I]] = AvailableIndices[I]; + return NewOrder; +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -7460,13 +6896,19 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, << "\n"); R.buildTree(Chain); + Optional> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. 
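// Illustrative sketch, not part of this patch: a self-contained restatement of
// what the fixupOrderingIndices() helper restored above is documented to do,
// namely hand the still-unused indices to the out-of-bounds ("undef") entries,
// reproducing the doc-comment example "6 9 5 4 9 2 1 0" -> "6 3 5 4 7 2 1 0".
// Plain STL containers replace SmallVector/SmallBitVector here, and the
// function name is hypothetical; it only mirrors the real helper.
#include <cassert>
#include <vector>

static std::vector<unsigned> fixupOrderingIndicesSketch(std::vector<unsigned> Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false); // which in-bounds indices already appear
  std::vector<unsigned> MaskedSlots; // positions holding out-of-bounds values
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;
    else
      MaskedSlots.push_back(I);
  }
  // Assign the unused indices, in ascending order, to the masked positions.
  unsigned Next = 0;
  for (unsigned Slot : MaskedSlots) {
    while (Next < Sz && Used[Next])
      ++Next;
    assert(Next < Sz && "more masked slots than free indices");
    Order[Slot] = Next++;
  }
  return Order;
}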
+ SmallVector ReorderedOps(Chain.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } if (R.isTreeTinyAndNotFullyVectorizable()) return false; if (R.isLoadCombineCandidate()) return false; - R.reorderTopToBottom(); - R.reorderBottomToTop(); - R.buildExternalUses(); R.computeMinimumValueSizes(); @@ -7589,7 +7031,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, unsigned EltSize = R.getVectorElementSize(Operands[0]); unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); - unsigned MinVF = R.getMinVF(EltSize); + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts); @@ -7662,10 +7104,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = {A, B}; - return tryToVectorizeList(VL, R); + return tryToVectorizeList(VL, R, /*AllowReorder=*/true); } -bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { +bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, + bool AllowReorder) { if (VL.size() < 2) return false; @@ -7699,7 +7142,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { } unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = R.getMinVF(Sz); + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF); if (MaxVF < 2) { @@ -7752,11 +7195,18 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { << "\n"); R.buildTree(Ops); + if (AllowReorder) { + Optional> Order = R.bestOrder(); + if (Order) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector ReorderedOps(Ops.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [Ops](const unsigned Idx) { return Ops[Idx]; }); + R.buildTree(ReorderedOps); + } + } if (R.isTreeTinyAndNotFullyVectorizable()) continue; - R.reorderTopToBottom(); - R.reorderBottomToTop(); - R.buildExternalUses(); R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); @@ -8400,14 +7850,22 @@ public: unsigned i = 0; while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { ArrayRef VL(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, IgnoreList); + V.buildTree(VL, ExternallyUsedValues, IgnoreList); + Optional> Order = V.bestOrder(); + if (Order) { + assert(Order->size() == VL.size() && + "Order size must be the same as number of vectorized " + "instructions."); + // TODO: reorder tree nodes without tree rebuilding. + SmallVector ReorderedOps(VL.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [VL](const unsigned Idx) { return VL[Idx]; }); + V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); + } if (V.isTreeTinyAndNotFullyVectorizable()) break; if (V.isLoadCombineReductionCandidate(RdxKind)) break; - V.reorderTopToBottom(); - V.reorderBottomToTop(); - V.buildExternalUses(ExternallyUsedValues); // For a poison-safe boolean logic reduction, do not replace select // instructions with logic ops. 
All reduced values will be frozen (see @@ -8880,7 +8338,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, @@ -8895,7 +8353,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); - return tryToVectorizeList(BuildVectorInsts, R); + return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true); } bool SLPVectorizerPass::vectorizeSimpleInstructions( @@ -9087,7 +8545,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // So allow tryToVectorizeList to reorder them if it is beneficial. This // is done when there are exactly two elements since tryToVectorizeList // asserts that there are only two values when AllowReorder is true. - if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) { + if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, + /*AllowReorder=*/true)) { // Success start over because instructions might have been changed. HaveVectorizedPhiNodes = true; Changed = true; @@ -9098,7 +8557,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Final attempt to vectorize phis with the same types. if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) { - if (Candidates.size() > 1 && tryToVectorizeList(Candidates, R)) { + if (Candidates.size() > 1 && + tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) { // Success start over because instructions might have been changed. 
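// Illustrative sketch, not part of this patch: the common shape this revert
// restores at the vectorizeStoreChain, tryToVectorizeList and
// horizontal-reduction call sites above -- build the tree, ask it for a
// preferred order via bestOrder(), permute the root scalars, and rebuild,
// instead of the removed reorderTopToBottom()/reorderBottomToTop()/
// buildExternalUses() sequence. SLPTreeSketch and buildWithBestOrder are
// hypothetical simplifications, not the real BoUpSLP API.
#include <optional>
#include <vector>

struct SLPTreeSketch {
  std::vector<int> Roots; // stand-in for the scalar roots handed to buildTree()
  void buildTree(const std::vector<int> &VL) { Roots = VL; }
  // Placeholder: the real bestOrder() inspects the built tree and may return a
  // preferred permutation of the roots (possibly containing masked entries).
  std::optional<std::vector<unsigned>> bestOrder() const { return std::nullopt; }
};

void buildWithBestOrder(SLPTreeSketch &R, const std::vector<int> &VL) {
  R.buildTree(VL);
  if (auto Order = R.bestOrder()) {
    // The real call sites first run *Order through fixupOrderingIndices() (see
    // the sketch near vectorizeStoreChain above) so every entry is in bounds.
    std::vector<int> Reordered(VL.size());
    for (unsigned I = 0, E = Order->size(); I < E; ++I)
      Reordered[I] = VL[(*Order)[I]];
    R.buildTree(Reordered); // TODO in the original code: reorder without rebuilding
  }
  // Cost modelling and vectorizeTree() would follow here in the real pass.
}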
HaveVectorizedPhiNodes = true; Changed = true; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll index 74ff392..dd722bd 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -6,14 +6,16 @@ target triple = "aarch64--linux-gnu" define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]] -; CHECK-NEXT: ret <2 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP9]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -72,14 +74,16 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) { define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: ret <4 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -110,14 +114,16 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> 
%v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -223,20 +229,22 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: ret i32 [[TMP13]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] 
= xor <4 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: ret i32 [[TMP15]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll index 9ddbfa4..f27b0f7 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -6,14 +6,16 @@ target triple = "aarch64--linux-gnu" define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: @build_vec_v2i64( -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i64> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP6]], [[TMP3]] -; CHECK-NEXT: ret <2 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[V0:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[V1:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i64> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: ret <2 x i64> [[TMP9]] ; %v0.0 = extractelement <2 x i64> %v0, i32 0 %v0.1 = extractelement <2 x i64> %v0, i32 1 @@ -72,14 +74,16 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) { define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: ret <4 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ 
-110,14 +114,16 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[SHUFFLE]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -223,20 +229,22 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], -; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: ret i32 [[TMP13]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP10]], 
+; CHECK-NEXT: [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: ret i32 [[TMP15]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index c9cb895..4cca20e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -338,26 +338,32 @@ define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias noca ret void } -define void @vec_shuff_reorder() #0 { -; CHECK-LABEL: @vec_shuff_reorder( +; Dont vectorization of following code for float data type as sub is not commutative- +; fc[0] = fb[0]+fa[0]; +; fc[1] = fa[1]-fb[1]; +; fc[2] = fa[2]+fb[2]; +; fc[3] = fb[3]-fa[3]; +; In the above code we can swap the 1st and 2nd operation as fadd is commutative +; but not 2nd or 4th as fsub is not commutative. + +define void @no_vec_shuff_reorder() #0 { +; CHECK-LABEL: @no_vec_shuff_reorder( ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP5]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP6]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] +; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* 
getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] +; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] +; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] +; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll index 7b914b4..676391f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -61,12 +61,12 @@ define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) { ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP0]], [[TMP3]] -; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[SHUFFLE]], [[TMP3]] ; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[SHUFFLE]] +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] ; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], ; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> ; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll index c0362f7..ac9bef7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -30,11 +30,11 @@ define void @fextr1(double* %ptr) { ; CHECK-LABEL: @fextr1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <2 x double>, <2 x double>* undef, align 16 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[LD]], -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; 
CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SHUFFLE]], ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP1]], align 4 +; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll index d785f8e..17b6bd2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -8,12 +8,12 @@ define i32 @fn1() { ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> , i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll index 11e313b..ec25ceb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -11,21 +11,21 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = 
getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 @@ -69,22 +69,22 @@ define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias noca ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 3 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP10]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll index 33ad7c4..3e1c043 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -26,31 +26,32 @@ define dso_local void @j() local_unnamed_addr { ; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], ; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = 
extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP23]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll index 0e5d3bc..52a1bc6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -15,8 +15,8 @@ define void @hoge() { ; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i16> [[TMP0]] to <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <2 x i32> , [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], undef -; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE10]], +; CHECK-NEXT: [[SHUFFLE10:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add 
<4 x i32> [[SHUFFLE10]], ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index c8a39bc..b78de44 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -130,8 +130,8 @@ define dso_local void @test_unordered_splits(%struct.S* nocapture %p) local_unna ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 @@ -139,8 +139,8 @@ define dso_local void @test_unordered_splits(%struct.S* nocapture %p) local_unna ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[G20]] to <4 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX30]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll index cff931f..196a5df 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,15 +7,15 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; 
CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A7:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -57,15 +57,15 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A6:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -111,15 +111,15 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> -; 
CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[A4:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] -- 2.7.4
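A side note on the comment restored in llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
(the no_vec_shuff_reorder test): the reasoning there is that only the fadd lanes
tolerate having their operands swapped into a common (fa, fb) order, because fsub
is not commutative. The tiny standalone program below is illustrative only, not
part of the patch, and uses arbitrary values to make that concrete.

#include <cassert>

int main() {
  // Lanes as written in the test:
  //   fc[0] = fb[0] + fa[0];  fc[1] = fa[1] - fb[1];
  //   fc[2] = fa[2] + fb[2];  fc[3] = fb[3] - fa[3];
  float fa[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float fb[4] = {8.0f, 7.0f, 6.0f, 5.0f};

  // Lane 0 may be rewritten with operands in (fa, fb) order: fadd commutes.
  assert(fb[0] + fa[0] == fa[0] + fb[0]);

  // Lane 3 may not: fsub does not commute, so forcing a uniform (fa, fb)
  // operand order would change the value (1.0f vs -1.0f for these inputs).
  assert(fb[3] - fa[3] != fa[3] - fb[3]);
  return 0;
}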