From ce39bdbd65e840f1b58a642f8d361898ac99cef3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 19 Sep 2022 12:43:30 -0700 Subject: [PATCH] [SLP][NFC]Reorder gather nodes with reused scalars, NFC. The compiler does not reorder the gather nodes with reused scalars, just does it for opernads of the user nodes. This currently does not affect the compiler but breaks internal logic of the SLP graph. In future, it is supposed to actually use all nodes instead of just list of operands and this will affect the vectorization result. Also, did some early check to avoid complex logic in cost estimation analysis, should improve compiler time a bit. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 55 +++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 374e2c2..4706543 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2116,6 +2116,10 @@ private: ArrayRef ReorderableGathers, SmallVectorImpl &GatherOps); + /// Checks if the given \p TE is a gather node with clustered reused scalars + /// and reorders it per given \p Mask. + void reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const; + /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, /// if any. If it is not vectorized (gather node), returns nullptr. TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) { @@ -3785,6 +3789,40 @@ Optional BoUpSLP::getReorderingData(const TreeEntry &TE, return None; } +/// Checks if the given mask is a "clustered" mask with the same clusters of +/// size \p Sz, which are not identity submasks. +static bool isRepeatedNonIdentityClusteredMask(ArrayRef Mask, + unsigned Sz) { + ArrayRef FirstCluster = Mask.slice(0, Sz); + if (ShuffleVectorInst::isIdentityMask(FirstCluster)) + return false; + for (unsigned I = 0, E = Mask.size(); I < E; I += Sz) { + ArrayRef Cluster = Mask.slice(I, Sz); + if (Cluster != FirstCluster) + return false; + } + return true; +} + +void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const { + // For vectorized and non-clustered reused - just reorder reuses mask. + const unsigned Sz = TE.Scalars.size(); + if (TE.State != TreeEntry::NeedToGather || !TE.ReorderIndices.empty() || + !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, + Sz) || + !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) { + reorderReuses(TE.ReuseShuffleIndices, Mask); + return; + } + // Try to improve gathered nodes with clustered reuses, if possible. + reorderScalars(TE.Scalars, makeArrayRef(TE.ReuseShuffleIndices).slice(0, Sz)); + // Fill the reuses mask with the identity submasks. + for (auto It = TE.ReuseShuffleIndices.begin(), + End = TE.ReuseShuffleIndices.end(); + It != End; std::advance(It, Sz)) + std::iota(It, std::next(It + Sz), 0); +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -3976,7 +4014,7 @@ void BoUpSLP::reorderTopToBottom() { "All users must be of VF size."); // Update ordering of the operands with the smaller VF than the given // one. - reorderReuses(TE->ReuseShuffleIndices, Mask); + reorderNodeWithReuses(*TE, Mask); } continue; } @@ -4258,8 +4296,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!VisitedOps.insert(TE).second) continue; if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { - // Just reorder reuses indices. - reorderReuses(TE->ReuseShuffleIndices, Mask); + reorderNodeWithReuses(*TE, Mask); continue; } // Gathers are processed separately. @@ -7165,6 +7202,18 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; + if (TE.State == TreeEntry::NeedToGather) { + if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); + E && E->getVectorFactor() == TE.getVectorFactor() && + E->isSame(TE.Scalars)) { + // Some gather nodes might be absolutely the same as some vectorizable + // nodes after reordering, need to handle it. + LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with " + << *TE.Scalars[0] << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + continue; + } + } InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; -- 2.7.4