ArrayRef<TreeEntry *> ReorderableGathers,
SmallVectorImpl<TreeEntry *> &GatherOps);
+ /// Checks if the given \p TE is a gather node with clustered reused scalars
+ /// and reorders it per given \p Mask.
+ void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
+
/// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
/// if any. If it is not vectorized (gather node), returns nullptr.
TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
return None;
}
+/// Checks if the given mask is a "clustered" mask with the same clusters of
+/// size \p Sz, which are not identity submasks.
+static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
+                                               unsigned Sz) {
+  // The repeating pattern is defined by the leading cluster of size \p Sz.
+  ArrayRef<int> Pattern = Mask.take_front(Sz);
+  // An identity leading cluster means this is not the shuffled-cluster shape
+  // we are looking for.
+  if (ShuffleVectorInst::isIdentityMask(Pattern))
+    return false;
+  // Every subsequent cluster must repeat the leading one exactly (cluster 0
+  // trivially matches itself, so start from the second cluster).
+  for (unsigned Idx = Sz, MaskSz = Mask.size(); Idx < MaskSz; Idx += Sz)
+    if (Mask.slice(Idx, Sz) != Pattern)
+      return false;
+  return true;
+}
+
+void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
+  // For vectorized and non-clustered reused - just reorder reuses mask.
+  const unsigned Sz = TE.Scalars.size();
+  if (TE.State != TreeEntry::NeedToGather || !TE.ReorderIndices.empty() ||
+      !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
+                                                   Sz) ||
+      !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz)) {
+    reorderReuses(TE.ReuseShuffleIndices, Mask);
+    return;
+  }
+  // Try to improve gathered nodes with clustered reuses, if possible.
+  // Reorder the scalars themselves per the (repeated) leading cluster.
+  reorderScalars(TE.Scalars, makeArrayRef(TE.ReuseShuffleIndices).slice(0, Sz));
+  // Fill the reuses mask with the identity submasks. Each cluster spans
+  // exactly Sz elements, i.e. [It, It + Sz) - using std::next(It + Sz) here
+  // would fill Sz + 1 elements and write one past End on the last cluster.
+  for (auto It = TE.ReuseShuffleIndices.begin(),
+            End = TE.ReuseShuffleIndices.end();
+       It != End; std::advance(It, Sz))
+    std::iota(It, std::next(It, Sz), 0);
+}
+
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
"All users must be of VF size.");
// Update ordering of the operands with the smaller VF than the given
// one.
- reorderReuses(TE->ReuseShuffleIndices, Mask);
+ reorderNodeWithReuses(*TE, Mask);
}
continue;
}
if (!VisitedOps.insert(TE).second)
continue;
if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
- // Just reorder reuses indices.
- reorderReuses(TE->ReuseShuffleIndices, Mask);
+ reorderNodeWithReuses(*TE, Mask);
continue;
}
// Gathers are processed separately.
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
+ if (TE.State == TreeEntry::NeedToGather) {
+ if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
+ E && E->getVectorFactor() == TE.getVectorFactor() &&
+ E->isSame(TE.Scalars)) {
+        // After reordering, a gather node may become identical to an already
+        // vectorized node; treat such duplicates as free (cost 0).
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle that starts with "
+ << *TE.Scalars[0] << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ continue;
+ }
+ }
InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;