This moves the SuffixTree test used in the Machine Outliner and moves it into Support for use in other outliners elsewhere in the compilation pipeline.
Differential Revision: https://reviews.llvm.org/D80586
--- /dev/null
+//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Suffix Tree class and Suffix Tree Node struct.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_SUPPORT_SUFFIXTREE_H
+#define LLVM_SUPPORT_SUFFIXTREE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace llvm {
+
+/// Represents an undefined index in the suffix tree.
+const unsigned EmptyIdx = -1;
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// Each node has either no children or at least two children, with the root
+/// being a exception in the empty tree.
+///
+/// Children are represented as a map between unsigned integers and nodes. If
+/// a node N has a child M on unsigned integer k, then the mapping represented
+/// by N is a proper prefix of the mapping represented by M. Note that this,
+/// although similar to a trie is somewhat different: each node stores a full
+/// substring of the full mapping rather than a single character state.
+///
+/// Each internal node contains a pointer to the internal node representing
+/// the same string, but with the first character chopped off. This is stored
+/// in \p Link. Each leaf node stores the start index of its respective
+/// suffix in \p SuffixIdx.
+struct SuffixTreeNode {
+
+ /// The children of this node.
+ ///
+ /// A child existing on an unsigned integer implies that from the mapping
+ /// represented by the current node, there is a way to reach another
+ /// mapping by tacking that character on the end of the current string.
+ llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
+
+ /// The start index of this node's substring in the main string.
+ unsigned StartIdx = EmptyIdx;
+
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ unsigned *EndIdx = nullptr;
+
+ /// For leaves, the start index of the suffix represented by this node.
+ ///
+ /// For all other nodes, this is ignored.
+ unsigned SuffixIdx = EmptyIdx;
+
+ /// For internal nodes, a pointer to the internal node representing
+ /// the same sequence with the first character chopped off.
+ ///
+ /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
+ /// Ukkonen's algorithm does to achieve linear-time construction is
+ /// keep track of which node the next insert should be at. This makes each
+ /// insert O(1), and there are a total of O(N) inserts. The suffix link
+ /// helps with inserting children of internal nodes.
+ ///
+ /// Say we add a child to an internal node with associated mapping S. The
+ /// next insertion must be at the node representing S - its first character.
+ /// This is given by the way that we iteratively build the tree in Ukkonen's
+ /// algorithm. The main idea is to look at the suffixes of each prefix in the
+ /// string, starting with the longest suffix of the prefix, and ending with
+ /// the shortest. Therefore, if we keep pointers between such nodes, we can
+ /// move to the next insertion point in O(1) time. If we don't, then we'd
+ /// have to query from the root, which takes O(N) time. This would make the
+ /// construction algorithm O(N^2) rather than O(N).
+ SuffixTreeNode *Link = nullptr;
+
+ /// The length of the string formed by concatenating the edge labels from the
+ /// root to this node.
+ unsigned ConcatLen = 0;
+
+ /// Returns true if this node is a leaf.
+ bool isLeaf() const { return SuffixIdx != EmptyIdx; }
+
+ /// Returns true if this node is the root of its owning \p SuffixTree.
+ bool isRoot() const { return StartIdx == EmptyIdx; }
+
+ /// Return the number of elements in the substring associated with this node.
+ size_t size() const {
+
+ // Is it the root? If so, it's the empty string so return 0.
+ if (isRoot())
+ return 0;
+
+ assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
+
+ // Size = the number of elements in the string.
+ // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
+ return *EndIdx - StartIdx + 1;
+ }
+
+ SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
+ : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
+
+ SuffixTreeNode() {}
+};
+
+/// A data structure for fast substring queries.
+///
+/// Suffix trees represent the suffixes of their input strings in their leaves.
+/// A suffix tree is a type of compressed trie structure where each node
+/// represents an entire substring rather than a single character. Each leaf
+/// of the tree is a suffix.
+///
+/// A suffix tree can be seen as a type of state machine where each state is a
+/// substring of the full string. The tree is structured so that, for a string
+/// of length N, there are exactly N leaves in the tree. This structure allows
+/// us to quickly find repeated substrings of the input string.
+///
+/// In this implementation, a "string" is a vector of unsigned integers.
+/// These integers may result from hashing some data type. A suffix tree can
+/// contain 1 or many strings, which can then be queried as one large string.
+///
+/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+/// suffix tree construction. Ukkonen's algorithm is explained in more detail
+/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
+/// paper is available at
+///
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+class SuffixTree {
+public:
+ /// Each element is an integer representing an instruction in the module.
+ llvm::ArrayRef<unsigned> Str;
+
+ /// A repeated substring in the tree.
+ struct RepeatedSubstring {
+ /// The length of the string.
+ unsigned Length;
+
+ /// The start indices of each occurrence.
+ std::vector<unsigned> StartIndices;
+ };
+
+private:
+ /// Maintains each node in the tree.
+ llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+
+ /// The root of the suffix tree.
+ ///
+ /// The root represents the empty string. It is maintained by the
+ /// \p NodeAllocator like every other node in the tree.
+ SuffixTreeNode *Root = nullptr;
+
+ /// Maintains the end indices of the internal nodes in the tree.
+ ///
+ /// Each internal node is guaranteed to never have its end index change
+ /// during the construction algorithm; however, leaves must be updated at
+ /// every step. Therefore, we need to store leaf end indices by reference
+ /// to avoid updating O(N) leaves at every step of construction. Thus,
+ /// every internal node must be allocated its own end index.
+ llvm::BumpPtrAllocator InternalEndIdxAllocator;
+
+ /// The end index of each leaf in the tree.
+ unsigned LeafEndIdx = -1;
+
+ /// Helper struct which keeps track of the next insertion point in
+ /// Ukkonen's algorithm.
+ struct ActiveState {
+ /// The next node to insert at.
+ SuffixTreeNode *Node = nullptr;
+
+ /// The index of the first character in the substring currently being added.
+ unsigned Idx = EmptyIdx;
+
+ /// The length of the substring we have to add at the current step.
+ unsigned Len = 0;
+ };
+
+ /// The point the next insertion will take place at in the
+ /// construction algorithm.
+ ActiveState Active;
+
+ /// Allocate a leaf node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated leaf node.
+ SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
+ unsigned Edge);
+
+ /// Allocate an internal node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node. Only null when allocating the root.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param EndIdx The end index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated internal node.
+ SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
+ unsigned EndIdx, unsigned Edge);
+
+ /// Set the suffix indices of the leaves to the start indices of their
+ /// respective suffixes.
+ void setSuffixIndices();
+
+ /// Construct the suffix tree for the prefix of the input ending at
+ /// \p EndIdx.
+ ///
+ /// Used to construct the full suffix tree iteratively. At the end of each
+ /// step, the constructed suffix tree is either a valid suffix tree, or a
+ /// suffix tree with implicit suffixes. At the end of the final step, the
+ /// suffix tree is a valid tree.
+ ///
+ /// \param EndIdx The end index of the current prefix in the main string.
+ /// \param SuffixesToAdd The number of suffixes that must be added
+ /// to complete the suffix tree at the current phase.
+ ///
+ /// \returns The number of suffixes that have not been added at the end of
+ /// this step.
+ unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
+
+public:
+ /// Construct a suffix tree from a sequence of unsigned integers.
+ ///
+ /// \param Str The string to construct the suffix tree for.
+ SuffixTree(const std::vector<unsigned> &Str);
+
+ /// Iterator for finding all repeated substrings in the suffix tree.
+ struct RepeatedSubstringIterator {
+ private:
+ /// The current node we're visiting.
+ SuffixTreeNode *N = nullptr;
+
+ /// The repeated substring associated with this node.
+ RepeatedSubstring RS;
+
+ /// The nodes left to visit.
+ std::vector<SuffixTreeNode *> ToVisit;
+
+ /// The minimum length of a repeated substring to find.
+ /// Since we're outlining, we want at least two instructions in the range.
+ /// FIXME: This may not be true for targets like X86 which support many
+ /// instruction lengths.
+ const unsigned MinLength = 2;
+
+ /// Move the iterator to the next repeated substring.
+ void advance() {
+ // Clear the current state. If we're at the end of the range, then this
+ // is the state we want to be in.
+ RS = RepeatedSubstring();
+ N = nullptr;
+
+ // Each leaf node represents a repeat of a string.
+ std::vector<SuffixTreeNode *> LeafChildren;
+
+ // Continue visiting nodes until we find one which repeats more than once.
+ while (!ToVisit.empty()) {
+ SuffixTreeNode *Curr = ToVisit.back();
+ ToVisit.pop_back();
+ LeafChildren.clear();
+
+ // Keep track of the length of the string associated with the node. If
+ // it's too short, we'll quit.
+ unsigned Length = Curr->ConcatLen;
+
+ // Iterate over each child, saving internal nodes for visiting, and
+ // leaf nodes in LeafChildren. Internal nodes represent individual
+ // strings, which may repeat.
+ for (auto &ChildPair : Curr->Children) {
+ // Save all of this node's children for processing.
+ if (!ChildPair.second->isLeaf())
+ ToVisit.push_back(ChildPair.second);
+
+ // It's not an internal node, so it must be a leaf. If we have a
+ // long enough string, then save the leaf children.
+ else if (Length >= MinLength)
+ LeafChildren.push_back(ChildPair.second);
+ }
+
+ // The root never represents a repeated substring. If we're looking at
+ // that, then skip it.
+ if (Curr->isRoot())
+ continue;
+
+ // Do we have any repeated substrings?
+ if (LeafChildren.size() >= 2) {
+ // Yes. Update the state to reflect this, and then bail out.
+ N = Curr;
+ RS.Length = Length;
+ for (SuffixTreeNode *Leaf : LeafChildren)
+ RS.StartIndices.push_back(Leaf->SuffixIdx);
+ break;
+ }
+ }
+
+ // At this point, either NewRS is an empty RepeatedSubstring, or it was
+ // set in the above loop. Similarly, N is either nullptr, or the node
+ // associated with NewRS.
+ }
+
+ public:
+ /// Return the current repeated substring.
+ RepeatedSubstring &operator*() { return RS; }
+
+ RepeatedSubstringIterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ RepeatedSubstringIterator operator++(int I) {
+ RepeatedSubstringIterator It(*this);
+ advance();
+ return It;
+ }
+
+ bool operator==(const RepeatedSubstringIterator &Other) {
+ return N == Other.N;
+ }
+ bool operator!=(const RepeatedSubstringIterator &Other) {
+ return !(*this == Other);
+ }
+
+ RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
+ // Do we have a non-null node?
+ if (N) {
+ // Yes. At the first step, we need to visit all of N's children.
+ // Note: This means that we visit N last.
+ ToVisit.push_back(N);
+ advance();
+ }
+ }
+ };
+
+ typedef RepeatedSubstringIterator iterator;
+ iterator begin() { return iterator(Root); }
+ iterator end() { return iterator(nullptr); }
+};
+
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_SUFFIXTREE_H
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Mangler.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/SuffixTree.h"
#include "llvm/Support/raw_ostream.h"
#include <functional>
#include <tuple>
namespace {
-/// Represents an undefined index in the suffix tree.
-const unsigned EmptyIdx = -1;
-
-/// A node in a suffix tree which represents a substring or suffix.
-///
-/// Each node has either no children or at least two children, with the root
-/// being a exception in the empty tree.
-///
-/// Children are represented as a map between unsigned integers and nodes. If
-/// a node N has a child M on unsigned integer k, then the mapping represented
-/// by N is a proper prefix of the mapping represented by M. Note that this,
-/// although similar to a trie is somewhat different: each node stores a full
-/// substring of the full mapping rather than a single character state.
-///
-/// Each internal node contains a pointer to the internal node representing
-/// the same string, but with the first character chopped off. This is stored
-/// in \p Link. Each leaf node stores the start index of its respective
-/// suffix in \p SuffixIdx.
-struct SuffixTreeNode {
-
- /// The children of this node.
- ///
- /// A child existing on an unsigned integer implies that from the mapping
- /// represented by the current node, there is a way to reach another
- /// mapping by tacking that character on the end of the current string.
- DenseMap<unsigned, SuffixTreeNode *> Children;
-
- /// The start index of this node's substring in the main string.
- unsigned StartIdx = EmptyIdx;
-
- /// The end index of this node's substring in the main string.
- ///
- /// Every leaf node must have its \p EndIdx incremented at the end of every
- /// step in the construction algorithm. To avoid having to update O(N)
- /// nodes individually at the end of every step, the end index is stored
- /// as a pointer.
- unsigned *EndIdx = nullptr;
-
- /// For leaves, the start index of the suffix represented by this node.
- ///
- /// For all other nodes, this is ignored.
- unsigned SuffixIdx = EmptyIdx;
-
- /// For internal nodes, a pointer to the internal node representing
- /// the same sequence with the first character chopped off.
- ///
- /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
- /// Ukkonen's algorithm does to achieve linear-time construction is
- /// keep track of which node the next insert should be at. This makes each
- /// insert O(1), and there are a total of O(N) inserts. The suffix link
- /// helps with inserting children of internal nodes.
- ///
- /// Say we add a child to an internal node with associated mapping S. The
- /// next insertion must be at the node representing S - its first character.
- /// This is given by the way that we iteratively build the tree in Ukkonen's
- /// algorithm. The main idea is to look at the suffixes of each prefix in the
- /// string, starting with the longest suffix of the prefix, and ending with
- /// the shortest. Therefore, if we keep pointers between such nodes, we can
- /// move to the next insertion point in O(1) time. If we don't, then we'd
- /// have to query from the root, which takes O(N) time. This would make the
- /// construction algorithm O(N^2) rather than O(N).
- SuffixTreeNode *Link = nullptr;
-
- /// The length of the string formed by concatenating the edge labels from the
- /// root to this node.
- unsigned ConcatLen = 0;
-
- /// Returns true if this node is a leaf.
- bool isLeaf() const { return SuffixIdx != EmptyIdx; }
-
- /// Returns true if this node is the root of its owning \p SuffixTree.
- bool isRoot() const { return StartIdx == EmptyIdx; }
-
- /// Return the number of elements in the substring associated with this node.
- size_t size() const {
-
- // Is it the root? If so, it's the empty string so return 0.
- if (isRoot())
- return 0;
-
- assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
-
- // Size = the number of elements in the string.
- // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
- return *EndIdx - StartIdx + 1;
- }
-
- SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
- : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
-
- SuffixTreeNode() {}
-};
-
-/// A data structure for fast substring queries.
-///
-/// Suffix trees represent the suffixes of their input strings in their leaves.
-/// A suffix tree is a type of compressed trie structure where each node
-/// represents an entire substring rather than a single character. Each leaf
-/// of the tree is a suffix.
-///
-/// A suffix tree can be seen as a type of state machine where each state is a
-/// substring of the full string. The tree is structured so that, for a string
-/// of length N, there are exactly N leaves in the tree. This structure allows
-/// us to quickly find repeated substrings of the input string.
-///
-/// In this implementation, a "string" is a vector of unsigned integers.
-/// These integers may result from hashing some data type. A suffix tree can
-/// contain 1 or many strings, which can then be queried as one large string.
-///
-/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
-/// suffix tree construction. Ukkonen's algorithm is explained in more detail
-/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
-/// paper is available at
-///
-/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
-class SuffixTree {
-public:
- /// Each element is an integer representing an instruction in the module.
- ArrayRef<unsigned> Str;
-
- /// A repeated substring in the tree.
- struct RepeatedSubstring {
- /// The length of the string.
- unsigned Length;
-
- /// The start indices of each occurrence.
- std::vector<unsigned> StartIndices;
- };
-
-private:
- /// Maintains each node in the tree.
- SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
-
- /// The root of the suffix tree.
- ///
- /// The root represents the empty string. It is maintained by the
- /// \p NodeAllocator like every other node in the tree.
- SuffixTreeNode *Root = nullptr;
-
- /// Maintains the end indices of the internal nodes in the tree.
- ///
- /// Each internal node is guaranteed to never have its end index change
- /// during the construction algorithm; however, leaves must be updated at
- /// every step. Therefore, we need to store leaf end indices by reference
- /// to avoid updating O(N) leaves at every step of construction. Thus,
- /// every internal node must be allocated its own end index.
- BumpPtrAllocator InternalEndIdxAllocator;
-
- /// The end index of each leaf in the tree.
- unsigned LeafEndIdx = -1;
-
- /// Helper struct which keeps track of the next insertion point in
- /// Ukkonen's algorithm.
- struct ActiveState {
- /// The next node to insert at.
- SuffixTreeNode *Node = nullptr;
-
- /// The index of the first character in the substring currently being added.
- unsigned Idx = EmptyIdx;
-
- /// The length of the substring we have to add at the current step.
- unsigned Len = 0;
- };
-
- /// The point the next insertion will take place at in the
- /// construction algorithm.
- ActiveState Active;
-
- /// Allocate a leaf node and add it to the tree.
- ///
- /// \param Parent The parent of this node.
- /// \param StartIdx The start index of this node's associated string.
- /// \param Edge The label on the edge leaving \p Parent to this node.
- ///
- /// \returns A pointer to the allocated leaf node.
- SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
- unsigned Edge) {
-
- assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
-
- SuffixTreeNode *N = new (NodeAllocator.Allocate())
- SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
- Parent.Children[Edge] = N;
-
- return N;
- }
-
- /// Allocate an internal node and add it to the tree.
- ///
- /// \param Parent The parent of this node. Only null when allocating the root.
- /// \param StartIdx The start index of this node's associated string.
- /// \param EndIdx The end index of this node's associated string.
- /// \param Edge The label on the edge leaving \p Parent to this node.
- ///
- /// \returns A pointer to the allocated internal node.
- SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
- unsigned EndIdx, unsigned Edge) {
-
- assert(StartIdx <= EndIdx && "String can't start after it ends!");
- assert(!(!Parent && StartIdx != EmptyIdx) &&
- "Non-root internal nodes must have parents!");
-
- unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
- SuffixTreeNode *N =
- new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
- if (Parent)
- Parent->Children[Edge] = N;
-
- return N;
- }
-
- /// Set the suffix indices of the leaves to the start indices of their
- /// respective suffixes.
- void setSuffixIndices() {
- // List of nodes we need to visit along with the current length of the
- // string.
- std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
-
- // Current node being visited.
- SuffixTreeNode *CurrNode = Root;
-
- // Sum of the lengths of the nodes down the path to the current one.
- unsigned CurrNodeLen = 0;
- ToVisit.push_back({CurrNode, CurrNodeLen});
- while (!ToVisit.empty()) {
- std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
- ToVisit.pop_back();
- CurrNode->ConcatLen = CurrNodeLen;
- for (auto &ChildPair : CurrNode->Children) {
- assert(ChildPair.second && "Node had a null child!");
- ToVisit.push_back(
- {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
- }
-
- // No children, so we are at the end of the string.
- if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
- CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
- }
- }
-
- /// Construct the suffix tree for the prefix of the input ending at
- /// \p EndIdx.
- ///
- /// Used to construct the full suffix tree iteratively. At the end of each
- /// step, the constructed suffix tree is either a valid suffix tree, or a
- /// suffix tree with implicit suffixes. At the end of the final step, the
- /// suffix tree is a valid tree.
- ///
- /// \param EndIdx The end index of the current prefix in the main string.
- /// \param SuffixesToAdd The number of suffixes that must be added
- /// to complete the suffix tree at the current phase.
- ///
- /// \returns The number of suffixes that have not been added at the end of
- /// this step.
- unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd) {
- SuffixTreeNode *NeedsLink = nullptr;
-
- while (SuffixesToAdd > 0) {
-
- // Are we waiting to add anything other than just the last character?
- if (Active.Len == 0) {
- // If not, then say the active index is the end index.
- Active.Idx = EndIdx;
- }
-
- assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
-
- // The first character in the current substring we're looking at.
- unsigned FirstChar = Str[Active.Idx];
-
- // Have we inserted anything starting with FirstChar at the current node?
- if (Active.Node->Children.count(FirstChar) == 0) {
- // If not, then we can just insert a leaf and move too the next step.
- insertLeaf(*Active.Node, EndIdx, FirstChar);
-
- // The active node is an internal node, and we visited it, so it must
- // need a link if it doesn't have one.
- if (NeedsLink) {
- NeedsLink->Link = Active.Node;
- NeedsLink = nullptr;
- }
- } else {
- // There's a match with FirstChar, so look for the point in the tree to
- // insert a new node.
- SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
-
- unsigned SubstringLen = NextNode->size();
-
- // Is the current suffix we're trying to insert longer than the size of
- // the child we want to move to?
- if (Active.Len >= SubstringLen) {
- // If yes, then consume the characters we've seen and move to the next
- // node.
- Active.Idx += SubstringLen;
- Active.Len -= SubstringLen;
- Active.Node = NextNode;
- continue;
- }
-
- // Otherwise, the suffix we're trying to insert must be contained in the
- // next node we want to move to.
- unsigned LastChar = Str[EndIdx];
-
- // Is the string we're trying to insert a substring of the next node?
- if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
- // If yes, then we're done for this step. Remember our insertion point
- // and move to the next end index. At this point, we have an implicit
- // suffix tree.
- if (NeedsLink && !Active.Node->isRoot()) {
- NeedsLink->Link = Active.Node;
- NeedsLink = nullptr;
- }
-
- Active.Len++;
- break;
- }
-
- // The string we're trying to insert isn't a substring of the next node,
- // but matches up to a point. Split the node.
- //
- // For example, say we ended our search at a node n and we're trying to
- // insert ABD. Then we'll create a new node s for AB, reduce n to just
- // representing C, and insert a new leaf node l to represent d. This
- // allows us to ensure that if n was a leaf, it remains a leaf.
- //
- // | ABC ---split---> | AB
- // n s
- // C / \ D
- // n l
-
- // The node s from the diagram
- SuffixTreeNode *SplitNode =
- insertInternalNode(Active.Node, NextNode->StartIdx,
- NextNode->StartIdx + Active.Len - 1, FirstChar);
-
- // Insert the new node representing the new substring into the tree as
- // a child of the split node. This is the node l from the diagram.
- insertLeaf(*SplitNode, EndIdx, LastChar);
-
- // Make the old node a child of the split node and update its start
- // index. This is the node n from the diagram.
- NextNode->StartIdx += Active.Len;
- SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
-
- // SplitNode is an internal node, update the suffix link.
- if (NeedsLink)
- NeedsLink->Link = SplitNode;
-
- NeedsLink = SplitNode;
- }
-
- // We've added something new to the tree, so there's one less suffix to
- // add.
- SuffixesToAdd--;
-
- if (Active.Node->isRoot()) {
- if (Active.Len > 0) {
- Active.Len--;
- Active.Idx = EndIdx - SuffixesToAdd + 1;
- }
- } else {
- // Start the next phase at the next smallest suffix.
- Active.Node = Active.Node->Link;
- }
- }
-
- return SuffixesToAdd;
- }
-
-public:
- /// Construct a suffix tree from a sequence of unsigned integers.
- ///
- /// \param Str The string to construct the suffix tree for.
- SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
- Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
- Active.Node = Root;
-
- // Keep track of the number of suffixes we have to add of the current
- // prefix.
- unsigned SuffixesToAdd = 0;
-
- // Construct the suffix tree iteratively on each prefix of the string.
- // PfxEndIdx is the end index of the current prefix.
- // End is one past the last element in the string.
- for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End;
- PfxEndIdx++) {
- SuffixesToAdd++;
- LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
- SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
- }
-
- // Set the suffix indices of each leaf.
- assert(Root && "Root node can't be nullptr!");
- setSuffixIndices();
- }
-
- /// Iterator for finding all repeated substrings in the suffix tree.
- struct RepeatedSubstringIterator {
- private:
- /// The current node we're visiting.
- SuffixTreeNode *N = nullptr;
-
- /// The repeated substring associated with this node.
- RepeatedSubstring RS;
-
- /// The nodes left to visit.
- std::vector<SuffixTreeNode *> ToVisit;
-
- /// The minimum length of a repeated substring to find.
- /// Since we're outlining, we want at least two instructions in the range.
- /// FIXME: This may not be true for targets like X86 which support many
- /// instruction lengths.
- const unsigned MinLength = 2;
-
- /// Move the iterator to the next repeated substring.
- void advance() {
- // Clear the current state. If we're at the end of the range, then this
- // is the state we want to be in.
- RS = RepeatedSubstring();
- N = nullptr;
-
- // Each leaf node represents a repeat of a string.
- std::vector<SuffixTreeNode *> LeafChildren;
-
- // Continue visiting nodes until we find one which repeats more than once.
- while (!ToVisit.empty()) {
- SuffixTreeNode *Curr = ToVisit.back();
- ToVisit.pop_back();
- LeafChildren.clear();
-
- // Keep track of the length of the string associated with the node. If
- // it's too short, we'll quit.
- unsigned Length = Curr->ConcatLen;
-
- // Iterate over each child, saving internal nodes for visiting, and
- // leaf nodes in LeafChildren. Internal nodes represent individual
- // strings, which may repeat.
- for (auto &ChildPair : Curr->Children) {
- // Save all of this node's children for processing.
- if (!ChildPair.second->isLeaf())
- ToVisit.push_back(ChildPair.second);
-
- // It's not an internal node, so it must be a leaf. If we have a
- // long enough string, then save the leaf children.
- else if (Length >= MinLength)
- LeafChildren.push_back(ChildPair.second);
- }
-
- // The root never represents a repeated substring. If we're looking at
- // that, then skip it.
- if (Curr->isRoot())
- continue;
-
- // Do we have any repeated substrings?
- if (LeafChildren.size() >= 2) {
- // Yes. Update the state to reflect this, and then bail out.
- N = Curr;
- RS.Length = Length;
- for (SuffixTreeNode *Leaf : LeafChildren)
- RS.StartIndices.push_back(Leaf->SuffixIdx);
- break;
- }
- }
-
- // At this point, either NewRS is an empty RepeatedSubstring, or it was
- // set in the above loop. Similarly, N is either nullptr, or the node
- // associated with NewRS.
- }
-
- public:
- /// Return the current repeated substring.
- RepeatedSubstring &operator*() { return RS; }
-
- RepeatedSubstringIterator &operator++() {
- advance();
- return *this;
- }
-
- RepeatedSubstringIterator operator++(int I) {
- RepeatedSubstringIterator It(*this);
- advance();
- return It;
- }
-
- bool operator==(const RepeatedSubstringIterator &Other) {
- return N == Other.N;
- }
- bool operator!=(const RepeatedSubstringIterator &Other) {
- return !(*this == Other);
- }
-
- RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
- // Do we have a non-null node?
- if (N) {
- // Yes. At the first step, we need to visit all of N's children.
- // Note: This means that we visit N last.
- ToVisit.push_back(N);
- advance();
- }
- }
- };
-
- typedef RepeatedSubstringIterator iterator;
- iterator begin() { return iterator(Root); }
- iterator end() { return iterator(nullptr); }
-};
-
/// Maps \p MachineInstrs to unsigned integers and stores the mappings.
struct InstructionMapper {
StringMap.cpp
StringSaver.cpp
StringRef.cpp
+ SuffixTree.cpp
SymbolRemappingReader.cpp
SystemUtils.cpp
TarWriter.cpp
--- /dev/null
+//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Suffix Tree class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTree.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+using namespace llvm;
+
+SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+ Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+ Active.Node = Root;
+
+ // Keep track of the number of suffixes we have to add of the current
+ // prefix.
+ unsigned SuffixesToAdd = 0;
+
+ // Construct the suffix tree iteratively on each prefix of the string.
+ // PfxEndIdx is the end index of the current prefix.
+ // End is one past the last element in the string.
+ for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+ SuffixesToAdd++;
+ LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+ SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+ }
+
+ // Set the suffix indices of each leaf.
+ assert(Root && "Root node can't be nullptr!");
+ setSuffixIndices();
+}
+
+SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
+ unsigned StartIdx, unsigned Edge) {
+
+ assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+ SuffixTreeNode *N = new (NodeAllocator.Allocate())
+ SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
+ Parent.Children[Edge] = N;
+
+ return N;
+}
+
+SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
+ unsigned StartIdx,
+ unsigned EndIdx, unsigned Edge) {
+
+ assert(StartIdx <= EndIdx && "String can't start after it ends!");
+ assert(!(!Parent && StartIdx != EmptyIdx) &&
+ "Non-root internal nodes must have parents!");
+
+ unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
+ SuffixTreeNode *N =
+ new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
+ if (Parent)
+ Parent->Children[Edge] = N;
+
+ return N;
+}
+
+void SuffixTree::setSuffixIndices() {
+ // List of nodes we need to visit along with the current length of the
+ // string.
+ std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
+
+ // Current node being visited.
+ SuffixTreeNode *CurrNode = Root;
+
+ // Sum of the lengths of the nodes down the path to the current one.
+ unsigned CurrNodeLen = 0;
+ ToVisit.push_back({CurrNode, CurrNodeLen});
+ while (!ToVisit.empty()) {
+ std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
+ ToVisit.pop_back();
+ CurrNode->ConcatLen = CurrNodeLen;
+ for (auto &ChildPair : CurrNode->Children) {
+ assert(ChildPair.second && "Node had a null child!");
+ ToVisit.push_back(
+ {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
+ }
+
+ // No children, so we are at the end of the string.
+ if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
+ CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
+ }
+}
+
+unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
+ SuffixTreeNode *NeedsLink = nullptr;
+
+ while (SuffixesToAdd > 0) {
+
+ // Are we waiting to add anything other than just the last character?
+ if (Active.Len == 0) {
+ // If not, then say the active index is the end index.
+ Active.Idx = EndIdx;
+ }
+
+ assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+ // The first character in the current substring we're looking at.
+ unsigned FirstChar = Str[Active.Idx];
+
+ // Have we inserted anything starting with FirstChar at the current node?
+ if (Active.Node->Children.count(FirstChar) == 0) {
+ // If not, then we can just insert a leaf and move to the next step.
+ insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+ // The active node is an internal node, and we visited it, so it must
+ // need a link if it doesn't have one.
+ if (NeedsLink) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+ } else {
+ // There's a match with FirstChar, so look for the point in the tree to
+ // insert a new node.
+ SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+ unsigned SubstringLen = NextNode->size();
+
+ // Is the current suffix we're trying to insert longer than the size of
+ // the child we want to move to?
+ if (Active.Len >= SubstringLen) {
+ // If yes, then consume the characters we've seen and move to the next
+ // node.
+ Active.Idx += SubstringLen;
+ Active.Len -= SubstringLen;
+ Active.Node = NextNode;
+ continue;
+ }
+
+ // Otherwise, the suffix we're trying to insert must be contained in the
+ // next node we want to move to.
+ unsigned LastChar = Str[EndIdx];
+
+ // Is the string we're trying to insert a substring of the next node?
+ if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+ // If yes, then we're done for this step. Remember our insertion point
+ // and move to the next end index. At this point, we have an implicit
+ // suffix tree.
+ if (NeedsLink && !Active.Node->isRoot()) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+
+ Active.Len++;
+ break;
+ }
+
+ // The string we're trying to insert isn't a substring of the next node,
+ // but matches up to a point. Split the node.
+ //
+ // For example, say we ended our search at a node n and we're trying to
+ // insert ABD. Then we'll create a new node s for AB, reduce n to just
+ // representing C, and insert a new leaf node l to represent d. This
+ // allows us to ensure that if n was a leaf, it remains a leaf.
+ //
+ // | ABC ---split---> | AB
+ // n s
+ // C / \ D
+ // n l
+
+ // The node s from the diagram
+ SuffixTreeNode *SplitNode =
+ insertInternalNode(Active.Node, NextNode->StartIdx,
+ NextNode->StartIdx + Active.Len - 1, FirstChar);
+
+ // Insert the new node representing the new substring into the tree as
+ // a child of the split node. This is the node l from the diagram.
+ insertLeaf(*SplitNode, EndIdx, LastChar);
+
+ // Make the old node a child of the split node and update its start
+ // index. This is the node n from the diagram.
+ NextNode->StartIdx += Active.Len;
+ SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+ // SplitNode is an internal node, update the suffix link.
+ if (NeedsLink)
+ NeedsLink->Link = SplitNode;
+
+ NeedsLink = SplitNode;
+ }
+
+ // We've added something new to the tree, so there's one less suffix to
+ // add.
+ SuffixesToAdd--;
+
+ if (Active.Node->isRoot()) {
+ if (Active.Len > 0) {
+ Active.Len--;
+ Active.Idx = EndIdx - SuffixesToAdd + 1;
+ }
+ } else {
+ // Start the next phase at the next smallest suffix.
+ Active.Node = Active.Node->Link;
+ }
+ }
+
+ return SuffixesToAdd;
+}
ScaledNumberTest.cpp
SourceMgrTest.cpp
SpecialCaseListTest.cpp
+ SuffixTreeTest.cpp
SwapByteOrderTest.cpp
SymbolRemappingReaderTest.cpp
TarWriterTest.cpp
--- /dev/null
+//===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTree.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+// Each example vector has a unique element at the end to represent the end of
+// the string
+
+// Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
+// {1, 2} twice in the provided string.
+TEST(SuffixTreeTest, TestSingleRepetition) {
+ std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
+ SuffixTree ST(SimpleRepetitionData);
+ std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+ for (auto It = ST.begin(); It != ST.end(); It++)
+ SubStrings.push_back(*It);
+ ASSERT_EQ(SubStrings.size(), 1u);
+ EXPECT_EQ(SubStrings[0].Length, 2u);
+ EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
+ for (unsigned StartIdx : SubStrings[0].StartIndices) {
+ EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
+ EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
+ }
+}
+
+// Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
+// at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
+// This test also serves as a flag for improvements to the suffix tree.
+//
+// FIXME: Right now, the longest repeated substring from a specific index is
+// returned, this could be improved to return the longest repeated substring, as
+// well as those that are smaller.
+TEST(SuffixTreeTest, TestLongerRepetition) {
+ std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
+ SuffixTree ST(RepeatedRepetitionData);
+ std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+ for (auto It = ST.begin(); It != ST.end(); It++)
+ SubStrings.push_back(*It);
+ EXPECT_EQ(SubStrings.size(), 2u);
+ unsigned Len;
+ for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
+ Len = RS.Length;
+ bool IsExpectedLen = (Len == 3u || Len == 2u);
+ bool IsExpectedIndex;
+ ASSERT_TRUE(IsExpectedLen);
+
+ if (Len == 3u) {
+ for (unsigned StartIdx : RS.StartIndices) {
+ IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
+ EXPECT_TRUE(IsExpectedIndex);
+ EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
+ EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
+ EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
+ }
+ } else {
+ for (unsigned StartIdx : RS.StartIndices) {
+ IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
+ EXPECT_TRUE(IsExpectedIndex);
+ EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
+ EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
+ }
+ }
+ }
+}
+
+// Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
+// indices 0 and 1.
+//
+// FIXME: Add support for detecting {1, 1} and {1, 1, 1}
+TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
+ std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
+ std::vector<unsigned>::iterator RRDIt, RRDIt2;
+ SuffixTree ST(RepeatedRepetitionData);
+ std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+ for (auto It = ST.begin(); It != ST.end(); It++)
+ SubStrings.push_back(*It);
+ EXPECT_EQ(SubStrings.size(), 1u);
+ for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
+ EXPECT_EQ(RS.StartIndices.size(),
+ RepeatedRepetitionData.size() - RS.Length);
+ for (unsigned StartIdx : SubStrings[0].StartIndices) {
+ RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
+ std::advance(RRDIt, StartIdx);
+ std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
+ ASSERT_TRUE(
+ all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
+ [](unsigned Elt) { return Elt == 1; }));
+ }
+ }
+}
+
+// The suffix tree cannot currently find repeated substrings in strings of the
+// form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
+// repeats")
+//
+// FIXME: Teach the SuffixTree to recognize these cases.
+TEST(SuffixTreeTest, TestTandemRepeat) {
+ std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
+ SuffixTree ST(RepeatedRepetitionData);
+ std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+ for (auto It = ST.begin(); It != ST.end(); It++)
+ SubStrings.push_back(*It);
+ EXPECT_EQ(SubStrings.size(), 0u);
+}
+
+// Tests that the SuffixTree does not erroneously include values that are not
+// in repeated substrings. That is, only finds {1, 1} at indices 0 and 3 and
+// does not include 2 and 3.
+TEST(SuffixTreeTest, TestExclusion) {
+ std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
+ std::vector<unsigned>::iterator RRDIt, RRDIt2;
+ SuffixTree ST(RepeatedRepetitionData);
+ std::vector<SuffixTree::RepeatedSubstring> SubStrings;
+ for (auto It = ST.begin(); It != ST.end(); It++)
+ SubStrings.push_back(*It);
+ EXPECT_EQ(SubStrings.size(), 1u);
+ bool IsExpectedIndex;
+ for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
+ for (unsigned StartIdx : RS.StartIndices) {
+ IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
+ EXPECT_TRUE(IsExpectedIndex);
+ RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
+ std::advance(RRDIt, StartIdx);
+ std::advance(RRDIt2, StartIdx + RS.Length);
+ EXPECT_TRUE(
+ all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
+ [](unsigned Elt) { return Elt == 1; }));
+ }
+ }
+}
+
+} // namespace