1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
6 #define EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
14 #include "base/basictypes.h"
15 #include "extensions/common/matcher/string_pattern.h"
17 namespace extensions {
19 // Class that store a set of string patterns and can find for a string S,
20 // which string patterns occur in S.
21 class SubstringSetMatcher {
23 SubstringSetMatcher();
24 ~SubstringSetMatcher();
26 // Registers all |patterns|. The ownership remains with the caller.
27 // The same pattern cannot be registered twice and each pattern needs to have
29 // Ownership of the patterns remains with the caller.
30 void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
32 // Unregisters the passed |patterns|.
33 void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
35 // Analogous to RegisterPatterns and UnregisterPatterns but executes both
36 // operations in one step, which is cheaper in the execution.
37 void RegisterAndUnregisterPatterns(
38 const std::vector<const StringPattern*>& to_register,
39 const std::vector<const StringPattern*>& to_unregister);
41 // Matches |text| against all registered StringPatterns. Stores the IDs
42 // of matching patterns in |matches|. |matches| is not cleared before adding
44 bool Match(const std::string& text,
45 std::set<StringPattern::ID>* matches) const;
47 // Returns true if this object retains no allocated data. Only for debugging.
51 // A node of an Aho Corasick Tree. This is implemented according to
52 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
54 // The algorithm is based on the idea of building a trie of all registered
55 // patterns. Each node of the tree is annotated with a set of pattern
56 // IDs that are used to report matches.
58 // The root of the trie represents an empty match. If we were looking whether
59 // any registered pattern matches a text at the beginning of the text (i.e.
60 // whether any pattern is a prefix of the text), we could just follow
61 // nodes in the trie according to the matching characters in the text.
62 // E.g., if text == "foobar", we would follow the trie from the root node
63 // to its child labeled 'f', from there to child 'o', etc. In this process we
64 // would report all pattern IDs associated with the trie nodes as matches.
66 // As we are not looking for all prefix matches but all substring matches,
67 // this algorithm would need to compare text.substr(0), text.substr(1), ...
68 // against the trie, which is in O(|text|^2).
70 // The Aho Corasick algorithm improves this runtime by using failure edges.
71 // In case we have found a partial match of length k in the text
72 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
73 // a node at depth k, but cannot find a match in the trie for character
74 // text[i + k] at depth k + 1, we follow a failure edge. This edge
75 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
76 // is a prefix of any registered pattern.
78 // If your brain thinks "Forget it, let's go shopping.", don't worry.
79 // Take a nap and read an introductory text on the Aho Corasick algorithm.
80 // It will make sense. Eventually.
81 class AhoCorasickNode {
83 // Key: label of the edge, value: node index in |tree_| of parent class.
84 typedef std::map<char, uint32> Edges;
85 typedef std::set<StringPattern::ID> Matches;
87 static const uint32 kNoSuchEdge; // Represents an invalid node index.
91 AhoCorasickNode(const AhoCorasickNode& other);
92 AhoCorasickNode& operator=(const AhoCorasickNode& other);
94 uint32 GetEdge(char c) const;
95 void SetEdge(char c, uint32 node);
96 const Edges& edges() const { return edges_; }
98 uint32 failure() const { return failure_; }
99 void set_failure(uint32 failure) { failure_ = failure; }
101 void AddMatch(StringPattern::ID id);
102 void AddMatches(const Matches& matches);
103 const Matches& matches() const { return matches_; }
106 // Outgoing edges of current node.
109 // Node index that failure edge leads to.
112 // Identifiers of matches.
116 typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
117 typedef std::vector<const StringPattern*> SubstringPatternVector;
119 // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
120 void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
122 // Inserts a path for |pattern->pattern()| into the tree and adds
123 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
125 void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
126 void CreateFailureEdges();
128 // Set of all registered StringPatterns. Used to regenerate the
129 // Aho-Corasick tree in case patterns are registered or unregistered.
130 SubstringPatternMap patterns_;
132 // The nodes of a Aho-Corasick tree.
133 std::vector<AhoCorasickNode> tree_;
135 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
138 } // namespace extensions
140 #endif // EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_