From a151af332dbb6857820698e763afee875dc74d97 Mon Sep 17 00:00:00 2001 From: Woohyun Jung Date: Tue, 14 Mar 2023 09:09:49 +0900 Subject: [PATCH 1/1] Import aho-corasick 0.7.20 --- .cargo_vcs_info.json | 6 + .github/workflows/ci.yml | 110 +++ .gitignore | 12 + COPYING | 3 + Cargo.toml | 50 + Cargo.toml.orig | 38 + DESIGN.md | 483 ++++++++++ LICENSE-MIT | 21 + README.md | 187 ++++ UNLICENSE | 24 + rustfmt.toml | 2 + src/ahocorasick.rs | 2141 +++++++++++++++++++++++++++++++++++++++++++ src/automaton.rs | 573 ++++++++++++ src/buffer.rs | 132 +++ src/byte_frequencies.rs | 258 ++++++ src/classes.rs | 238 +++++ src/dfa.rs | 713 ++++++++++++++ src/error.rs | 101 ++ src/lib.rs | 303 ++++++ src/nfa.rs | 1214 ++++++++++++++++++++++++ src/packed/api.rs | 625 +++++++++++++ src/packed/mod.rs | 117 +++ src/packed/pattern.rs | 318 +++++++ src/packed/rabinkarp.rs | 185 ++++ src/packed/teddy/README.md | 386 ++++++++ src/packed/teddy/compile.rs | 414 +++++++++ src/packed/teddy/mod.rs | 62 ++ src/packed/teddy/runtime.rs | 1204 ++++++++++++++++++++++++ src/packed/tests.rs | 568 ++++++++++++ src/packed/vector.rs | 181 ++++ src/prefilter.rs | 1057 +++++++++++++++++++++ src/state_id.rs | 192 ++++ src/tests.rs | 1254 +++++++++++++++++++++++++ 33 files changed, 13172 insertions(+) create mode 100644 .cargo_vcs_info.json create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore create mode 100644 COPYING create mode 100644 Cargo.toml create mode 100644 Cargo.toml.orig create mode 100644 DESIGN.md create mode 100644 LICENSE-MIT create mode 100644 README.md create mode 100644 UNLICENSE create mode 100644 rustfmt.toml create mode 100644 src/ahocorasick.rs create mode 100644 src/automaton.rs create mode 100644 src/buffer.rs create mode 100644 src/byte_frequencies.rs create mode 100644 src/classes.rs create mode 100644 src/dfa.rs create mode 100644 src/error.rs create mode 100644 src/lib.rs create mode 100644 src/nfa.rs create mode 100644 src/packed/api.rs create mode 100644 src/packed/mod.rs create mode 100644 src/packed/pattern.rs create mode 100644 src/packed/rabinkarp.rs create mode 100644 src/packed/teddy/README.md create mode 100644 src/packed/teddy/compile.rs create mode 100644 src/packed/teddy/mod.rs create mode 100644 src/packed/teddy/runtime.rs create mode 100644 src/packed/tests.rs create mode 100644 src/packed/vector.rs create mode 100644 src/prefilter.rs create mode 100644 src/state_id.rs create mode 100644 src/tests.rs diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..873e57f --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "7e231db4b4ac192ebc674078f2b03cd37b9ed5b9" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f419055 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,110 @@ +name: ci +on: + pull_request: + push: + branches: + - master + schedule: + - cron: '00 01 * * *' +jobs: + test: + name: test + env: + # For some builds, we use cross to test on 32-bit and big-endian + # systems. + CARGO: cargo + # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. 
+ TARGET: + runs-on: ${{ matrix.os }} + strategy: + matrix: + build: + - pinned + - stable + - stable-32 + - stable-mips + - beta + - nightly + - macos + - win-msvc + - win-gnu + include: + - build: pinned + os: ubuntu-18.04 + rust: 1.41.1 + - build: stable + os: ubuntu-18.04 + rust: stable + - build: stable-32 + os: ubuntu-18.04 + rust: stable + target: i686-unknown-linux-gnu + - build: stable-mips + os: ubuntu-18.04 + rust: stable + target: mips64-unknown-linux-gnuabi64 + - build: beta + os: ubuntu-18.04 + rust: beta + - build: nightly + os: ubuntu-18.04 + rust: nightly + - build: macos + os: macos-latest + rust: stable + - build: win-msvc + os: windows-2019 + rust: stable + - build: win-gnu + os: windows-2019 + rust: stable-x86_64-gnu + steps: + - name: Checkout repository + uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Install Rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.rust }} + - name: Use Cross + if: matrix.target != '' + run: | + # We used to install 'cross' from master, but it kept failing. So now + # we build from a known-good version until 'cross' becomes more stable + # or we find an alternative. Notably, between v0.2.1 and current + # master (2022-06-14), the number of Cross's dependencies has doubled. + cargo install --bins --git https://github.com/rust-embedded/cross --tag v0.2.1 + echo "CARGO=cross" >> $GITHUB_ENV + echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV + - name: Show command used for Cargo + run: | + echo "cargo command is: ${{ env.CARGO }}" + echo "target flag is: ${{ env.TARGET }}" + - name: Show CPU info for debugging + if: matrix.os == 'ubuntu-18.04' + run: lscpu + - run: ${{ env.CARGO }} build --verbose + - run: ${{ env.CARGO }} doc --verbose + - run: ${{ env.CARGO }} test --verbose + - if: matrix.build == 'nightly' + run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml + - if: matrix.build == 'nightly' + run: ${{ env.CARGO }} bench --verbose --manifest-path bench/Cargo.toml -- --test + + rustfmt: + name: rustfmt + runs-on: ubuntu-18.04 + steps: + - name: Checkout repository + uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Install Rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: stable + components: rustfmt + - name: Check formatting + run: | + cargo fmt --all -- --check diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f1a4d65 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +.*.swp +doc +tags +examples/ss10pusa.csv +build +target +/Cargo.lock +scratch* +bench_large/huge +BREADCRUMBS +/tmp +/aho-corasick-debug/Cargo.lock diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..bb9c20a --- /dev/null +++ b/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..ee8e94e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,50 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2018" +name = "aho-corasick" +version = "0.7.20" +authors = ["Andrew Gallant "] +exclude = ["/aho-corasick-debug"] +autotests = false +description = "Fast multiple substring searching." +homepage = "https://github.com/BurntSushi/aho-corasick" +readme = "README.md" +keywords = [ + "string", + "search", + "text", + "aho", + "multi", +] +categories = ["text-processing"] +license = "Unlicense OR MIT" +repository = "https://github.com/BurntSushi/aho-corasick" + +[profile.bench] +debug = true + +[profile.release] +debug = true + +[lib] +name = "aho_corasick" + +[dependencies.memchr] +version = "2.4.0" +default-features = false + +[dev-dependencies] + +[features] +default = ["std"] +std = ["memchr/std"] diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..a43e872 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,38 @@ +[package] +name = "aho-corasick" +version = "0.7.20" #:version +authors = ["Andrew Gallant "] +description = "Fast multiple substring searching." +homepage = "https://github.com/BurntSushi/aho-corasick" +repository = "https://github.com/BurntSushi/aho-corasick" +readme = "README.md" +keywords = ["string", "search", "text", "aho", "multi"] +license = "Unlicense OR MIT" +categories = ["text-processing"] +autotests = false +exclude = ["/aho-corasick-debug"] +edition = "2018" + +[workspace] +members = ["aho-corasick-debug", "bench"] + +[lib] +name = "aho_corasick" + +[features] +default = ["std"] +std = ["memchr/std"] + +[dependencies] +memchr = { version = "2.4.0", default-features = false } + +[dev-dependencies] +# TODO: Re-enable this once the MSRV is 1.43 or greater. +# See: https://github.com/BurntSushi/aho-corasick/issues/62 +# doc-comment = "0.3.1" + +[profile.release] +debug = true + +[profile.bench] +debug = true diff --git a/DESIGN.md b/DESIGN.md new file mode 100644 index 0000000..0e15ad0 --- /dev/null +++ b/DESIGN.md @@ -0,0 +1,483 @@ +This document describes the internal design of this crate, which is an object +lesson in what happens when you take a fairly simple old algorithm like +Aho-Corasick and make it fast and production ready. + +The target audience of this document is Rust programmers that have some +familiarity with string searching, however, one does not need to know the +Aho-Corasick algorithm in order to read this (it is explained below). One +should, however, know what a trie is. (If you don't, go read its Wikipedia +article.) + +The center-piece of this crate is an implementation of Aho-Corasick. On its +own, Aho-Corasick isn't that complicated. The complex pieces come from the +different variants of Aho-Corasick implemented in this crate. Specifically, +they are: + +* Aho-Corasick as an NFA, using dense transitions near the root with sparse + transitions elsewhere. +* Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct + and uses less memory.) + * A DFA with pre-multiplied state identifiers. This saves a multiplication + instruction in the core search loop. + * A DFA with equivalence classes of bytes as the alphabet, instead of the + traditional 256-byte alphabet. This shrinks the size of the DFA in memory, + but adds an extra lookup in the core search loop to map the input byte to + an equivalent class. +* The option to choose how state identifiers are represented, via one of + u8, u16, u32, u64 or usize. This permits creating compact automatons when + matching a small number of patterns. 
+* Supporting "standard" match semantics, along with its overlapping variant, + in addition to leftmost-first and leftmost-longest semantics. The "standard" + semantics are typically what you see in a textbook description of + Aho-Corasick. However, Aho-Corasick is also useful as an optimization in + regex engines, which often use leftmost-first or leftmost-longest semantics. + Thus, it is useful to implement those semantics here. The "standard" and + "leftmost" search algorithms are subtly different, and also require slightly + different construction algorithms. +* Support for ASCII case insensitive matching. +* Support for accelerating searches when the patterns all start with a small + number of fixed bytes. Or alternatively, when the patterns all contain a + small number of rare bytes. (Searching for these bytes uses SIMD vectorized + code courtesy of `memchr`.) +* Transparent support for alternative SIMD vectorized search routines for + smaller number of literals, such as the Teddy algorithm. We called these + "packed" search routines because they use SIMD. They can often be an order of + magnitude faster than just Aho-Corasick, but don't scale as well. +* Support for searching streams. This can reuse most of the underlying code, + but does require careful buffering support. +* Support for anchored searches, which permit efficient `is_prefix` checks for + a large number of patterns. + +When you combine all of this together along with trying to make everything as +fast as possible, what you end up with is enitrely too much code with too much +`unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead, +we will explain it. + + +# Basics + +The fundamental problem this crate is trying to solve is to determine the +occurrences of possibly many patterns in a haystack. The naive way to solve +this is to look for a match for each pattern at each position in the haystack: + + for i in 0..haystack.len(): + for p in patterns.iter(): + if haystack[i..].starts_with(p.bytes()): + return Match(p.id(), i, i + p.bytes().len()) + +Those four lines are effectively all this crate does. The problem with those +four lines is that they are very slow, especially when you're searching for a +large number of patterns. + +While there are many different algorithms available to solve this, a popular +one is Aho-Corasick. It's a common solution because it's not too hard to +implement, scales quite well even when searching for thousands of patterns and +is generally pretty fast. Aho-Corasick does well here because, regardless of +the number of patterns you're searching for, it always visits each byte in the +haystack exactly once. This means, generally speaking, adding more patterns to +an Aho-Corasick automaton does not make it slower. (Strictly speaking, however, +this is not true, since a larger automaton will make less effective use of the +CPU's cache.) + +Aho-Corasick can be succinctly described as a trie with state transitions +between some of the nodes that efficiently instruct the search algorithm to +try matching alternative keys in the automaton. The trick is that these state +transitions are arranged such that each byte of input needs to be inspected +only once. These state transitions are typically called "failure transitions," +because they instruct the searcher (the thing traversing the automaton while +reading from the haystack) what to do when a byte in the haystack does not +correspond to a valid transition in the current state of the trie. 
+ +More formally, a failure transition points to a state in the automaton that may +lead to a match whose prefix is a proper suffix of the path traversed through +the trie so far. (If no such proper suffix exists, then the failure transition +points back to the start state of the trie, effectively restarting the search.) +This is perhaps simpler to explain pictorally. For example, let's say we built +an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The +trie looks like this: + + a - S1 - b - S2 - c - S3 - d - S4* + / + S0 - c - S5 - e - S6 - f - S7* + +where states marked with a `*` are match states (meaning, the search algorithm +should stop and report a match to the caller). + +So given this trie, it should be somewhat straight-forward to see how it can +be used to determine whether any particular haystack *starts* with either +`abcd` or `cef`. It's easy to express this in code: + + fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool { + let mut state_id = trie.start(); + // If the empty pattern is in trie, then state_id is a match state. + if trie.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + state_id = match trie.next_state(state_id, b) { + Some(id) => id, + // If there was no transition for this state and byte, then we know + // the haystack does not start with one of the patterns in our trie. + None => return false, + }; + if trie.is_match(state_id) { + return true; + } + } + false + } + +And that's pretty much it. All we do is move through the trie starting with the +bytes at the beginning of the haystack. If we find ourselves in a position +where we can't move, or if we've looked through the entire haystack without +seeing a match state, then we know the haystack does not start with any of the +patterns in the trie. + +The meat of the Aho-Corasick algorithm is in how we add failure transitions to +our trie to keep searching efficient. Specifically, it permits us to not only +check whether a haystack *starts* with any one of a number of patterns, but +rather, whether the haystack contains any of a number of patterns *anywhere* in +the haystack. + +As mentioned before, failure transitions connect a proper suffix of the path +traversed through the trie before, with a path that leads to a match that has a +prefix corresponding to that proper suffix. So in our case, for patterns `abcd` +and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from +the diagram above) from `S3` upon seeing that the byte following `c` is not +`d`. Namely, the proper suffix in this example is `c`, which is a prefix of +`cef`. So the modified diagram looks like this: + + + a - S1 - b - S2 - c - S3 - d - S4* + / / + / ---------------- + / / + S0 - c - S5 - e - S6 - f - S7* + +One thing that isn't shown in this diagram is that *all* states have a failure +transition, but only `S3` has a *non-trivial* failure transition. That is, all +other states have a failure transition back to the start state. So if our +haystack was `abzabcd`, then the searcher would transition back to `S0` after +seeing `z`, which effectively restarts the search. (Because there is no pattern +in our trie that has a prefix of `bz` or `z`.) 
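+
+To make the construction a little more concrete, here is a rough sketch of
+how those failure transitions can be computed for standard match semantics.
+This is not the code in `src/nfa.rs` (which deals with sparse and dense
+states, match lists and leftmost semantics, among other things); the `Node`
+type and its dense child table below are hypothetical and exist only to
+illustrate the breadth-first traversal:
+
+    use std::collections::VecDeque;
+
+    struct Node {
+        // Dense transitions: one slot per possible byte value.
+        children: Vec<Option<usize>>, // always has length 256
+        fail: usize,
+        is_match: bool,
+    }
+
+    // `nodes[0]` is assumed to be the start state of an already built trie.
+    fn build_failure_links(nodes: &mut Vec<Node>) {
+        let mut queue = VecDeque::new();
+        // States at depth 1 always fail back to the start state.
+        for b in 0..256 {
+            if let Some(child) = nodes[0].children[b] {
+                nodes[child].fail = 0;
+                queue.push_back(child);
+            }
+        }
+        // For deeper states, follow the parent's failure link until a
+        // state with a transition on the same byte is found (or the start
+        // state is reached). That transition's target becomes the child's
+        // failure link.
+        while let Some(id) = queue.pop_front() {
+            for b in 0..256 {
+                let child = match nodes[id].children[b] {
+                    None => continue,
+                    Some(child) => child,
+                };
+                let mut fail = nodes[id].fail;
+                while fail != 0 && nodes[fail].children[b].is_none() {
+                    fail = nodes[fail].fail;
+                }
+                nodes[child].fail = nodes[fail].children[b].unwrap_or(0);
+                // With standard semantics, a match at the failure state is
+                // also a match at this state.
+                if nodes[nodes[child].fail].is_match {
+                    nodes[child].is_match = true;
+                }
+                queue.push_back(child);
+            }
+        }
+    }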
+ +The code for traversing this *automaton* or *finite state machine* (it is no +longer just a trie) is not that much different from the `has_prefix` code +above: + + fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool { + let mut state_id = fsm.start(); + // If the empty pattern is in fsm, then state_id is a match state. + if fsm.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + // While the diagram above doesn't show this, we may wind up needing + // to follow multiple failure transitions before we land on a state + // in which we can advance. Therefore, when searching for the next + // state, we need to loop until we don't see a failure transition. + // + // This loop terminates because the start state has no empty + // transitions. Every transition from the start state either points to + // another state, or loops back to the start state. + loop { + match fsm.next_state(state_id, b) { + Some(id) => { + state_id = id; + break; + } + // Unlike our code above, if there was no transition for this + // state, then we don't quit. Instead, we look for this state's + // failure transition and follow that instead. + None => { + state_id = fsm.next_fail_state(state_id); + } + }; + } + if fsm.is_match(state_id) { + return true; + } + } + false + } + +Other than the complication around traversing failure transitions, this code +is still roughly "traverse the automaton with bytes from the haystack, and quit +when a match is seen." + +And that concludes our section on the basics. While we didn't go deep into +how the automaton is built (see `src/nfa.rs`, which has detailed comments about +that), the basic structure of Aho-Corasick should be reasonably clear. + + +# NFAs and DFAs + +There are generally two types of finite automata: non-deterministic finite +automata (NFA) and deterministic finite automata (DFA). The difference between +them is, principally, that an NFA can be in multiple states at once. This is +typically accomplished by things called _epsilon_ transitions, where one could +move to a new state without consuming any bytes from the input. (The other +mechanism by which NFAs can be in more than one state is where the same byte in +a particular state transitions to multiple distinct states.) In contrast, a DFA +can only ever be in one state at a time. A DFA has no epsilon transitions, and +for any given state, a byte transitions to at most one other state. + +By this formulation, the Aho-Corasick automaton described in the previous +section is an NFA. This is because failure transitions are, effectively, +epsilon transitions. That is, whenever the automaton is in state `S`, it is +actually in the set of states that are reachable by recursively following +failure transitions from `S`. (This means that, for example, the start state +is always active since the start state is reachable via failure transitions +from any state in the automaton.) + +NFAs have a lot of nice properties. They tend to be easier to construct, and +also tend to use less memory. However, their primary downside is that they are +typically slower to execute. For example, the code above showing how to search +with an Aho-Corasick automaton needs to potentially iterate through many +failure transitions for every byte of input. While this is a fairly small +amount of overhead, this can add up, especially if the automaton has a lot of +overlapping patterns with a lot of failure transitions. 
+ +A DFA's search code, by contrast, looks like this: + + fn contains(dfa: &DFA, haystack: &[u8]) -> bool { + let mut state_id = dfa.start(); + // If the empty pattern is in dfa, then state_id is a match state. + if dfa.is_match(state_id) { + return true; + } + for (i, &b) in haystack.iter().enumerate() { + // An Aho-Corasick DFA *never* has a missing state that requires + // failure transitions to be followed. One byte of input advances the + // automaton by one state. Always. + state_id = dfa.next_state(state_id, b); + if dfa.is_match(state_id) { + return true; + } + } + false + } + +The search logic here is much simpler than for the NFA, and this tends to +translate into significant performance benefits as well, since there's a lot +less work being done for each byte in the haystack. How is this accomplished? +It's done by pre-following all failure transitions for all states for all bytes +in the alphabet, and then building a single state transition table. Building +this DFA can be much more costly than building the NFA, and use much more +memory, but the better performance can be worth it. + +Users of this crate can actually choose between using an NFA or a DFA. By +default, an NFA is used, because it typically strikes the best balance between +space usage and search performance. But the DFA option is available for cases +where a little extra memory and upfront time building the automaton is okay. +For example, the `AhoCorasick::auto_configure` and +`AhoCorasickBuilder::auto_configure` methods will enable the DFA setting if +there are a small number of patterns. + + +# More DFA tricks + +As described in the previous section, one of the downsides of using a DFA +is that it uses more memory and can take longer to build. One small way of +mitigating these concerns is to map the alphabet used by the automaton into +a smaller space. Typically, the alphabet of a DFA has 256 elements in it: +one element for each possible value that fits into a byte. However, in many +cases, one does not need the full alphabet. For example, if all patterns in an +Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct +bytes. As far as the automaton is concerned, the rest of the 204 bytes are +indistinguishable from one another: they will never disrciminate between a +match or a non-match. Therefore, in cases like that, the alphabet can be shrunk +to just 53 elements. One for each ASCII letter, and then another to serve as a +placeholder for every other unused byte. + +In practice, this library doesn't quite compute the optimal set of equivalence +classes, but it's close enough in most cases. The key idea is that this then +allows the transition table for the DFA to be potentially much smaller. The +downside of doing this, however, is that since the transition table is defined +in terms of this smaller alphabet space, every byte in the haystack must be +re-mapped to this smaller space. This requires an additional 256-byte table. +In practice, this can lead to a small search time hit, but it can be difficult +to measure. Moreover, it can sometimes lead to faster search times for bigger +automata, since it could be difference between more parts of the automaton +staying in the CPU cache or not. + +One other trick for DFAs employed by this crate is the notion of premultiplying +state identifiers. 
Specifically, the normal way to compute the next transition +in a DFA is via the following (assuming that the transition table is laid out +sequentially in memory, in row-major order, where the rows are states): + + next_state_id = dfa.transitions[current_state_id * 256 + current_byte] + +However, since the value `256` is a fixed constant, we can actually premultiply +the state identifiers in the table when we build the table initially. Then, the +next transition computation simply becomes: + + next_state_id = dfa.transitions[current_state_id + current_byte] + +This doesn't seem like much, but when this is being executed for every byte of +input that you're searching, saving that extra multiplication instruction can +add up. + +The same optimization works even when equivalence classes are enabled, as +described above. The only difference is that the premultiplication is by the +total number of equivalence classes instead of 256. + +There isn't much downside to premultiplying state identifiers, other than the +fact that you may need to choose a bigger integer representation than you would +otherwise. For example, if you don't premultiply state identifiers, then an +automaton that uses `u8` as a state identifier can hold up to 256 states. +However, if they are premultiplied, then it can only hold up to +`floor(256 / len(alphabet))` states. Thus premultiplication impacts how compact +your DFA can be. In practice, it's pretty rare to use `u8` as a state +identifier, so premultiplication is usually a good thing to do. + +Both equivalence classes and premultiplication are tuneable parameters via the +`AhoCorasickBuilder` type, and both are enabled by default. + + +# Match semantics + +One of the more interesting things about this implementation of Aho-Corasick +that (as far as this author knows) separates it from other implementations, is +that it natively supports leftmost-first and leftmost-longest match semantics. +Briefly, match semantics refer to the decision procedure by which searching +will disambiguate matches when there are multiple to choose from: + +* **standard** match semantics emits matches as soon as they are detected by + the automaton. This is typically equivalent to the textbook non-overlapping + formulation of Aho-Corasick. +* **leftmost-first** match semantics means that 1) the next match is the match + starting at the leftmost position and 2) among multiple matches starting at + the same leftmost position, the match corresponding to the pattern provided + first by the caller is reported. +* **leftmost-longest** is like leftmost-first, except when there are multiple + matches starting at the same leftmost position, the pattern corresponding to + the longest match is returned. + +(The crate API documentation discusses these differences, with examples, in +more depth on the `MatchKind` type.) + +The reason why supporting these match semantics is important is because it +gives the user more control over the match procedure. For example, +leftmost-first permits users to implement match priority by simply putting the +higher priority patterns first. Leftmost-longest, on the other hand, permits +finding the longest possible match, which might be useful when trying to find +words matching a dictionary. Additionally, regex engines often want to use +Aho-Corasick as an optimization when searching for an alternation of literals. 
+In order to preserve correct match semantics, regex engines typically can't use +the standard textbook definition directly, since regex engines will implement +either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics. + +Supporting leftmost semantics requires a couple key changes: + +* Constructing the Aho-Corasick automaton changes a bit in both how the trie is + constructed and how failure transitions are found. Namely, only a subset of + the failure transitions are added. Specifically, only the failure transitions + that either do not occur after a match or do occur after a match but preserve + that match are kept. (More details on this can be found in `src/nfa.rs`.) +* The search algorithm changes slightly. Since we are looking for the leftmost + match, we cannot quit as soon as a match is detected. Instead, after a match + is detected, we must keep searching until either the end of the input or + until a dead state is seen. (Dead states are not used for standard match + semantics. Dead states mean that searching should stop after a match has been + found.) + +Other implementations of Aho-Corasick do support leftmost match semantics, but +they do it with more overhead at search time, or even worse, with a queue of +matches and sophisticated hijinks to disambiguate the matches. While our +construction algorithm becomes a bit more complicated, the correct match +semantics fall out from the structure of the automaton itself. + + +# Overlapping matches + +One of the nice properties of an Aho-Corasick automaton is that it can report +all possible matches, even when they overlap with one another. In this mode, +the match semantics don't matter, since all possible matches are reported. +Overlapping searches work just like regular searches, except the state +identifier at which the previous search left off is carried over to the next +search, so that it can pick up where it left off. If there are additional +matches at that state, then they are reported before resuming the search. + +Enabling leftmost-first or leftmost-longest match semantics causes the +automaton to use a subset of all failure transitions, which means that +overlapping searches cannot be used. Therefore, if leftmost match semantics are +used, attempting to do an overlapping search will panic. Thus, to get +overlapping searches, the caller must use the default standard match semantics. +This behavior was chosen because there are only two alternatives, which were +deemed worse: + +* Compile two automatons internally, one for standard semantics and one for + the semantics requested by the caller (if not standard). +* Create a new type, distinct from the `AhoCorasick` type, which has different + capabilities based on the configuration options. + +The first is untenable because of the amount of memory used by the automaton. +The second increases the complexity of the API too much by adding too many +types that do similar things. It is conceptually much simpler to keep all +searching isolated to a single type. Callers may query whether the automaton +supports overlapping searches via the `AhoCorasick::supports_overlapping` +method. + + +# Stream searching + +Since Aho-Corasick is an automaton, it is possible to do partial searches on +partial parts of the haystack, and then resume that search on subsequent pieces +of the haystack. This is useful when the haystack you're trying to search is +not stored contiguously in memory, or if one does not want to read the entire +haystack into memory at once. 
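+
+The resumable part is easy to sketch. The `Fsm` trait below is a hypothetical
+stand-in for the automatons shown earlier (it is not part of this crate's
+API), and the sketch only reports the *end* offsets of matches under standard
+semantics. The real implementation (see `src/ahocorasick.rs` and
+`src/buffer.rs`) must additionally buffer enough of the haystack to report
+full match spans, which is the restriction described below:
+
+    use std::io::{self, Read};
+
+    // Hypothetical interface, mirroring the `contains` code above. `next`
+    // is assumed to follow failure transitions internally.
+    trait Fsm {
+        fn start(&self) -> usize;
+        fn next(&self, state: usize, byte: u8) -> usize;
+        fn is_match(&self, state: usize) -> bool;
+    }
+
+    // Returns the absolute end offsets of all matches in the stream. Only
+    // the current state and a running offset are carried between chunks.
+    // (Handling of the empty pattern is elided for brevity.)
+    fn stream_match_ends<F: Fsm, R: Read>(
+        fsm: &F,
+        mut rdr: R,
+    ) -> io::Result<Vec<usize>> {
+        let mut buf = [0u8; 8 * 1024];
+        let mut state = fsm.start();
+        let mut offset = 0;
+        let mut ends = vec![];
+        loop {
+            let n = rdr.read(&mut buf)?;
+            if n == 0 {
+                return Ok(ends);
+            }
+            for (i, &b) in buf[..n].iter().enumerate() {
+                state = fsm.next(state, b);
+                if fsm.is_match(state) {
+                    ends.push(offset + i + 1);
+                }
+            }
+            offset += n;
+        }
+    }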
+ +Currently, only standard semantics are supported for stream searching. This is +some of the more complicated code in this crate, and is something I would very +much like to improve. In particular, it currently has the restriction that it +must buffer at least enough of the haystack in memory in order to fit the +longest possible match. The difficulty in getting stream searching right is +that the implementation choices (such as the buffer size) often impact what the +API looks like and what it's allowed to do. + + +# Prefilters + +In some cases, Aho-Corasick is not the fastest way to find matches containing +multiple patterns. Sometimes, the search can be accelerated using highly +optimized SIMD routines. For example, consider searching the following +patterns: + + Sherlock + Moriarty + Watson + +It is plausible that it would be much faster to quickly look for occurrences of +the leading bytes, `S`, `M` or `W`, before trying to start searching via the +automaton. Indeed, this is exactly what this crate will do. + +When there are more than three distinct starting bytes, then this crate will +look for three distinct bytes occurring at any position in the patterns, while +preferring bytes that are heuristically determined to be rare over others. For +example: + + Abuzz + Sanchez + Vasquez + Topaz + Waltz + +Here, we have more than 3 distinct starting bytes, but all of the patterns +contain `z`, which is typically a rare byte. In this case, the prefilter will +scan for `z`, back up a bit, and then execute the Aho-Corasick automaton. + +If all of that fails, then a packed multiple substring algorithm will be +attempted. Currently, the only algorithm available for this is Teddy, but more +may be added in the future. Teddy is unlike the above prefilters in that it +confirms its own matches, so when Teddy is active, it might not be necessary +for Aho-Corasick to run at all. (See `Automaton::leftmost_find_at_no_state_imp` +in `src/automaton.rs`.) However, the current Teddy implementation only works +in `x86_64` and when SSSE3 or AVX2 are available, and moreover, only works +_well_ when there are a small number of patterns (say, less than 100). Teddy +also requires the haystack to be of a certain length (more than 16-34 bytes). +When the haystack is shorter than that, Rabin-Karp is used instead. (See +`src/packed/rabinkarp.rs`.) + +There is a more thorough description of Teddy at +[`src/packed/teddy/README.md`](src/packed/teddy/README.md). diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..3b0a5dc --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..f033e01 --- /dev/null +++ b/README.md @@ -0,0 +1,187 @@ +aho-corasick +============ +A library for finding occurrences of many patterns at once with SIMD +acceleration in some cases. This library provides multiple pattern +search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a finite state machine for executing searches in linear time. +Features include case insensitive matching, overlapping matches, fast searching +via SIMD and optional full DFA construction and search & replace in streams. + +[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) +[![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick) + +Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). + + +### Documentation + +https://docs.rs/aho-corasick + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +aho-corasick = "0.7" +``` + + +### Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + + +### Example: case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +```rust +use aho_corasick::AhoCorasickBuilder; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + + +### Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. 
+let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns); +ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) + .expect("stream_replace_all failed"); +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +``` + + +### Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. + +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. For example, here's the standard approach: + +```rust +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +```rust +use aho_corasick::{AhoCorasickBuilder, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See `MatchKind` in the docs for more details. + + +### Minimum Rust version policy + +This crate's minimum supported `rustc` version is `1.41.1`. + +The current policy is that the minimum Rust version required to use this crate +can be increased in minor version updates. For example, if `crate 1.0` requires +Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust +1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum +version of Rust. + +In general, this crate will be conservative with respect to the minimum +supported version of Rust. + + +### FFI bindings + +* [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/) +is a Python wrapper for this library. + + +### Future work + +Here are some plans for the future: + +* Assuming the current API is sufficient, I'd like to commit to it and release + a `1.0` version of this crate some time in the next 6-12 months. +* Support stream searching with leftmost match semantics. Currently, only + standard match semantics are supported. Getting this right seems possible, + but is tricky since the match state needs to be propagated through multiple + searches. (With standard semantics, as soon as a match is seen the search + ends.) 
diff --git a/UNLICENSE b/UNLICENSE new file mode 100644 index 0000000..68a49da --- /dev/null +++ b/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..aa37a21 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/src/ahocorasick.rs b/src/ahocorasick.rs new file mode 100644 index 0000000..cfd74fd --- /dev/null +++ b/src/ahocorasick.rs @@ -0,0 +1,2141 @@ +use std::io; + +use crate::automaton::Automaton; +use crate::buffer::Buffer; +use crate::dfa::{self, DFA}; +use crate::error::Result; +use crate::nfa::{self, NFA}; +use crate::packed; +use crate::prefilter::{Prefilter, PrefilterState}; +use crate::state_id::StateID; +use crate::Match; + +/// An automaton for searching multiple strings in linear time. +/// +/// The `AhoCorasick` type supports a few basic ways of constructing an +/// automaton, including +/// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new) +/// and +/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). +/// However, there are a fair number of configurable options that can be set +/// by using +/// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) +/// instead. Such options include, but are not limited to, how matches are +/// determined, simple case insensitivity, whether to use a DFA or not and +/// various knobs for controlling the space-vs-time trade offs taken when +/// building the automaton. +/// +/// If you aren't sure where to start, try beginning with +/// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). +/// +/// # Resource usage +/// +/// Aho-Corasick automatons are always constructed in `O(p)` time, where `p` +/// is the combined length of all patterns being searched. With that said, +/// building an automaton can be fairly costly because of high constant +/// factors, particularly when enabling the +/// [DFA](struct.AhoCorasickBuilder.html#method.dfa) +/// option (which is disabled by default). For this reason, it's generally a +/// good idea to build an automaton once and reuse it as much as possible. +/// +/// Aho-Corasick automatons can also use a fair bit of memory. 
To get a +/// concrete idea of how much memory is being used, try using the +/// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes) +/// method. +/// +/// # Examples +/// +/// This example shows how to search for occurrences of multiple patterns +/// simultaneously in a case insensitive fashion. Each match includes the +/// pattern that matched along with the byte offsets of the match. +/// +/// ``` +/// use aho_corasick::AhoCorasickBuilder; +/// +/// let patterns = &["apple", "maple", "snapple"]; +/// let haystack = "Nobody likes maple in their apple flavored Snapple."; +/// +/// let ac = AhoCorasickBuilder::new() +/// .ascii_case_insensitive(true) +/// .build(patterns); +/// let mut matches = vec![]; +/// for mat in ac.find_iter(haystack) { +/// matches.push((mat.pattern(), mat.start(), mat.end())); +/// } +/// assert_eq!(matches, vec![ +/// (1, 13, 18), +/// (0, 28, 33), +/// (2, 43, 50), +/// ]); +/// ``` +/// +/// This example shows how to replace matches with some other string: +/// +/// ``` +/// use aho_corasick::AhoCorasick; +/// +/// let patterns = &["fox", "brown", "quick"]; +/// let haystack = "The quick brown fox."; +/// let replace_with = &["sloth", "grey", "slow"]; +/// +/// let ac = AhoCorasick::new(patterns); +/// let result = ac.replace_all(haystack, replace_with); +/// assert_eq!(result, "The slow grey sloth."); +/// ``` +#[derive(Clone, Debug)] +pub struct AhoCorasick { + imp: Imp, + match_kind: MatchKind, +} + +impl AhoCorasick { + /// Create a new Aho-Corasick automaton using the default configuration. + /// + /// The default configuration optimizes for less space usage, but at the + /// expense of longer search times. To change the configuration, use + /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) + /// for fine-grained control, or + /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) + /// for automatic configuration if you aren't sure which settings to pick. + /// + /// This uses the default + /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) + /// match semantics, which reports a match as soon as it is found. This + /// corresponds to the standard match semantics supported by textbook + /// descriptions of the Aho-Corasick algorithm. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "baz", + /// ]); + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// ``` + pub fn new(patterns: I) -> AhoCorasick + where + I: IntoIterator, + P: AsRef<[u8]>, + { + AhoCorasickBuilder::new().build(patterns) + } + + /// Build an Aho-Corasick automaton with an automatically determined + /// configuration. + /// + /// Specifically, this requires a slice of patterns instead of an iterator + /// since the configuration is determined by looking at the patterns before + /// constructing the automaton. The idea here is to balance space and time + /// automatically. That is, when searching a small number of patterns, this + /// will attempt to use the fastest possible configuration since the total + /// space required will be small anyway. As the number of patterns grows, + /// this will fall back to slower configurations that use less space. + /// + /// If you want auto configuration but with match semantics different from + /// the default `MatchKind::Standard`, then use + /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure). 
+ /// + /// # Examples + /// + /// Basic usage is just like `new`, except you must provide a slice: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new_auto_configured(&[ + /// "foo", "bar", "baz", + /// ]); + /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); + /// ``` + pub fn new_auto_configured(patterns: &[B]) -> AhoCorasick + where + B: AsRef<[u8]>, + { + AhoCorasickBuilder::new().auto_configure(patterns).build(patterns) + } +} + +impl AhoCorasick { + /// Returns true if and only if this automaton matches the haystack at any + /// position. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "foo", "bar", "quux", "baz", + /// ]); + /// assert!(ac.is_match("xxx bar xxx")); + /// assert!(!ac.is_match("xxx qux xxx")); + /// ``` + pub fn is_match>(&self, haystack: B) -> bool { + self.earliest_find(haystack).is_some() + } + + /// Returns the location of the first detected match in `haystack`. + /// + /// This method has the same behavior regardless of the + /// [`MatchKind`](enum.MatchKind.html) + /// of this automaton. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let ac = AhoCorasick::new(&[ + /// "abc", "b", + /// ]); + /// let mat = ac.earliest_find("abcd").expect("should have match"); + /// assert_eq!(1, mat.pattern()); + /// assert_eq!((1, 2), (mat.start(), mat.end())); + /// ``` + pub fn earliest_find>(&self, haystack: B) -> Option { + let mut prestate = PrefilterState::new(self.max_pattern_len()); + let mut start = self.imp.start_state(); + self.imp.earliest_find_at( + &mut prestate, + haystack.as_ref(), + 0, + &mut start, + ) + } + + /// Returns the location of the first match according to the match + /// semantics that this automaton was constructed with. + /// + /// When using `MatchKind::Standard`, this corresponds precisely to the + /// same behavior as + /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find). + /// Otherwise, match semantics correspond to either + /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) + /// or + /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest). + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. 
+ /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Now with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// And finally, leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); + /// ``` + pub fn find>(&self, haystack: B) -> Option { + let mut prestate = PrefilterState::new(self.max_pattern_len()); + self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0) + } + + /// Returns an iterator of non-overlapping matches, using the match + /// semantics that this automaton was constructed with. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. 
+ /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns); + /// let matches: Vec = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![2, 2, 2], matches); + /// ``` + /// + /// Now with leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let matches: Vec = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0, 2, 0], matches); + /// ``` + /// + /// And finally, leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns); + /// let matches: Vec = ac + /// .find_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0, 2, 1], matches); + /// ``` + pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindIter<'a, 'b, S> { + FindIter::new(self, haystack.as_ref()) + } + + /// Returns an iterator of overlapping matches in the given `haystack`. + /// + /// Overlapping matches can _only_ be detected using + /// `MatchKind::Standard` semantics. If this automaton was constructed with + /// leftmost semantics, then this method will panic. To determine whether + /// this will panic at runtime, use the + /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping) + /// method. + /// + /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. + /// This includes, but is not limited to, `String`, `&str`, `Vec`, and + /// `&[u8]` itself. + /// + /// # Panics + /// + /// This panics when `AhoCorasick::supports_overlapping` returns `false`. + /// That is, this panics when this automaton's match semantics are not + /// `MatchKind::Standard`. + /// + /// # Examples + /// + /// Basic usage, with standard semantics: + /// + /// ``` + /// use aho_corasick::AhoCorasick; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasick::new(patterns); + /// let matches: Vec = ac + /// .find_overlapping_iter(haystack) + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches); + /// ``` + pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindOverlappingIter<'a, 'b, S> { + FindOverlappingIter::new(self, haystack.as_ref()) + } + + /// Replace all matches with a corresponding value in the `replace_with` + /// slice given. Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// Replacements are determined by the index of the matching pattern. 
+ /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// # Panics + /// + /// This panics when `replace_with.len()` does not equal the total number + /// of patterns that are matched by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = "append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let result = ac.replace_all(haystack, &["x", "y", "z"]); + /// assert_eq!("x the z to the xage", result); + /// ``` + pub fn replace_all(&self, haystack: &str, replace_with: &[B]) -> String + where + B: AsRef, + { + assert_eq!( + replace_with.len(), + self.pattern_count(), + "replace_all requires a replacement for every pattern \ + in the automaton" + ); + let mut dst = String::with_capacity(haystack.len()); + self.replace_all_with(haystack, &mut dst, |mat, _, dst| { + dst.push_str(replace_with[mat.pattern()].as_ref()); + true + }); + dst + } + + /// Replace all matches using raw bytes with a corresponding value in the + /// `replace_with` slice given. Matches correspond to the same matches as + /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// Replacements are determined by the index of the matching pattern. + /// For example, if the pattern with index `2` is found, then it is + /// replaced by `replace_with[2]`. + /// + /// # Panics + /// + /// This panics when `replace_with.len()` does not equal the total number + /// of patterns that are matched by this automaton. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["append", "appendage", "app"]; + /// let haystack = b"append the app to the appendage"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); + /// assert_eq!(b"x the z to the xage".to_vec(), result); + /// ``` + pub fn replace_all_bytes( + &self, + haystack: &[u8], + replace_with: &[B], + ) -> Vec + where + B: AsRef<[u8]>, + { + assert_eq!( + replace_with.len(), + self.pattern_count(), + "replace_all_bytes requires a replacement for every pattern \ + in the automaton" + ); + let mut dst = Vec::with_capacity(haystack.len()); + self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { + dst.extend(replace_with[mat.pattern()].as_ref()); + true + }); + dst + } + + /// Replace all matches using a closure called on each match. + /// Matches correspond to the same matches as reported by + /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). + /// + /// The closure accepts three parameters: the match found, the text of + /// the match and a string buffer with which to write the replaced text + /// (if any). If the closure returns `true`, then it continues to the next + /// match. If the closure returns `false`, then searching is stopped. 
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    ///
+    /// let patterns = &["append", "appendage", "app"];
+    /// let haystack = "append the app to the appendage";
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .match_kind(MatchKind::LeftmostFirst)
+    ///     .build(patterns);
+    /// let mut result = String::new();
+    /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
+    ///     dst.push_str(&mat.pattern().to_string());
+    ///     true
+    /// });
+    /// assert_eq!("0 the 2 to the 0age", result);
+    /// ```
+    ///
+    /// Stopping the replacement by returning `false` (continued from the
+    /// example above):
+    ///
+    /// ```
+    /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    /// # let patterns = &["append", "appendage", "app"];
+    /// # let haystack = "append the app to the appendage";
+    /// # let ac = AhoCorasickBuilder::new()
+    /// #     .match_kind(MatchKind::LeftmostFirst)
+    /// #     .build(patterns);
+    /// let mut result = String::new();
+    /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
+    ///     dst.push_str(&mat.pattern().to_string());
+    ///     mat.pattern() != 2
+    /// });
+    /// assert_eq!("0 the 2 to the appendage", result);
+    /// ```
+    pub fn replace_all_with<F>(
+        &self,
+        haystack: &str,
+        dst: &mut String,
+        mut replace_with: F,
+    ) where
+        F: FnMut(&Match, &str, &mut String) -> bool,
+    {
+        let mut last_match = 0;
+        for mat in self.find_iter(haystack) {
+            dst.push_str(&haystack[last_match..mat.start()]);
+            last_match = mat.end();
+            if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
+                break;
+            };
+        }
+        dst.push_str(&haystack[last_match..]);
+    }
+
+    /// Replace all matches using raw bytes with a closure called on each
+    /// match. Matches correspond to the same matches as reported by
+    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+    ///
+    /// The closure accepts three parameters: the match found, the text of
+    /// the match and a byte buffer with which to write the replaced text
+    /// (if any). If the closure returns `true`, then it continues to the next
+    /// match. If the closure returns `false`, then searching is stopped.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    ///
+    /// let patterns = &["append", "appendage", "app"];
+    /// let haystack = b"append the app to the appendage";
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .match_kind(MatchKind::LeftmostFirst)
+    ///     .build(patterns);
+    /// let mut result = vec![];
+    /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+    ///     dst.extend(mat.pattern().to_string().bytes());
+    ///     true
+    /// });
+    /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
+    /// ```
+    ///
+    /// Stopping the replacement by returning `false` (continued from the
+    /// example above):
+    ///
+    /// ```
+    /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    /// # let patterns = &["append", "appendage", "app"];
+    /// # let haystack = b"append the app to the appendage";
+    /// # let ac = AhoCorasickBuilder::new()
+    /// #     .match_kind(MatchKind::LeftmostFirst)
+    /// #     .build(patterns);
+    /// let mut result = vec![];
+    /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
+    ///     dst.extend(mat.pattern().to_string().bytes());
+    ///     mat.pattern() != 2
+    /// });
+    /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
+    /// ```
+    pub fn replace_all_with_bytes<F>(
+        &self,
+        haystack: &[u8],
+        dst: &mut Vec<u8>,
+        mut replace_with: F,
+    ) where
+        F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
+    {
+        let mut last_match = 0;
+        for mat in self.find_iter(haystack) {
+            dst.extend(&haystack[last_match..mat.start()]);
+            last_match = mat.end();
+            if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
+                break;
+            };
+        }
+        dst.extend(&haystack[last_match..]);
+    }
+
+    /// Returns an iterator of non-overlapping matches in the given
+    /// stream. Matches correspond to the same matches as reported by
+    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+    ///
+    /// The matches yielded by this iterator use absolute position offsets in
+    /// the stream given, where the first byte has index `0`. Matches are
+    /// yielded until the stream is exhausted.
+    ///
+    /// Each item yielded by the iterator is an `io::Result<Match>`, where an
+    /// error is yielded if there was a problem reading from the reader given.
+    ///
+    /// When searching a stream, an internal buffer is used. Therefore, callers
+    /// should avoid providing a buffered reader, if possible.
+    ///
+    /// Searching a stream requires that the automaton was built with
+    /// `MatchKind::Standard` semantics. If this automaton was constructed
+    /// with leftmost semantics, then this method will panic. To determine
+    /// whether this will panic at runtime, use the
+    /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+    /// method.
+    ///
+    /// # Memory usage
+    ///
+    /// In general, searching streams will use a constant amount of memory for
+    /// its internal buffer. The one requirement is that the internal buffer
+    /// must be at least the size of the longest possible match. In most use
+    /// cases, the default buffer size will be much larger than any individual
+    /// match.
+    ///
+    /// # Panics
+    ///
+    /// This panics when `AhoCorasick::supports_stream` returns `false`.
+    /// That is, this panics when this automaton's match semantics are not
+    /// `MatchKind::Standard`. This restriction may be lifted in the future.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasick;
+    ///
+    /// # fn example() -> Result<(), ::std::io::Error> {
+    /// let patterns = &["append", "appendage", "app"];
+    /// let haystack = "append the app to the appendage";
+    ///
+    /// let ac = AhoCorasick::new(patterns);
+    /// let mut matches = vec![];
+    /// for result in ac.stream_find_iter(haystack.as_bytes()) {
+    ///     let mat = result?;
+    ///     matches.push(mat.pattern());
+    /// }
+    /// assert_eq!(vec![2, 2, 2], matches);
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn stream_find_iter<'a, R: io::Read>(
+        &'a self,
+        rdr: R,
+    ) -> StreamFindIter<'a, R, S> {
+        StreamFindIter::new(self, rdr)
+    }
+
+    /// Search for and replace all matches of this automaton in
+    /// the given reader, and write the replacements to the given
+    /// writer. Matches correspond to the same matches as reported by
+    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+    ///
+    /// Replacements are determined by the index of the matching pattern.
+    /// For example, if the pattern with index `2` is found, then it is
+    /// replaced by `replace_with[2]`.
+    ///
+    /// After all matches are replaced, the writer is _not_ flushed.
+    ///
+    /// If there was a problem reading from the given reader or writing to the
+    /// given writer, then the corresponding `io::Error` is returned and all
+    /// replacement is stopped.
+    ///
+    /// When searching a stream, an internal buffer is used. Therefore, callers
+    /// should avoid providing a buffered reader, if possible. However,
+    /// callers may want to provide a buffered writer.
+    ///
+    /// Searching a stream requires that the automaton was built with
+    /// `MatchKind::Standard` semantics. If this automaton was constructed
+    /// with leftmost semantics, then this method will panic. To determine
+    /// whether this will panic at runtime, use the
+    /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+    /// method.
+    ///
+    /// # Memory usage
+    ///
+    /// In general, searching streams will use a constant amount of memory for
+    /// its internal buffer. The one requirement is that the internal buffer
+    /// must be at least the size of the longest possible match. In most use
+    /// cases, the default buffer size will be much larger than any individual
+    /// match.
+    ///
+    /// # Panics
+    ///
+    /// This panics when `AhoCorasick::supports_stream` returns `false`.
+    /// That is, this panics when this automaton's match semantics are not
+    /// `MatchKind::Standard`. This restriction may be lifted in the future.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasick;
+    ///
+    /// # fn example() -> Result<(), ::std::io::Error> {
+    /// let patterns = &["fox", "brown", "quick"];
+    /// let haystack = "The quick brown fox.";
+    /// let replace_with = &["sloth", "grey", "slow"];
+    ///
+    /// let ac = AhoCorasick::new(patterns);
+    /// let mut result = vec![];
+    /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
+    /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn stream_replace_all<R, W, B>(
+        &self,
+        rdr: R,
+        wtr: W,
+        replace_with: &[B],
+    ) -> io::Result<()>
+    where
+        R: io::Read,
+        W: io::Write,
+        B: AsRef<[u8]>,
+    {
+        assert_eq!(
+            replace_with.len(),
+            self.pattern_count(),
+            "stream_replace_all requires a replacement for every pattern \
+             in the automaton"
+        );
+        self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
+            wtr.write_all(replace_with[mat.pattern()].as_ref())
+        })
+    }
+
+    /// Search the given reader and replace all matches of this automaton
+    /// using the given closure. The result is written to the given
+    /// writer. Matches correspond to the same matches as reported by
+    /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
+    ///
+    /// The closure accepts three parameters: the match found, the text of
+    /// the match and the writer with which to write the replaced text (if any).
+    ///
+    /// After all matches are replaced, the writer is _not_ flushed.
+    ///
+    /// If there was a problem reading from the given reader or writing to the
+    /// given writer, then the corresponding `io::Error` is returned and all
+    /// replacement is stopped.
+    ///
+    /// When searching a stream, an internal buffer is used. Therefore, callers
+    /// should avoid providing a buffered reader, if possible. However,
+    /// callers may want to provide a buffered writer.
+    ///
+    /// Searching a stream requires that the automaton was built with
+    /// `MatchKind::Standard` semantics. If this automaton was constructed
+    /// with leftmost semantics, then this method will panic. To determine
+    /// whether this will panic at runtime, use the
+    /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
+    /// method.
+    ///
+    /// # Memory usage
+    ///
+    /// In general, searching streams will use a constant amount of memory for
+    /// its internal buffer. The one requirement is that the internal buffer
+    /// must be at least the size of the longest possible match. In most use
+    /// cases, the default buffer size will be much larger than any individual
+    /// match.
+    ///
+    /// # Panics
+    ///
+    /// This panics when `AhoCorasick::supports_stream` returns `false`.
+    /// That is, this panics when this automaton's match semantics are not
+    /// `MatchKind::Standard`. This restriction may be lifted in the future.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use std::io::Write;
+    /// use aho_corasick::AhoCorasick;
+    ///
+    /// # fn example() -> Result<(), ::std::io::Error> {
+    /// let patterns = &["fox", "brown", "quick"];
+    /// let haystack = "The quick brown fox.";
+    ///
+    /// let ac = AhoCorasick::new(patterns);
+    /// let mut result = vec![];
+    /// ac.stream_replace_all_with(
+    ///     haystack.as_bytes(),
+    ///     &mut result,
+    ///     |mat, _, wtr| {
+    ///         wtr.write_all(mat.pattern().to_string().as_bytes())
+    ///     },
+    /// )?;
+    /// assert_eq!(b"The 2 1 0.".to_vec(), result);
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn stream_replace_all_with<R, W, F>(
+        &self,
+        rdr: R,
+        mut wtr: W,
+        mut replace_with: F,
+    ) -> io::Result<()>
+    where
+        R: io::Read,
+        W: io::Write,
+        F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
+    {
+        let mut it = StreamChunkIter::new(self, rdr);
+        while let Some(result) = it.next() {
+            let chunk = result?;
+            match chunk {
+                StreamChunk::NonMatch { bytes, .. } => {
+                    wtr.write_all(bytes)?;
+                }
+                StreamChunk::Match { bytes, mat } => {
+                    replace_with(&mat, bytes, &mut wtr)?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns the match kind used by this automaton.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::{AhoCorasick, MatchKind};
+    ///
+    /// let ac = AhoCorasick::new(&[
+    ///     "foo", "bar", "quux", "baz",
+    /// ]);
+    /// assert_eq!(&MatchKind::Standard, ac.match_kind());
+    /// ```
+    pub fn match_kind(&self) -> &MatchKind {
+        self.imp.match_kind()
+    }
+
+    /// Returns the length of the longest pattern matched by this automaton.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasick;
+    ///
+    /// let ac = AhoCorasick::new(&[
+    ///     "foo", "bar", "quux", "baz",
+    /// ]);
+    /// assert_eq!(4, ac.max_pattern_len());
+    /// ```
+    pub fn max_pattern_len(&self) -> usize {
+        self.imp.max_pattern_len()
+    }
+
+    /// Return the total number of patterns matched by this automaton.
+    ///
+    /// This includes patterns that may never participate in a match. For
+    /// example, if
+    /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
+    /// match semantics are used, and the patterns `Sam` and `Samwise` were
+    /// used to build the automaton, then `Samwise` can never participate in a
+    /// match because `Sam` will always take priority.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasick;
+    ///
+    /// let ac = AhoCorasick::new(&[
+    ///     "foo", "bar", "baz",
+    /// ]);
+    /// assert_eq!(3, ac.pattern_count());
+    /// ```
+    pub fn pattern_count(&self) -> usize {
+        self.imp.pattern_count()
+    }
+
+    /// Returns true if and only if this automaton supports reporting
+    /// overlapping matches.
+    ///
+    /// If this returns false and overlapping matches are requested, then it
+    /// will result in a panic.
+    ///
+    /// Since leftmost matching is inherently incompatible with overlapping
+    /// matches, only
+    /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+    /// supports overlapping matches. This is unlikely to change in the future.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .match_kind(MatchKind::Standard)
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert!(ac.supports_overlapping());
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .match_kind(MatchKind::LeftmostFirst)
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert!(!ac.supports_overlapping());
+    /// ```
+    pub fn supports_overlapping(&self) -> bool {
+        self.match_kind.supports_overlapping()
+    }
+
+    /// Returns true if and only if this automaton supports stream searching.
+    ///
+    /// If this returns false and stream searching (or replacing) is attempted,
+    /// then it will result in a panic.
+    ///
+    /// Currently, only
+    /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
+    /// supports streaming. This may be expanded in the future.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .match_kind(MatchKind::Standard)
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert!(ac.supports_stream());
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .match_kind(MatchKind::LeftmostFirst)
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert!(!ac.supports_stream());
+    /// ```
+    pub fn supports_stream(&self) -> bool {
+        self.match_kind.supports_stream()
+    }
+
+    /// Returns the approximate total amount of heap used by this automaton, in
+    /// units of bytes.
+    ///
+    /// # Examples
+    ///
+    /// This example shows the difference in heap usage between a few
+    /// configurations:
+    ///
+    /// ```ignore
+    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .dfa(false) // default
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert_eq!(10_336, ac.heap_bytes());
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .dfa(false) // default
+    ///     .ascii_case_insensitive(true)
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert_eq!(10_384, ac.heap_bytes());
+    ///
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .dfa(true)
+    ///     .ascii_case_insensitive(true)
+    ///     .build(&["foo", "bar", "baz"]);
+    /// assert_eq!(1_248, ac.heap_bytes());
+    /// ```
+    pub fn heap_bytes(&self) -> usize {
+        match self.imp {
+            Imp::NFA(ref nfa) => nfa.heap_bytes(),
+            Imp::DFA(ref dfa) => dfa.heap_bytes(),
+        }
+    }
+}
+
+/// The internal implementation of Aho-Corasick, which is either an NFA or
+/// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
+/// more memory.
+#[derive(Clone, Debug)]
+enum Imp<S: StateID> {
+    NFA(NFA<S>),
+    DFA(DFA<S>),
+}
+
+impl<S: StateID> Imp<S> {
+    /// Returns the type of match semantics implemented by this automaton.
+    fn match_kind(&self) -> &MatchKind {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.match_kind(),
+            Imp::DFA(ref dfa) => dfa.match_kind(),
+        }
+    }
+
+    /// Returns the identifier of the start state.
+    fn start_state(&self) -> S {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.start_state(),
+            Imp::DFA(ref dfa) => dfa.start_state(),
+        }
+    }
+
+    /// The length, in bytes, of the longest pattern in this automaton. This
+    /// information is useful for maintaining correct buffer sizes when
+    /// searching on streams.
+    fn max_pattern_len(&self) -> usize {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.max_pattern_len(),
+            Imp::DFA(ref dfa) => dfa.max_pattern_len(),
+        }
+    }
+
+    /// The total number of patterns added to this automaton. This includes
+    /// patterns that may never match. The maximum matching pattern that can be
+    /// reported is exactly one less than this number.
+    fn pattern_count(&self) -> usize {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.pattern_count(),
+            Imp::DFA(ref dfa) => dfa.pattern_count(),
+        }
+    }
+
+    /// Returns the prefilter object, if one exists, for the underlying
+    /// automaton.
+    fn prefilter(&self) -> Option<&dyn Prefilter> {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.prefilter(),
+            Imp::DFA(ref dfa) => dfa.prefilter(),
+        }
+    }
+
+    /// Returns true if and only if we should attempt to use a prefilter.
+    fn use_prefilter(&self) -> bool {
+        let p = match self.prefilter() {
+            None => return false,
+            Some(p) => p,
+        };
+        !p.looks_for_non_start_of_match()
+    }
+
+    #[inline(always)]
+    fn overlapping_find_at(
+        &self,
+        prestate: &mut PrefilterState,
+        haystack: &[u8],
+        at: usize,
+        state_id: &mut S,
+        match_index: &mut usize,
+    ) -> Option<Match> {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.overlapping_find_at(
+                prestate,
+                haystack,
+                at,
+                state_id,
+                match_index,
+            ),
+            Imp::DFA(ref dfa) => dfa.overlapping_find_at(
+                prestate,
+                haystack,
+                at,
+                state_id,
+                match_index,
+            ),
+        }
+    }
+
+    #[inline(always)]
+    fn earliest_find_at(
+        &self,
+        prestate: &mut PrefilterState,
+        haystack: &[u8],
+        at: usize,
+        state_id: &mut S,
+    ) -> Option<Match> {
+        match *self {
+            Imp::NFA(ref nfa) => {
+                nfa.earliest_find_at(prestate, haystack, at, state_id)
+            }
+            Imp::DFA(ref dfa) => {
+                dfa.earliest_find_at(prestate, haystack, at, state_id)
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn find_at_no_state(
+        &self,
+        prestate: &mut PrefilterState,
+        haystack: &[u8],
+        at: usize,
+    ) -> Option<Match> {
+        match *self {
+            Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
+            Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
+        }
+    }
+}
+
+/// An iterator of non-overlapping matches in a particular haystack.
+///
+/// This iterator yields matches according to the
+/// [`MatchKind`](enum.MatchKind.html)
+/// used by this automaton.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
+/// method.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'b` refers to the lifetime of the haystack being searched.
+#[derive(Debug)]
+pub struct FindIter<'a, 'b, S: StateID> {
+    fsm: &'a Imp<S>,
+    prestate: PrefilterState,
+    haystack: &'b [u8],
+    pos: usize,
+}
+
+impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
+    fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
+        let prestate = PrefilterState::new(ac.max_pattern_len());
+        FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
+    }
+}
+
+impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
+    type Item = Match;
+
+    fn next(&mut self) -> Option<Match> {
+        if self.pos > self.haystack.len() {
+            return None;
+        }
+        let result = self.fsm.find_at_no_state(
+            &mut self.prestate,
+            self.haystack,
+            self.pos,
+        );
+        let mat = match result {
+            None => return None,
+            Some(mat) => mat,
+        };
+        if mat.end() == self.pos {
+            // If the automaton can match the empty string and if we found an
+            // empty match, then we need to forcefully move the position.
+            self.pos += 1;
+        } else {
+            self.pos = mat.end();
+        }
+        Some(mat)
+    }
+}
+
+/// An iterator of overlapping matches in a particular haystack.
+///
+/// This iterator will report all possible matches in a particular haystack,
+/// even when the matches overlap.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
+/// method.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+///
+/// The lifetime `'b` refers to the lifetime of the haystack being searched.
+#[derive(Debug)]
+pub struct FindOverlappingIter<'a, 'b, S: StateID> {
+    fsm: &'a Imp<S>,
+    prestate: PrefilterState,
+    haystack: &'b [u8],
+    pos: usize,
+    state_id: S,
+    match_index: usize,
+}
+
+impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
+    fn new(
+        ac: &'a AhoCorasick<S>,
+        haystack: &'b [u8],
+    ) -> FindOverlappingIter<'a, 'b, S> {
+        assert!(
+            ac.supports_overlapping(),
+            "automaton does not support overlapping searches"
+        );
+        let prestate = PrefilterState::new(ac.max_pattern_len());
+        FindOverlappingIter {
+            fsm: &ac.imp,
+            prestate,
+            haystack,
+            pos: 0,
+            state_id: ac.imp.start_state(),
+            match_index: 0,
+        }
+    }
+}
+
+impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
+    type Item = Match;
+
+    fn next(&mut self) -> Option<Match> {
+        let result = self.fsm.overlapping_find_at(
+            &mut self.prestate,
+            self.haystack,
+            self.pos,
+            &mut self.state_id,
+            &mut self.match_index,
+        );
+        match result {
+            None => return None,
+            Some(m) => {
+                self.pos = m.end();
+                Some(m)
+            }
+        }
+    }
+}
+
+/// An iterator that reports Aho-Corasick matches in a stream.
+///
+/// This iterator yields elements of type `io::Result<Match>`, where an error
+/// is reported if there was a problem reading from the underlying stream.
+/// The iterator terminates only when the underlying stream reaches `EOF`.
+///
+/// This iterator is constructed via the
+/// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
+/// method.
+///
+/// The type variable `R` refers to the `io::Read` stream that is being read
+/// from.
+///
+/// The type variable `S` refers to the representation used for state
+/// identifiers. (By default, this is `usize`.)
+///
+/// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
+#[derive(Debug)]
+pub struct StreamFindIter<'a, R, S: StateID> {
+    it: StreamChunkIter<'a, R, S>,
+}
+
+impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
+    fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
+        StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
+    }
+}
+
+impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
+    type Item = io::Result<Match>;
+
+    fn next(&mut self) -> Option<io::Result<Match>> {
+        loop {
+            match self.it.next() {
+                None => return None,
+                Some(Err(err)) => return Some(Err(err)),
+                Some(Ok(StreamChunk::NonMatch { .. })) => {}
+                Some(Ok(StreamChunk::Match { mat, .. })) => {
+                    return Some(Ok(mat));
+                }
+            }
+        }
+    }
+}
+
+/// An iterator over chunks in an underlying reader. Each chunk either
+/// corresponds to non-matching bytes or matching bytes, but all bytes from
+/// the underlying reader are reported in sequence. There may be an arbitrary
+/// number of non-matching chunks before seeing a matching chunk.
+///
+/// N.B. This does not actually implement Iterator because we need to borrow
+/// from the underlying reader. But conceptually, it's still an iterator.
+#[derive(Debug)]
+struct StreamChunkIter<'a, R, S: StateID> {
+    /// The AC automaton.
+    fsm: &'a Imp<S>,
+    /// State associated with this automaton's prefilter. It is a heuristic
+    /// for stopping the prefilter if it's deemed ineffective.
+    prestate: PrefilterState,
+    /// The source of bytes we read from.
+    rdr: R,
+    /// A fixed size buffer. This is what we actually search. There are some
+    /// invariants around the buffer's size, namely, it must be big enough to
+    /// contain the longest possible match.
+    buf: Buffer,
+    /// The ID of the FSM state we're currently in.
+    state_id: S,
+    /// The current position at which to start the next search in `buf`.
+    search_pos: usize,
+    /// The absolute position of `search_pos`, where `0` corresponds to the
+    /// position of the first byte read from `rdr`.
+    absolute_pos: usize,
+    /// The ending position of the last StreamChunk that was returned to the
+    /// caller. This position is used to determine whether we need to emit
+    /// non-matching bytes before emitting a match.
+    report_pos: usize,
+    /// A match that should be reported on the next call.
+    pending_match: Option<Match>,
+    /// Enabled only when the automaton can match the empty string. When
+    /// enabled, we need to execute one final search after consuming the
+    /// reader to find the trailing empty match.
+    has_empty_match_at_end: bool,
+}
+
+/// A single chunk yielded by the stream chunk iterator.
+///
+/// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
+#[derive(Debug)]
+enum StreamChunk<'r> {
+    /// A chunk that does not contain any matches.
+    NonMatch { bytes: &'r [u8] },
+    /// A chunk that precisely contains a match.
+    Match { bytes: &'r [u8], mat: Match },
+}
+
+impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
+    fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
+        assert!(
+            ac.supports_stream(),
+            "stream searching is only supported for Standard match semantics"
+        );
+
+        let prestate = if ac.imp.use_prefilter() {
+            PrefilterState::new(ac.max_pattern_len())
+        } else {
+            PrefilterState::disabled()
+        };
+        let buf = Buffer::new(ac.imp.max_pattern_len());
+        let state_id = ac.imp.start_state();
+        StreamChunkIter {
+            fsm: &ac.imp,
+            prestate,
+            rdr,
+            buf,
+            state_id,
+            absolute_pos: 0,
+            report_pos: 0,
+            search_pos: 0,
+            pending_match: None,
+            has_empty_match_at_end: ac.is_match(""),
+        }
+    }
+
+    fn next(&mut self) -> Option<io::Result<StreamChunk>> {
+        loop {
+            if let Some(mut mat) = self.pending_match.take() {
+                let bytes = &self.buf.buffer()[mat.start()..mat.end()];
+                self.report_pos = mat.end();
+                mat = mat.increment(self.absolute_pos);
+                return Some(Ok(StreamChunk::Match { bytes, mat }));
+            }
+            if self.search_pos >= self.buf.len() {
+                if let Some(end) = self.unreported() {
+                    let bytes = &self.buf.buffer()[self.report_pos..end];
+                    self.report_pos = end;
+                    return Some(Ok(StreamChunk::NonMatch { bytes }));
+                }
+                if self.buf.len() >= self.buf.min_buffer_len() {
+                    // This is the point at which we roll our buffer, which we
+                    // only do if our buffer has at least the minimum amount of
+                    // bytes in it. Before rolling, we update our various
+                    // positions to be consistent with the buffer after it has
+                    // been rolled.
+
+                    self.report_pos -=
+                        self.buf.len() - self.buf.min_buffer_len();
+                    self.absolute_pos +=
+                        self.search_pos - self.buf.min_buffer_len();
+                    self.search_pos = self.buf.min_buffer_len();
+                    self.buf.roll();
+                }
+                match self.buf.fill(&mut self.rdr) {
+                    Err(err) => return Some(Err(err)),
+                    Ok(false) => {
+                        // We've hit EOF, but if there are still some
+                        // unreported bytes remaining, return them now.
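+                        //
+                        // (Descriptive note: anything in the buffer past
+                        // `report_pos` has been searched but not yet handed
+                        // back to the caller as a chunk, so it must be
+                        // flushed before we signal the end of the stream.)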
+                        if self.report_pos < self.buf.len() {
+                            let bytes = &self.buf.buffer()[self.report_pos..];
+                            self.report_pos = self.buf.len();
+
+                            let chunk = StreamChunk::NonMatch { bytes };
+                            return Some(Ok(chunk));
+                        } else {
+                            // We've reported everything, but there might still
+                            // be a match at the very last position.
+                            if !self.has_empty_match_at_end {
+                                return None;
+                            }
+                            // fallthrough for another search to get trailing
+                            // empty matches
+                            self.has_empty_match_at_end = false;
+                        }
+                    }
+                    Ok(true) => {}
+                }
+            }
+            let result = self.fsm.earliest_find_at(
+                &mut self.prestate,
+                self.buf.buffer(),
+                self.search_pos,
+                &mut self.state_id,
+            );
+            match result {
+                None => {
+                    self.search_pos = self.buf.len();
+                }
+                Some(mat) => {
+                    self.state_id = self.fsm.start_state();
+                    if mat.end() == self.search_pos {
+                        // If the automaton can match the empty string and if
+                        // we found an empty match, then we need to forcefully
+                        // move the position.
+                        self.search_pos += 1;
+                    } else {
+                        self.search_pos = mat.end();
+                    }
+                    self.pending_match = Some(mat.clone());
+                    if self.report_pos < mat.start() {
+                        let bytes =
+                            &self.buf.buffer()[self.report_pos..mat.start()];
+                        self.report_pos = mat.start();
+
+                        let chunk = StreamChunk::NonMatch { bytes };
+                        return Some(Ok(chunk));
+                    }
+                }
+            }
+        }
+    }
+
+    fn unreported(&self) -> Option<usize> {
+        let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
+        if self.report_pos < end {
+            Some(end)
+        } else {
+            None
+        }
+    }
+}
+
+/// A builder for configuring an Aho-Corasick automaton.
+#[derive(Clone, Debug)]
+pub struct AhoCorasickBuilder {
+    nfa_builder: nfa::Builder,
+    dfa_builder: dfa::Builder,
+    dfa: bool,
+}
+
+impl Default for AhoCorasickBuilder {
+    fn default() -> AhoCorasickBuilder {
+        AhoCorasickBuilder::new()
+    }
+}
+
+impl AhoCorasickBuilder {
+    /// Create a new builder for configuring an Aho-Corasick automaton.
+    ///
+    /// If you don't need fine grained configuration or aren't sure which knobs
+    /// to set, try using
+    /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
+    /// instead.
+    pub fn new() -> AhoCorasickBuilder {
+        AhoCorasickBuilder {
+            nfa_builder: nfa::Builder::new(),
+            dfa_builder: dfa::Builder::new(),
+            dfa: false,
+        }
+    }
+
+    /// Build an Aho-Corasick automaton using the configuration set on this
+    /// builder.
+    ///
+    /// A builder may be reused to create more automatons.
+    ///
+    /// This method will use the default for representing internal state
+    /// identifiers, which is `usize`. This guarantees that building the
+    /// automaton will succeed and is generally a good default, but can make
+    /// the size of the automaton 2-8 times bigger than it needs to be,
+    /// depending on your target platform.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasickBuilder;
+    ///
+    /// let patterns = &["foo", "bar", "baz"];
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .build(patterns);
+    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+    /// ```
+    pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
+    where
+        I: IntoIterator<Item = P>,
+        P: AsRef<[u8]>,
+    {
+        // The builder only returns an error if the chosen state ID
+        // representation is too small to fit all of the given patterns. In
+        // this case, since we fix the representation to usize, it will always
+        // work because it's impossible to overflow usize since the underlying
+        // storage would OOM long before that happens.
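+        //
+        // (In other words, the `expect` on the call below should never fire
+        // in practice.)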
+        self.build_with_size::<usize, I, P>(patterns)
+            .expect("usize state ID type should always work")
+    }
+
+    /// Build an Aho-Corasick automaton using the configuration set on this
+    /// builder with a specific state identifier representation. This only has
+    /// an effect when the `dfa` option is enabled.
+    ///
+    /// Generally, the choices for a state identifier representation are
+    /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
+    /// The advantage of choosing a smaller state identifier representation
+    /// is that the automaton produced will be smaller. This might be
+    /// beneficial for just generally using less space, or might even allow it
+    /// to fit more of the automaton in your CPU's cache, leading to overall
+    /// better search performance.
+    ///
+    /// Unlike the standard `build` method, this can report an error if the
+    /// state identifier representation cannot support the size of the
+    /// automaton.
+    ///
+    /// Note that the state identifier representation is determined by the
+    /// `S` type variable. This requires a type hint of some sort, either
+    /// by specifying the return type or using the turbofish, e.g.,
+    /// `build_with_size::<u16>(...)`.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
+    ///
+    /// # fn example() -> Result<(), ::aho_corasick::Error> {
+    /// let patterns = &["foo", "bar", "baz"];
+    /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
+    ///     .build_with_size(patterns)?;
+    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    ///
+    /// Or alternatively, with turbofish:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasickBuilder;
+    ///
+    /// # fn example() -> Result<(), ::aho_corasick::Error> {
+    /// let patterns = &["foo", "bar", "baz"];
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .build_with_size::<u8>(patterns)?;
+    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn build_with_size<S, I, P>(
+        &self,
+        patterns: I,
+    ) -> Result<AhoCorasick<S>, Error>
+    where
+        S: StateID,
+        I: IntoIterator<Item = P>,
+        P: AsRef<[u8]>,
+    {
+        let nfa = self.nfa_builder.build(patterns)?;
+        let match_kind = nfa.match_kind().clone();
+        let imp = if self.dfa {
+            let dfa = self.dfa_builder.build(&nfa)?;
+            Imp::DFA(dfa)
+        } else {
+            Imp::NFA(nfa)
+        };
+        Ok(AhoCorasick { imp, match_kind })
+    }
+
+    /// Automatically configure the settings on this builder according to the
+    /// patterns that will be used to construct the automaton.
+    ///
+    /// The idea here is to balance space and time automatically. That is, when
+    /// searching a small number of patterns, this will attempt to use the
+    /// fastest possible configuration since the total space required will be
+    /// small anyway. As the number of patterns grows, this will fall back to
+    /// slower configurations that use less space.
+    ///
+    /// This is guaranteed to never set `match_kind`, but any other option may
+    /// be overridden.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasickBuilder;
+    ///
+    /// let patterns = &["foo", "bar", "baz"];
+    /// let ac = AhoCorasickBuilder::new()
+    ///     .auto_configure(patterns)
+    ///     .build(patterns);
+    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
+    /// ```
+    pub fn auto_configure<B: AsRef<[u8]>>(
+        &mut self,
+        patterns: &[B],
+    ) -> &mut AhoCorasickBuilder {
+        // N.B. Currently we only use the length of `patterns` to make a
+        // decision here, and could therefore ask for an `ExactSizeIterator`
+        // instead. But it's conceivable that we might adapt this to look at
+        // the total number of bytes, which would require a second pass.
+        //
+        // The logic here is fairly rudimentary at the moment, but probably
+        // OK. The idea here is to use the fastest thing possible for a small
+        // number of patterns. That is, a DFA with no byte classes, since byte
+        // classes require an extra indirection for every byte searched. With a
+        // moderate number of patterns, we still want a DFA, but save on both
+        // space and compilation time by enabling byte classes. Finally, fall
+        // back to the slower but smaller NFA.
+        if patterns.len() <= 100 {
+            // N.B. Using byte classes can actually be faster by improving
+            // locality, but this only really applies for multi-megabyte
+            // automata (i.e., automata that don't fit in your CPU's cache).
+            self.dfa(true);
+        } else if patterns.len() <= 5000 {
+            self.dfa(true);
+        }
+        self
+    }
+
+    /// Set the desired match semantics.
+    ///
+    /// The default is `MatchKind::Standard`, which corresponds to the match
+    /// semantics supported by the standard textbook description of the
+    /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
+    /// are found. Moreover, this is the only way to get overlapping matches
+    /// or do stream searching.
+    ///
+    /// The other kinds of match semantics that are supported are
+    /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
+    /// corresponds to the match you would get if you were to try to match
+    /// each pattern at each position in the haystack in the same order that
+    /// you give to the automaton. That is, it returns the leftmost match
+    /// corresponding to the earliest pattern given to the automaton. The
+    /// latter corresponds to finding the longest possible match among all
+    /// leftmost matches.
+    ///
+    /// For more details on match semantics, see the
+    /// [documentation for `MatchKind`](enum.MatchKind.html).
+    ///
+    /// # Examples
+    ///
+    /// In these examples, we demonstrate the differences between match
+    /// semantics for a particular set of patterns in a specific order:
+    /// `b`, `abc`, `abcd`.
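+    /// (Briefly: with standard semantics, `b` wins below because it is the
+    /// first match to be *detected*, while with the leftmost kinds `abc` or
+    /// `abcd` win because they begin earlier in the haystack.)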
+ /// + /// Standard semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::Standard) // default, not necessary + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("b", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Leftmost-first semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostFirst) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); + /// ``` + /// + /// Leftmost-longest semantics: + /// + /// ``` + /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; + /// + /// let patterns = &["b", "abc", "abcd"]; + /// let haystack = "abcd"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .match_kind(MatchKind::LeftmostLongest) + /// .build(patterns); + /// let mat = ac.find(haystack).expect("should have a match"); + /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); + /// ``` + pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { + self.nfa_builder.match_kind(kind); + self + } + + /// Enable anchored mode, which requires all matches to start at the + /// first position in a haystack. + /// + /// This option is disabled by default. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["foo", "bar"]; + /// let haystack = "foobar"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .anchored(true) + /// .build(patterns); + /// assert_eq!(1, ac.find_iter(haystack).count()); + /// ``` + /// + /// When searching for overlapping matches, all matches that start at + /// the beginning of a haystack will be reported: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["foo", "foofoo"]; + /// let haystack = "foofoo"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .anchored(true) + /// .build(patterns); + /// assert_eq!(2, ac.find_overlapping_iter(haystack).count()); + /// // A non-anchored search would return 3 matches. + /// ``` + pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.nfa_builder.anchored(yes); + self + } + + /// Enable ASCII-aware case insensitive matching. + /// + /// When this option is enabled, searching will be performed without + /// respect to case for ASCII letters (`a-z` and `A-Z`) only. + /// + /// Enabling this option does not change the search algorithm, but it may + /// increase the size of the automaton. + /// + /// **NOTE:** It is unlikely that support for Unicode case folding will + /// be added in the future. The ASCII case works via a simple hack to the + /// underlying automaton, but full Unicode handling requires a fair bit of + /// sophistication. If you do need Unicode handling, you might consider + /// using the [`regex` crate](https://docs.rs/regex) or the lower level + /// [`regex-automata` crate](https://docs.rs/regex-automata). 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::AhoCorasickBuilder; + /// + /// let patterns = &["FOO", "bAr", "BaZ"]; + /// let haystack = "foo bar baz"; + /// + /// let ac = AhoCorasickBuilder::new() + /// .ascii_case_insensitive(true) + /// .build(patterns); + /// assert_eq!(3, ac.find_iter(haystack).count()); + /// ``` + pub fn ascii_case_insensitive( + &mut self, + yes: bool, + ) -> &mut AhoCorasickBuilder { + self.nfa_builder.ascii_case_insensitive(yes); + self + } + + /// Set the limit on how many NFA states use a dense representation for + /// their transitions. + /// + /// A dense representation uses more space, but supports faster access to + /// transitions at search time. Thus, this setting permits the control of a + /// space vs time trade off when using the NFA variant of Aho-Corasick. + /// + /// This limit is expressed in terms of the depth of a state, i.e., the + /// number of transitions from the starting state of the NFA. The idea is + /// that most of the time searching will be spent near the starting state + /// of the automaton, so states near the start state should use a dense + /// representation. States further away from the start state would then use + /// a sparse representation, which uses less space but is slower to access + /// transitions at search time. + /// + /// By default, this is set to a low but non-zero number. + /// + /// This setting has no effect if the `dfa` option is enabled. + pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder { + self.nfa_builder.dense_depth(depth); + self + } + + /// Compile the standard Aho-Corasick automaton into a deterministic finite + /// automaton (DFA). + /// + /// When this is disabled (which is the default), then a non-deterministic + /// finite automaton (NFA) is used instead. + /// + /// The main benefit to a DFA is that it can execute searches more quickly + /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the + /// DFA uses more space and can take much longer to build. + /// + /// Enabling this option does not change the time complexity for + /// constructing the Aho-Corasick automaton (which is `O(p)` where + /// `p` is the total number of patterns being compiled). Enabling this + /// option does however reduce the time complexity of non-overlapping + /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the + /// haystack. + /// + /// In general, it's a good idea to enable this if you're searching a + /// small number of fairly short patterns (~1000), or if you want the + /// fastest possible search without regard to compilation time or space + /// usage. + pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.dfa = yes; + self + } + + /// Enable heuristic prefilter optimizations. + /// + /// When enabled, searching will attempt to quickly skip to match + /// candidates using specialized literal search routines. A prefilter + /// cannot always be used, and is generally treated as a heuristic. It + /// can be useful to disable this if the prefilter is observed to be + /// sub-optimal for a particular workload. + /// + /// This is enabled by default. + pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.nfa_builder.prefilter(yes); + self + } + + /// Shrink the size of the transition alphabet by mapping bytes to their + /// equivalence classes. This only has an effect when the `dfa` option is + /// enabled. 
+ /// + /// When enabled, each a DFA will use a map from all possible bytes + /// to their corresponding equivalence class. Each equivalence class + /// represents a set of bytes that does not discriminate between a match + /// and a non-match in the DFA. For example, the patterns `bar` and `baz` + /// have at least five equivalence classes: singleton sets of `b`, `a`, `r` + /// and `z`, and a final set that contains every other byte. + /// + /// The advantage of this map is that the size of the transition table can + /// be reduced drastically from `#states * 256 * sizeof(id)` to + /// `#states * k * sizeof(id)` where `k` is the number of equivalence + /// classes. As a result, total space usage can decrease substantially. + /// Moreover, since a smaller alphabet is used, compilation becomes faster + /// as well. + /// + /// The disadvantage of this map is that every byte searched must be + /// passed through this map before it can be used to determine the next + /// transition. This has a small match time performance cost. However, if + /// the DFA is otherwise very large without byte classes, then using byte + /// classes can greatly improve memory locality and thus lead to better + /// overall performance. + /// + /// This option is enabled by default. + #[deprecated( + since = "0.7.16", + note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" + )] + pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.dfa_builder.byte_classes(yes); + self + } + + /// Premultiply state identifiers in the transition table. This only has + /// an effect when the `dfa` option is enabled. + /// + /// When enabled, state identifiers are premultiplied to point to their + /// corresponding row in the transition table. That is, given the `i`th + /// state, its corresponding premultiplied identifier is `i * k` where `k` + /// is the alphabet size of the automaton. (The alphabet size is at most + /// 256, but is in practice smaller if byte classes is enabled.) + /// + /// When state identifiers are not premultiplied, then the identifier of + /// the `i`th state is `i`. + /// + /// The advantage of premultiplying state identifiers is that is saves a + /// multiplication instruction per byte when searching with a DFA. This has + /// been observed to lead to a 20% performance benefit in micro-benchmarks. + /// + /// The primary disadvantage of premultiplying state identifiers is + /// that they require a larger integer size to represent. For example, + /// if the DFA has 200 states, then its premultiplied form requires 16 + /// bits to represent every possible state identifier, where as its + /// non-premultiplied form only requires 8 bits. + /// + /// This option is enabled by default. + #[deprecated( + since = "0.7.16", + note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57" + )] + pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { + self.dfa_builder.premultiply(yes); + self + } +} + +/// A knob for controlling the match semantics of an Aho-Corasick automaton. +/// +/// There are two generally different ways that Aho-Corasick automatons can +/// report matches. The first way is the "standard" approach that results from +/// implementing most textbook explanations of Aho-Corasick. The second way is +/// to report only the leftmost non-overlapping matches. 
The leftmost approach +/// is in turn split into two different ways of resolving ambiguous matches: +/// leftmost-first and leftmost-longest. +/// +/// The `Standard` match kind is the default and is the only one that supports +/// overlapping matches and stream searching. (Trying to find overlapping +/// or streaming matches using leftmost match semantics will result in a +/// panic.) The `Standard` match kind will report matches as they are seen. +/// When searching for overlapping matches, then all possible matches are +/// reported. When searching for non-overlapping matches, the first match seen +/// is reported. For example, for non-overlapping matches, given the patterns +/// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is +/// reported since it is detected first. The `abcd` match is never reported +/// since it overlaps with the `b` match. +/// +/// In contrast, the leftmost match kind always prefers the leftmost match +/// among all possible matches. Given the same example as above with `abcd` and +/// `b` as patterns and `abcdef` as the subject string, the leftmost match is +/// `abcd` since it begins before the `b` match, even though the `b` match is +/// detected before the `abcd` match. In this case, the `b` match is not +/// reported at all since it overlaps with the `abcd` match. +/// +/// The difference between leftmost-first and leftmost-longest is in how they +/// resolve ambiguous matches when there are multiple leftmost matches to +/// choose from. Leftmost-first always chooses the pattern that was provided +/// earliest, where as leftmost-longest always chooses the longest matching +/// pattern. For example, given the patterns `a` and `ab` and the subject +/// string `ab`, the leftmost-first match is `a` but the leftmost-longest match +/// is `ab`. Conversely, if the patterns were given in reverse order, i.e., +/// `ab` and `a`, then both the leftmost-first and leftmost-longest matches +/// would be `ab`. Stated differently, the leftmost-first match depends on the +/// order in which the patterns were given to the Aho-Corasick automaton. +/// Because of that, when leftmost-first matching is used, if a pattern `A` +/// that appears before a pattern `B` is a prefix of `B`, then it is impossible +/// to ever observe a match of `B`. +/// +/// If you're not sure which match kind to pick, then stick with the standard +/// kind, which is the default. In particular, if you need overlapping or +/// streaming matches, then you _must_ use the standard kind. The leftmost +/// kinds are useful in specific circumstances. For example, leftmost-first can +/// be very useful as a way to implement match priority based on the order of +/// patterns given and leftmost-longest can be useful for dictionary searching +/// such that only the longest matching words are reported. +/// +/// # Relationship with regular expression alternations +/// +/// Understanding match semantics can be a little tricky, and one easy way +/// to conceptualize non-overlapping matches from an Aho-Corasick automaton +/// is to think about them as a simple alternation of literals in a regular +/// expression. For example, let's say we wanted to match the strings +/// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It +/// turns out that regular expression engines have two different ways of +/// matching this alternation. The first way, leftmost-longest, is commonly +/// found in POSIX compatible implementations of regular expressions (such as +/// `grep`). 
The second way, leftmost-first, is commonly found in backtracking +/// implementations such as Perl. (Some regex engines, such as RE2 and Rust's +/// regex engine do not use backtracking, but still implement leftmost-first +/// semantics in an effort to match the behavior of dominant backtracking +/// regex engines such as those found in Perl, Ruby, Python, Javascript and +/// PHP.) +/// +/// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex +/// will match `Samwise` because it is the longest possible match, but a +/// Perl-like regex will match `Sam` since it appears earlier in the +/// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine +/// will never match `Samwise` since `Sam` will always have higher priority. +/// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to +/// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is +/// still longest match, but it also appears earlier than `Sam`. +/// +/// The "standard" match semantics of Aho-Corasick generally don't correspond +/// to the match semantics of any large group of regex implementations, so +/// there's no direct analogy that can be made here. Standard match semantics +/// are generally useful for overlapping matches, or if you just want to see +/// matches as they are detected. +/// +/// The main conclusion to draw from this section is that the match semantics +/// can be tweaked to precisely match either Perl-like regex alternations or +/// POSIX regex alternations. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Use standard match semantics, which support overlapping matches. When + /// used with non-overlapping matches, matches are reported as they are + /// seen. + Standard, + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will panic. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + /// + /// This does **not** support overlapping matches or stream searching. If + /// this match kind is used, attempting to find overlapping matches or + /// stream matches will panic. + LeftmostLongest, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +/// The default match kind is `MatchKind::Standard`. +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::Standard + } +} + +impl MatchKind { + fn supports_overlapping(&self) -> bool { + self.is_standard() + } + + fn supports_stream(&self) -> bool { + // TODO: It may be possible to support this. It's hard. 
+        //
+        // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
+        self.is_standard()
+    }
+
+    pub(crate) fn is_standard(&self) -> bool {
+        *self == MatchKind::Standard
+    }
+
+    pub(crate) fn is_leftmost(&self) -> bool {
+        *self == MatchKind::LeftmostFirst
+            || *self == MatchKind::LeftmostLongest
+    }
+
+    pub(crate) fn is_leftmost_first(&self) -> bool {
+        *self == MatchKind::LeftmostFirst
+    }
+
+    /// Convert this match kind into a packed match kind. If this match kind
+    /// corresponds to standard semantics, then this returns None, since
+    /// packed searching does not support standard semantics.
+    pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
+        match *self {
+            MatchKind::Standard => None,
+            MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
+            MatchKind::LeftmostLongest => {
+                Some(packed::MatchKind::LeftmostLongest)
+            }
+            MatchKind::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn oibits() {
+        use std::panic::{RefUnwindSafe, UnwindSafe};
+
+        fn assert_send<T: Send>() {}
+        fn assert_sync<T: Sync>() {}
+        fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
+
+        assert_send::<AhoCorasick<usize>>();
+        assert_sync::<AhoCorasick<usize>>();
+        assert_unwind_safe::<AhoCorasick<usize>>();
+        assert_send::<AhoCorasickBuilder>();
+        assert_sync::<AhoCorasickBuilder>();
+        assert_unwind_safe::<AhoCorasickBuilder>();
+    }
+}
diff --git a/src/automaton.rs b/src/automaton.rs
new file mode 100644
index 0000000..d88743a
--- /dev/null
+++ b/src/automaton.rs
@@ -0,0 +1,573 @@
+use crate::ahocorasick::MatchKind;
+use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
+use crate::state_id::{dead_id, fail_id, StateID};
+use crate::Match;
+
+// NOTE: This trait essentially started as a copy of the same trait from
+// regex-automata, with some wording changed since we use this trait for
+// NFAs in addition to DFAs in this crate. Additionally, we do not export
+// this trait. It's only used internally to reduce code duplication. The
+// regex-automata crate needs to expose it because its Regex type is generic
+// over implementations of this trait. In this crate, we encapsulate everything
+// behind the AhoCorasick type.
+//
+// This trait is a bit of a mess, but it's not quite clear how to fix it.
+// Basically, there are several competing concerns:
+//
+// * We need performance, so everything effectively needs to get monomorphized.
+// * There are several variations on searching Aho-Corasick automatons:
+//   overlapping, standard and leftmost. Overlapping and standard are somewhat
+//   combined together below, but there is no real way to combine standard with
+//   leftmost. Namely, leftmost requires continuing a search even after a match
+//   is found, in order to correctly disambiguate a match.
+// * On top of that, *sometimes* callers want to know which state the automaton
+//   is in after searching. This is principally useful for overlapping and
+//   stream searches. However, when callers don't care about this, we really
+//   do not want to be forced to compute it, since it sometimes requires extra
+//   work. Thus, there are effectively two copies of leftmost searching: one
+//   for tracking the state ID and one that doesn't. We should ideally do the
+//   same for standard searching, but my sanity stopped me.
+
+// SAFETY RATIONALE: Previously, the code below went to some length to remove
+// all bounds checks. This generally produced tighter assembly and led to
+// 20-50% improvements in micro-benchmarks on corpora made up of random
+// characters. This somewhat makes sense, since the branch predictor is going
+// to be at its worst on random text.
+// +// However, using the aho-corasick-debug tool and manually benchmarking +// different inputs, the code *with* bounds checks actually wound up being +// slightly faster: +// +// $ cat input +// Sherlock Holmes +// John Watson +// Professor Moriarty +// Irene Adler +// Mary Watson +// +// $ aho-corasick-debug-safe \ +// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa +// pattern read time: 32.824µs +// automaton build time: 444.687µs +// automaton heap usage: 72392 bytes +// match count: 639 +// count time: 1.809961702s +// +// $ aho-corasick-debug-master \ +// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa +// pattern read time: 31.425µs +// automaton build time: 317.434µs +// automaton heap usage: 72392 bytes +// match count: 639 +// count time: 2.059157705s +// +// I was able to reproduce this result on two different machines (an i5 and +// an i7). Therefore, we go the route of safe code for now. + +/// A trait describing the interface of an Aho-Corasick finite state machine. +/// +/// Every automaton has exactly one fail state, one dead state and exactly one +/// start state. Generally, these correspond to the first, second and third +/// states, respectively. The dead state is always treated as a sentinel. That +/// is, no correct Aho-Corasick automaton will ever transition into the fail +/// state. The dead state, however, can be transitioned into, but only when +/// leftmost-first or leftmost-longest match semantics are enabled and only +/// when at least one match has been observed. +/// +/// Every automaton also has one or more match states, such that +/// `Automaton::is_match_state(id)` returns `true` if and only if `id` +/// corresponds to a match state. +pub trait Automaton { + /// The representation used for state identifiers in this automaton. + /// + /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. + type ID: StateID; + + /// The type of matching that should be done. + fn match_kind(&self) -> &MatchKind; + + /// Returns true if and only if this automaton uses anchored searches. + fn anchored(&self) -> bool; + + /// An optional prefilter for quickly skipping to the next candidate match. + /// A prefilter must report at least every match, although it may report + /// positions that do not correspond to a match. That is, it must not allow + /// false negatives, but can allow false positives. + /// + /// Currently, a prefilter only runs when the automaton is in the start + /// state. That is, the position reported by a prefilter should always + /// correspond to the start of a potential match. + fn prefilter(&self) -> Option<&dyn Prefilter>; + + /// Return the identifier of this automaton's start state. + fn start_state(&self) -> Self::ID; + + /// Returns true if and only if the given state identifier refers to a + /// valid state. + fn is_valid(&self, id: Self::ID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a match + /// state. + /// + /// The state ID given must be valid, or else implementors may panic. + fn is_match_state(&self, id: Self::ID) -> bool; + + /// Returns true if and only if the given identifier corresponds to a state + /// that is either the dead state or a match state. + /// + /// Depending on the implementation of the automaton, this routine can + /// be used to save a branch in the core matching loop. Nevertheless, + /// `is_match_state(id) || id == dead_id()` is always a valid + /// implementation. Indeed, this is the default implementation. 
+ /// + /// The state ID given must be valid, or else implementors may panic. + fn is_match_or_dead_state(&self, id: Self::ID) -> bool { + id == dead_id() || self.is_match_state(id) + } + + /// If the given state is a match state, return the match corresponding + /// to the given match index. `end` must be the ending position of the + /// detected match. If no match exists or if `match_index` exceeds the + /// number of matches in this state, then `None` is returned. + /// + /// The state ID given must be valid, or else implementors may panic. + /// + /// If the given state ID is correct and if the `match_index` is less than + /// the number of matches for that state, then this is guaranteed to return + /// a match. + fn get_match( + &self, + id: Self::ID, + match_index: usize, + end: usize, + ) -> Option; + + /// Returns the number of matches for the given state. If the given state + /// is not a match state, then this returns 0. + /// + /// The state ID given must be valid, or else implementors must panic. + fn match_count(&self, id: Self::ID) -> usize; + + /// Given the current state that this automaton is in and the next input + /// byte, this method returns the identifier of the next state. The + /// identifier returned must always be valid and may never correspond to + /// the fail state. The returned identifier may, however, point to the + /// dead state. + /// + /// This is not safe so that implementors may look up the next state + /// without memory safety checks such as bounds checks. As such, callers + /// must ensure that the given identifier corresponds to a valid automaton + /// state. Implementors must, in turn, ensure that this routine is safe for + /// all valid state identifiers and for all possible `u8` values. + fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; + + /// Like next_state, but debug_asserts that the underlying + /// implementation never returns a `fail_id()` for the next state. + fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { + let next = self.next_state(current, input); + // We should never see a transition to the failure state. + debug_assert!( + next != fail_id(), + "automaton should never return fail_id for next state" + ); + next + } + + /// Execute a search using standard match semantics. + /// + /// This can be used even when the automaton was constructed with leftmost + /// match semantics when you want to find the earliest possible match. This + /// can also be used as part of an overlapping search implementation. + /// + /// N.B. This does not report a match if `state_id` is given as a matching + /// state. As such, this should not be used directly. + #[inline(always)] + fn standard_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + if let Some(pre) = self.prefilter() { + self.standard_find_at_imp( + prestate, + Some(pre), + haystack, + at, + state_id, + ) + } else { + self.standard_find_at_imp(prestate, None, haystack, at, state_id) + } + } + + // It's important for this to always be inlined. Namely, its only caller + // is standard_find_at, and the inlining should remove the case analysis + // for prefilter scanning when there is no prefilter available. 
+ #[inline(always)] + fn standard_find_at_imp( + &self, + prestate: &mut PrefilterState, + prefilter: Option<&dyn Prefilter>, + haystack: &[u8], + mut at: usize, + state_id: &mut Self::ID, + ) -> Option { + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && *state_id == self.start_state() + { + let c = prefilter::next(prestate, pre, haystack, at) + .into_option(); + match c { + None => return None, + Some(i) => { + at = i; + } + } + } + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + *state_id = self.next_state_no_fail(*state_id, haystack[at]); + at += 1; + // This routine always quits immediately after seeing a + // match, and since dead states can only come after seeing + // a match, seeing a dead state here is impossible. (Unless + // we have an anchored automaton, in which case, dead states + // are used to stop a search.) + debug_assert!( + *state_id != dead_id() || self.anchored(), + "standard find should never see a dead state" + ); + + if self.is_match_or_dead_state(*state_id) { + return if *state_id == dead_id() { + None + } else { + self.get_match(*state_id, 0, at) + }; + } + } + None + } + + /// Execute a search using leftmost (either first or longest) match + /// semantics. + /// + /// The principle difference between searching with standard semantics and + /// searching with leftmost semantics is that leftmost searching will + /// continue searching even after a match has been found. Once a match + /// is found, the search does not stop until either the haystack has been + /// exhausted or a dead state is observed in the automaton. (Dead states + /// only exist in automatons constructed with leftmost semantics.) That is, + /// we rely on the construction of the automaton to tell us when to quit. + #[inline(never)] + fn leftmost_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + if let Some(pre) = self.prefilter() { + self.leftmost_find_at_imp( + prestate, + Some(pre), + haystack, + at, + state_id, + ) + } else { + self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) + } + } + + // It's important for this to always be inlined. Namely, its only caller + // is leftmost_find_at, and the inlining should remove the case analysis + // for prefilter scanning when there is no prefilter available. + #[inline(always)] + fn leftmost_find_at_imp( + &self, + prestate: &mut PrefilterState, + prefilter: Option<&dyn Prefilter>, + haystack: &[u8], + mut at: usize, + state_id: &mut Self::ID, + ) -> Option { + debug_assert!(self.match_kind().is_leftmost()); + if self.anchored() && at > 0 && *state_id == self.start_state() { + return None; + } + let mut last_match = self.get_match(*state_id, 0, at); + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && *state_id == self.start_state() + { + let c = prefilter::next(prestate, pre, haystack, at) + .into_option(); + match c { + None => return None, + Some(i) => { + at = i; + } + } + } + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. 
`state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + *state_id = self.next_state_no_fail(*state_id, haystack[at]); + at += 1; + if self.is_match_or_dead_state(*state_id) { + if *state_id == dead_id() { + // The only way to enter into a dead state is if a match + // has been found, so we assert as much. This is different + // from normal automata, where you might enter a dead state + // if you know a subsequent match will never be found + // (regardless of whether a match has already been found). + // For Aho-Corasick, it is built so that we can match at + // any position, so the possibility of a match always + // exists. + // + // (Unless we have an anchored automaton, in which case, + // dead states are used to stop a search.) + debug_assert!( + last_match.is_some() || self.anchored(), + "dead state should only be seen after match" + ); + return last_match; + } + last_match = self.get_match(*state_id, 0, at); + } + } + last_match + } + + /// This is like leftmost_find_at, but does not need to track a caller + /// provided state id. In other words, the only output of this routine is a + /// match, if one exists. + /// + /// It is regrettable that we need to effectively copy a chunk of + /// implementation twice, but when we don't need to track the state ID, we + /// can allow the prefilter to report matches immediately without having + /// to re-confirm them with the automaton. The re-confirmation step is + /// necessary in leftmost_find_at because tracing through the automaton is + /// the only way to correctly set the state ID. (Perhaps an alternative + /// would be to keep a map from pattern ID to matching state ID, but that + /// complicates the code and still doesn't permit us to defer to the + /// prefilter entirely when possible.) + /// + /// I did try a few things to avoid the code duplication here, but nothing + /// optimized as well as this approach. (In microbenchmarks, there was + /// about a 25% difference.) + #[inline(never)] + fn leftmost_find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + if let Some(pre) = self.prefilter() { + self.leftmost_find_at_no_state_imp( + prestate, + Some(pre), + haystack, + at, + ) + } else { + self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) + } + } + + // It's important for this to always be inlined. Namely, its only caller + // is leftmost_find_at_no_state, and the inlining should remove the case + // analysis for prefilter scanning when there is no prefilter available. + #[inline(always)] + fn leftmost_find_at_no_state_imp( + &self, + prestate: &mut PrefilterState, + prefilter: Option<&dyn Prefilter>, + haystack: &[u8], + mut at: usize, + ) -> Option { + debug_assert!(self.match_kind().is_leftmost()); + if self.anchored() && at > 0 { + return None; + } + // If our prefilter handles confirmation of matches 100% of the + // time, and since we don't need to track state IDs, we can avoid + // Aho-Corasick completely. + if let Some(pre) = prefilter { + // We should never have a prefilter during an anchored search. 
+ debug_assert!(!self.anchored()); + if !pre.reports_false_positives() { + return match pre.next_candidate(prestate, haystack, at) { + Candidate::None => None, + Candidate::Match(m) => Some(m), + Candidate::PossibleStartOfMatch(_) => unreachable!(), + }; + } + } + + let mut state_id = self.start_state(); + let mut last_match = self.get_match(state_id, 0, at); + while at < haystack.len() { + if let Some(pre) = prefilter { + if prestate.is_effective(at) && state_id == self.start_state() + { + match prefilter::next(prestate, pre, haystack, at) { + Candidate::None => return None, + // Since we aren't tracking a state ID, we can + // quit early once we know we have a match. + Candidate::Match(m) => return Some(m), + Candidate::PossibleStartOfMatch(i) => { + at = i; + } + } + } + } + // CORRECTNESS: next_state is correct for all possible u8 values, + // so the only thing we're concerned about is the validity of + // `state_id`. `state_id` either comes from the caller (in which + // case, we assume it is correct), or it comes from the return + // value of next_state, which is guaranteed to be correct. + state_id = self.next_state_no_fail(state_id, haystack[at]); + at += 1; + if self.is_match_or_dead_state(state_id) { + if state_id == dead_id() { + // The only way to enter into a dead state is if a + // match has been found, so we assert as much. This + // is different from normal automata, where you might + // enter a dead state if you know a subsequent match + // will never be found (regardless of whether a match + // has already been found). For Aho-Corasick, it is + // built so that we can match at any position, so the + // possibility of a match always exists. + // + // (Unless we have an anchored automaton, in which + // case, dead states are used to stop a search.) + debug_assert!( + last_match.is_some() || self.anchored(), + "dead state should only be seen after match" + ); + return last_match; + } + last_match = self.get_match(state_id, 0, at); + } + } + last_match + } + + /// Execute an overlapping search. + /// + /// When executing an overlapping match, the previous state ID in addition + /// to the previous match index should be given. If there are more matches + /// at the given state, then the match is reported and the given index is + /// incremented. + #[inline(always)] + fn overlapping_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + match_index: &mut usize, + ) -> Option { + if self.anchored() && at > 0 && *state_id == self.start_state() { + return None; + } + + let match_count = self.match_count(*state_id); + if *match_index < match_count { + // This is guaranteed to return a match since + // match_index < match_count. + let result = self.get_match(*state_id, *match_index, at); + debug_assert!(result.is_some(), "must be a match"); + *match_index += 1; + return result; + } + + *match_index = 0; + match self.standard_find_at(prestate, haystack, at, state_id) { + None => None, + Some(m) => { + *match_index = 1; + Some(m) + } + } + } + + /// Return the earliest match found. This returns as soon as we know that + /// we have a match. As such, this does not necessarily correspond to the + /// leftmost starting match, but rather, the leftmost position at which a + /// match ends. 
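+    ///
+    /// For example (a minimal sketch of the observable behavior through the
+    /// public `AhoCorasick` API, using an illustrative pattern set): with
+    /// the patterns `abcd` and `bc` and the haystack `abcd`, the earliest
+    /// match is `bc`, since it ends at offset 3 while `abcd` does not end
+    /// until offset 4, even though `abcd` starts at an earlier offset.
+    ///
+    /// ```
+    /// use aho_corasick::AhoCorasick;
+    ///
+    /// // Default match semantics are standard, so `find` reports the
+    /// // earliest match seen.
+    /// let ac = AhoCorasick::new(&["abcd", "bc"]);
+    /// let mat = ac.find("abcd").expect("should have a match");
+    /// assert_eq!((1, 1, 3), (mat.pattern(), mat.start(), mat.end()));
+    /// ```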
+ #[inline(always)] + fn earliest_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + if *state_id == self.start_state() { + if self.anchored() && at > 0 { + return None; + } + if let Some(m) = self.get_match(*state_id, 0, at) { + return Some(m); + } + } + self.standard_find_at(prestate, haystack, at, state_id) + } + + /// A convenience function for finding the next match according to the + /// match semantics of this automaton. For standard match semantics, this + /// finds the earliest match. Otherwise, the leftmost match is found. + #[inline(always)] + fn find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut Self::ID, + ) -> Option { + match *self.match_kind() { + MatchKind::Standard => { + self.earliest_find_at(prestate, haystack, at, state_id) + } + MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { + self.leftmost_find_at(prestate, haystack, at, state_id) + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } + + /// Like find_at, but does not track state identifiers. This permits some + /// optimizations when a prefilter that confirms its own matches is + /// present. + #[inline(always)] + fn find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + match *self.match_kind() { + MatchKind::Standard => { + let mut state = self.start_state(); + self.earliest_find_at(prestate, haystack, at, &mut state) + } + MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { + self.leftmost_find_at_no_state(prestate, haystack, at) + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } +} diff --git a/src/buffer.rs b/src/buffer.rs new file mode 100644 index 0000000..e7339eb --- /dev/null +++ b/src/buffer.rs @@ -0,0 +1,132 @@ +use std::cmp; +use std::io; +use std::ptr; + +/// The default buffer capacity that we use for the stream buffer. +const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB + +/// A fairly simple roll buffer for supporting stream searches. +/// +/// This buffer acts as a temporary place to store a fixed amount of data when +/// reading from a stream. Its central purpose is to allow "rolling" some +/// suffix of the data to the beginning of the buffer before refilling it with +/// more data from the stream. For example, let's say we are trying to match +/// "foobar" on a stream. When we report the match, we'd like to not only +/// report the correct offsets at which the match occurs, but also the matching +/// bytes themselves. So let's say our stream is a file with the following +/// contents: `test test foobar test test`. Now assume that we happen to read +/// the aforementioned file in two chunks: `test test foo` and `bar test test`. +/// Naively, it would not be possible to report a single contiguous `foobar` +/// match, but this roll buffer allows us to do that. Namely, after the second +/// read, the contents of the buffer should be `st foobar test test`, where the +/// search should ultimately resume immediately after `foo`. (The prefix `st ` +/// is included because the roll buffer saves N bytes at the end of the buffer, +/// where N is the maximum possible length of a match.) +/// +/// A lot of the logic for dealing with this is unfortunately split out between +/// this roll buffer and the `StreamChunkIter`. +#[derive(Debug)] +pub struct Buffer { + /// The raw buffer contents. This has a fixed size and never increases. 
+ buf: Vec, + /// The minimum size of the buffer, which is equivalent to the maximum + /// possible length of a match. This corresponds to the amount that we + /// roll + min: usize, + /// The end of the contents of this buffer. + end: usize, +} + +impl Buffer { + /// Create a new buffer for stream searching. The minimum buffer length + /// given should be the size of the maximum possible match length. + pub fn new(min_buffer_len: usize) -> Buffer { + let min = cmp::max(1, min_buffer_len); + // The minimum buffer amount is also the amount that we roll our + // buffer in order to support incremental searching. To this end, + // our actual capacity needs to be at least 1 byte bigger than our + // minimum amount, otherwise we won't have any overlap. In actuality, + // we want our buffer to be a bit bigger than that for performance + // reasons, so we set a lower bound of `8 * min`. + // + // TODO: It would be good to find a way to test the streaming + // implementation with the minimal buffer size. For now, we just + // uncomment out the next line and comment out the subsequent line. + // let capacity = 1 + min; + let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); + Buffer { buf: vec![0; capacity], min, end: 0 } + } + + /// Return the contents of this buffer. + #[inline] + pub fn buffer(&self) -> &[u8] { + &self.buf[..self.end] + } + + /// Return the minimum size of the buffer. The only way a buffer may be + /// smaller than this is if the stream itself contains less than the + /// minimum buffer amount. + #[inline] + pub fn min_buffer_len(&self) -> usize { + self.min + } + + /// Return the total length of the contents in the buffer. + #[inline] + pub fn len(&self) -> usize { + self.end + } + + /// Return all free capacity in this buffer. + fn free_buffer(&mut self) -> &mut [u8] { + &mut self.buf[self.end..] + } + + /// Refill the contents of this buffer by reading as much as possible into + /// this buffer's free capacity. If no more bytes could be read, then this + /// returns false. Otherwise, this reads until it has filled the buffer + /// past the minimum amount. + pub fn fill(&mut self, mut rdr: R) -> io::Result { + let mut readany = false; + loop { + let readlen = rdr.read(self.free_buffer())?; + if readlen == 0 { + return Ok(readany); + } + readany = true; + self.end += readlen; + if self.len() >= self.min { + return Ok(true); + } + } + } + + /// Roll the contents of the buffer so that the suffix of this buffer is + /// moved to the front and all other contents are dropped. The size of the + /// suffix corresponds precisely to the minimum buffer length. + /// + /// This should only be called when the entire contents of this buffer have + /// been searched. + pub fn roll(&mut self) { + let roll_start = self + .end + .checked_sub(self.min) + .expect("buffer capacity should be bigger than minimum amount"); + let roll_len = self.min; + + assert!(roll_start + roll_len <= self.end); + unsafe { + // SAFETY: A buffer contains Copy data, so there's no problem + // moving it around. Safety also depends on our indices being in + // bounds, which they always should be, given the assert above. + // + // TODO: Switch to [T]::copy_within once our MSRV is high enough. 
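+            //
+            // For reference, on a newer MSRV the equivalent safe call would
+            // presumably look something like (sketch only):
+            //
+            //   self.buf.copy_within(roll_start..roll_start + roll_len, 0);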
+ ptr::copy( + self.buf[roll_start..].as_ptr(), + self.buf.as_mut_ptr(), + roll_len, + ); + } + self.end = roll_len; + } +} diff --git a/src/byte_frequencies.rs b/src/byte_frequencies.rs new file mode 100644 index 0000000..c313b62 --- /dev/null +++ b/src/byte_frequencies.rs @@ -0,0 +1,258 @@ +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' 
+ 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/src/classes.rs b/src/classes.rs new file mode 100644 index 0000000..f84ae21 --- /dev/null +++ b/src/classes.rs @@ -0,0 +1,238 @@ +use std::fmt; + +/// A representation of byte oriented equivalence classes. +/// +/// This is used in an FSM to reduce the size of the transition table. This can +/// have a particularly large impact not only on the total size of an FSM, but +/// also on compile times. +#[derive(Clone, Copy)] +pub struct ByteClasses([u8; 256]); + +impl ByteClasses { + /// Creates a new set of equivalence classes where all bytes are mapped to + /// the same class. + pub fn empty() -> ByteClasses { + ByteClasses([0; 256]) + } + + /// Creates a new set of equivalence classes where each byte belongs to + /// its own equivalence class. + pub fn singletons() -> ByteClasses { + let mut classes = ByteClasses::empty(); + for i in 0..256 { + classes.set(i as u8, i as u8); + } + classes + } + + /// Set the equivalence class for the given byte. + #[inline] + pub fn set(&mut self, byte: u8, class: u8) { + self.0[byte as usize] = class; + } + + /// Get the equivalence class for the given byte. + #[inline] + pub fn get(&self, byte: u8) -> u8 { + // SAFETY: This is safe because all dense transitions have + // exactly 256 elements, so all u8 values are valid indices. + self.0[byte as usize] + } + + /// Return the total number of elements in the alphabet represented by + /// these equivalence classes. Equivalently, this returns the total number + /// of equivalence classes. + #[inline] + pub fn alphabet_len(&self) -> usize { + self.0[255] as usize + 1 + } + + /// Returns true if and only if every byte in this class maps to its own + /// equivalence class. Equivalently, there are 256 equivalence classes + /// and each class contains exactly one byte. + #[inline] + pub fn is_singleton(&self) -> bool { + self.alphabet_len() == 256 + } + + /// Returns an iterator over a sequence of representative bytes from each + /// equivalence class. Namely, this yields exactly N items, where N is + /// equivalent to the number of equivalence classes. Each item is an + /// arbitrary byte drawn from each equivalence class. + /// + /// This is useful when one is determinizing an NFA and the NFA's alphabet + /// hasn't been converted to equivalence classes yet. Picking an arbitrary + /// byte from each equivalence class then permits a full exploration of + /// the NFA instead of using every possible byte value. + pub fn representatives(&self) -> ByteClassRepresentatives<'_> { + ByteClassRepresentatives { classes: self, byte: 0, last_class: None } + } + + /// Returns all of the bytes in the given equivalence class. + /// + /// The second element in the tuple indicates the number of elements in + /// the array. 
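+    ///
+    /// For example, with singleton classes (`ByteClasses::singletons()`),
+    /// `elements(b'a')` yields an array whose first entry is `b'a'` and a
+    /// reported length of `1`, since every byte is in its own class.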
+ fn elements(&self, equiv: u8) -> ([u8; 256], usize) { + let (mut array, mut len) = ([0; 256], 0); + for b in 0..256 { + if self.get(b as u8) == equiv { + array[len] = b as u8; + len += 1; + } + } + (array, len) + } +} + +impl fmt::Debug for ByteClasses { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.is_singleton() { + write!(f, "ByteClasses({{singletons}})") + } else { + write!(f, "ByteClasses(")?; + for equiv in 0..self.alphabet_len() { + let (members, len) = self.elements(equiv as u8); + write!(f, " {} => {:?}", equiv, &members[..len])?; + } + write!(f, ")") + } + } +} + +/// An iterator over representative bytes from each equivalence class. +#[derive(Debug)] +pub struct ByteClassRepresentatives<'a> { + classes: &'a ByteClasses, + byte: usize, + last_class: Option, +} + +impl<'a> Iterator for ByteClassRepresentatives<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + while self.byte < 256 { + let byte = self.byte as u8; + let class = self.classes.get(byte); + self.byte += 1; + + if self.last_class != Some(class) { + self.last_class = Some(class); + return Some(byte); + } + } + None + } +} + +/// A byte class builder keeps track of an *approximation* of equivalence +/// classes of bytes during NFA construction. That is, every byte in an +/// equivalence class cannot discriminate between a match and a non-match. +/// +/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w] +/// and [{-\xFF] never discriminate between a match and a non-match, precisely +/// because they never occur in the literals anywhere. +/// +/// Note though that this does not necessarily compute the minimal set of +/// equivalence classes. For example, in the literals above, the byte ranges +/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence +/// classes even though they could be treated a single class. The reason for +/// this is implementation complexity. In the future, we should endeavor to +/// compute the minimal equivalence classes since they can have a rather large +/// impact on the size of the DFA. +/// +/// The representation here is 256 booleans, all initially set to false. Each +/// boolean maps to its corresponding byte based on position. A `true` value +/// indicates the end of an equivalence class, where its corresponding byte +/// and all of the bytes corresponding to all previous contiguous `false` +/// values are in the same equivalence class. +/// +/// This particular representation only permits contiguous ranges of bytes to +/// be in the same equivalence class, which means that we can never discover +/// the true minimal set of equivalence classes. +#[derive(Debug)] +pub struct ByteClassBuilder(Vec); + +impl ByteClassBuilder { + /// Create a new builder of byte classes where all bytes are part of the + /// same equivalence class. + pub fn new() -> ByteClassBuilder { + ByteClassBuilder(vec![false; 256]) + } + + /// Indicate the the range of byte given (inclusive) can discriminate a + /// match between it and all other bytes outside of the range. + pub fn set_range(&mut self, start: u8, end: u8) { + debug_assert!(start <= end); + if start > 0 { + self.0[start as usize - 1] = true; + } + self.0[end as usize] = true; + } + + /// Build byte classes that map all byte values to their corresponding + /// equivalence class. The last mapping indicates the largest equivalence + /// class identifier (which is never bigger than 255). 
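+    ///
+    /// For example (a sketch mirroring the `byte_classes` unit test below):
+    ///
+    /// ```ignore
+    /// let mut builder = ByteClassBuilder::new();
+    /// builder.set_range(b'a', b'z');
+    /// let classes = builder.build();
+    /// // Three classes result: [\x00-`] => 0, [a-z] => 1, [{-\xFF] => 2.
+    /// assert_eq!(3, classes.alphabet_len());
+    /// assert_eq!(1, classes.get(b'm'));
+    /// ```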
+ pub fn build(&self) -> ByteClasses { + let mut classes = ByteClasses::empty(); + let mut class = 0u8; + let mut i = 0; + loop { + classes.set(i as u8, class as u8); + if i >= 255 { + break; + } + if self.0[i] { + class = class.checked_add(1).unwrap(); + } + i += 1; + } + classes + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn byte_classes() { + let mut set = ByteClassBuilder::new(); + set.set_range(b'a', b'z'); + + let classes = set.build(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(b'a' - 1), 0); + assert_eq!(classes.get(b'a'), 1); + assert_eq!(classes.get(b'm'), 1); + assert_eq!(classes.get(b'z'), 1); + assert_eq!(classes.get(b'z' + 1), 2); + assert_eq!(classes.get(254), 2); + assert_eq!(classes.get(255), 2); + + let mut set = ByteClassBuilder::new(); + set.set_range(0, 2); + set.set_range(4, 6); + let classes = set.build(); + assert_eq!(classes.get(0), 0); + assert_eq!(classes.get(1), 0); + assert_eq!(classes.get(2), 0); + assert_eq!(classes.get(3), 1); + assert_eq!(classes.get(4), 2); + assert_eq!(classes.get(5), 2); + assert_eq!(classes.get(6), 2); + assert_eq!(classes.get(7), 3); + assert_eq!(classes.get(255), 3); + } + + #[test] + fn full_byte_classes() { + let mut set = ByteClassBuilder::new(); + for i in 0..256u16 { + set.set_range(i as u8, i as u8); + } + assert_eq!(set.build().alphabet_len(), 256); + } +} diff --git a/src/dfa.rs b/src/dfa.rs new file mode 100644 index 0000000..a03a254 --- /dev/null +++ b/src/dfa.rs @@ -0,0 +1,713 @@ +use std::mem::size_of; + +use crate::ahocorasick::MatchKind; +use crate::automaton::Automaton; +use crate::classes::ByteClasses; +use crate::error::Result; +use crate::nfa::{PatternID, PatternLength, NFA}; +use crate::prefilter::{Prefilter, PrefilterObj, PrefilterState}; +use crate::state_id::{dead_id, fail_id, premultiply_overflow_error, StateID}; +use crate::Match; + +#[derive(Clone, Debug)] +pub enum DFA { + Standard(Standard), + ByteClass(ByteClass), + Premultiplied(Premultiplied), + PremultipliedByteClass(PremultipliedByteClass), +} + +impl DFA { + fn repr(&self) -> &Repr { + match *self { + DFA::Standard(ref dfa) => dfa.repr(), + DFA::ByteClass(ref dfa) => dfa.repr(), + DFA::Premultiplied(ref dfa) => dfa.repr(), + DFA::PremultipliedByteClass(ref dfa) => dfa.repr(), + } + } + + pub fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + pub fn heap_bytes(&self) -> usize { + self.repr().heap_bytes + } + + pub fn max_pattern_len(&self) -> usize { + self.repr().max_pattern_len + } + + pub fn pattern_count(&self) -> usize { + self.repr().pattern_count + } + + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + pub fn start_state(&self) -> S { + self.repr().start_id + } + + #[inline(always)] + pub fn overlapping_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut S, + match_index: &mut usize, + ) -> Option { + match *self { + DFA::Standard(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + DFA::ByteClass(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + DFA::Premultiplied(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + DFA::PremultipliedByteClass(ref dfa) => dfa.overlapping_find_at( + prestate, + haystack, + at, + state_id, + match_index, + ), + } + } + + #[inline(always)] 
+ pub fn earliest_find_at( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + state_id: &mut S, + ) -> Option { + match *self { + DFA::Standard(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + DFA::ByteClass(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + DFA::Premultiplied(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + DFA::PremultipliedByteClass(ref dfa) => { + dfa.earliest_find_at(prestate, haystack, at, state_id) + } + } + } + + #[inline(always)] + pub fn find_at_no_state( + &self, + prestate: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Option { + match *self { + DFA::Standard(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + DFA::ByteClass(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + DFA::Premultiplied(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + DFA::PremultipliedByteClass(ref dfa) => { + dfa.find_at_no_state(prestate, haystack, at) + } + } + } +} + +#[derive(Clone, Debug)] +pub struct Standard(Repr); + +impl Standard { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for Standard { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + id.to_usize() < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + self.repr().get_match(id, match_index, end) + } + + fn match_count(&self, id: S) -> usize { + self.repr().match_count(id) + } + + fn next_state(&self, current: S, input: u8) -> S { + let o = current.to_usize() * 256 + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct ByteClass(Repr); + +impl ByteClass { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for ByteClass { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + id.to_usize() < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + self.repr().get_match(id, match_index, end) + } + + fn match_count(&self, id: S) -> usize { + self.repr().match_count(id) + } + + fn next_state(&self, current: S, input: u8) -> S { + let alphabet_len = self.repr().byte_classes.alphabet_len(); + let input = self.repr().byte_classes.get(input); + let o = current.to_usize() * alphabet_len + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct Premultiplied(Repr); + +impl Premultiplied { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for Premultiplied { + type ID = S; + + fn 
match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + (id.to_usize() / 256) < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + if id > self.repr().max_match { + return None; + } + self.repr() + .matches + .get(id.to_usize() / 256) + .and_then(|m| m.get(match_index)) + .map(|&(id, len)| Match { pattern: id, len, end }) + } + + fn match_count(&self, id: S) -> usize { + let o = id.to_usize() / 256; + self.repr().matches[o].len() + } + + fn next_state(&self, current: S, input: u8) -> S { + let o = current.to_usize() + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct PremultipliedByteClass(Repr); + +impl PremultipliedByteClass { + fn repr(&self) -> &Repr { + &self.0 + } +} + +impl Automaton for PremultipliedByteClass { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.repr().match_kind + } + + fn anchored(&self) -> bool { + self.repr().anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.repr().prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.repr().start_id + } + + fn is_valid(&self, id: S) -> bool { + (id.to_usize() / self.repr().alphabet_len()) < self.repr().state_count + } + + fn is_match_state(&self, id: S) -> bool { + self.repr().is_match_state(id) + } + + fn is_match_or_dead_state(&self, id: S) -> bool { + self.repr().is_match_or_dead_state(id) + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + if id > self.repr().max_match { + return None; + } + self.repr() + .matches + .get(id.to_usize() / self.repr().alphabet_len()) + .and_then(|m| m.get(match_index)) + .map(|&(id, len)| Match { pattern: id, len, end }) + } + + fn match_count(&self, id: S) -> usize { + let o = id.to_usize() / self.repr().alphabet_len(); + self.repr().matches[o].len() + } + + fn next_state(&self, current: S, input: u8) -> S { + let input = self.repr().byte_classes.get(input); + let o = current.to_usize() + input as usize; + self.repr().trans[o] + } +} + +#[derive(Clone, Debug)] +pub struct Repr { + match_kind: MatchKind, + anchored: bool, + premultiplied: bool, + start_id: S, + /// The length, in bytes, of the longest pattern in this automaton. This + /// information is useful for keeping correct buffer sizes when searching + /// on streams. + max_pattern_len: usize, + /// The total number of patterns added to this automaton. This includes + /// patterns that may never match. + pattern_count: usize, + state_count: usize, + max_match: S, + /// The number of bytes of heap used by this NFA's transition table. + heap_bytes: usize, + /// A prefilter for quickly detecting candidate matchs, if pertinent. + prefilter: Option, + byte_classes: ByteClasses, + trans: Vec, + matches: Vec>, +} + +impl Repr { + /// Returns the total alphabet size for this DFA. + /// + /// If byte classes are enabled, then this corresponds to the number of + /// equivalence classes. If they are disabled, then this is always 256. 
+ fn alphabet_len(&self) -> usize { + self.byte_classes.alphabet_len() + } + + /// Returns true only if the given state is a match state. + fn is_match_state(&self, id: S) -> bool { + id <= self.max_match && id > dead_id() + } + + /// Returns true only if the given state is either a dead state or a match + /// state. + fn is_match_or_dead_state(&self, id: S) -> bool { + id <= self.max_match + } + + /// Get the ith match for the given state, where the end position of a + /// match was found at `end`. + /// + /// # Panics + /// + /// The caller must ensure that the given state identifier is valid, + /// otherwise this may panic. The `match_index` need not be valid. That is, + /// if the given state has no matches then this returns `None`. + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + if id > self.max_match { + return None; + } + self.matches + .get(id.to_usize()) + .and_then(|m| m.get(match_index)) + .map(|&(id, len)| Match { pattern: id, len, end }) + } + + /// Return the total number of matches for the given state. + /// + /// # Panics + /// + /// The caller must ensure that the given identifier is valid, or else + /// this panics. + fn match_count(&self, id: S) -> usize { + self.matches[id.to_usize()].len() + } + + /// Get the next state given `from` as the current state and `byte` as the + /// current input byte. + fn next_state(&self, from: S, byte: u8) -> S { + let alphabet_len = self.alphabet_len(); + let byte = self.byte_classes.get(byte); + self.trans[from.to_usize() * alphabet_len + byte as usize] + } + + /// Set the `byte` transition for the `from` state to point to `to`. + fn set_next_state(&mut self, from: S, byte: u8, to: S) { + let alphabet_len = self.alphabet_len(); + let byte = self.byte_classes.get(byte); + self.trans[from.to_usize() * alphabet_len + byte as usize] = to; + } + + /// Swap the given states in place. + fn swap_states(&mut self, id1: S, id2: S) { + assert!(!self.premultiplied, "can't swap states in premultiplied DFA"); + + let o1 = id1.to_usize() * self.alphabet_len(); + let o2 = id2.to_usize() * self.alphabet_len(); + for b in 0..self.alphabet_len() { + self.trans.swap(o1 + b, o2 + b); + } + self.matches.swap(id1.to_usize(), id2.to_usize()); + } + + /// This routine shuffles all match states in this DFA to the beginning + /// of the DFA such that every non-match state appears after every match + /// state. (With one exception: the special fail and dead states remain as + /// the first two states.) + /// + /// The purpose of doing this shuffling is to avoid an extra conditional + /// in the search loop, and in particular, detecting whether a state is a + /// match or not does not need to access any memory. + /// + /// This updates `self.max_match` to point to the last matching state as + /// well as `self.start` if the starting state was moved. 
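+    ///
+    /// As a small illustration (hypothetical state numbering): if the DFA
+    /// has states `0..=5`, where `0` is the fail state, `1` is the dead
+    /// state and the match states originally sit at `4` and `5`, then after
+    /// shuffling the match states occupy `2` and `3` and `max_match` is set
+    /// to `3`, so the single comparison `id <= max_match` is enough to
+    /// detect match-or-dead states in the search loop.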
+ fn shuffle_match_states(&mut self) { + assert!( + !self.premultiplied, + "cannot shuffle match states of premultiplied DFA" + ); + + if self.state_count <= 1 { + return; + } + + let mut first_non_match = self.start_id.to_usize(); + while first_non_match < self.state_count + && self.matches[first_non_match].len() > 0 + { + first_non_match += 1; + } + + let mut swaps: Vec = vec![fail_id(); self.state_count]; + let mut cur = self.state_count - 1; + while cur > first_non_match { + if self.matches[cur].len() > 0 { + self.swap_states( + S::from_usize(cur), + S::from_usize(first_non_match), + ); + swaps[cur] = S::from_usize(first_non_match); + swaps[first_non_match] = S::from_usize(cur); + + first_non_match += 1; + while first_non_match < cur + && self.matches[first_non_match].len() > 0 + { + first_non_match += 1; + } + } + cur -= 1; + } + for id in (0..self.state_count).map(S::from_usize) { + let alphabet_len = self.alphabet_len(); + let offset = id.to_usize() * alphabet_len; + for next in &mut self.trans[offset..offset + alphabet_len] { + if swaps[next.to_usize()] != fail_id() { + *next = swaps[next.to_usize()]; + } + } + } + if swaps[self.start_id.to_usize()] != fail_id() { + self.start_id = swaps[self.start_id.to_usize()]; + } + self.max_match = S::from_usize(first_non_match - 1); + } + + fn premultiply(&mut self) -> Result<()> { + if self.premultiplied || self.state_count <= 1 { + return Ok(()); + } + + let alpha_len = self.alphabet_len(); + premultiply_overflow_error( + S::from_usize(self.state_count - 1), + alpha_len, + )?; + + for id in (2..self.state_count).map(S::from_usize) { + let offset = id.to_usize() * alpha_len; + for next in &mut self.trans[offset..offset + alpha_len] { + if *next == dead_id() { + continue; + } + *next = S::from_usize(next.to_usize() * alpha_len); + } + } + self.premultiplied = true; + self.start_id = S::from_usize(self.start_id.to_usize() * alpha_len); + self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len); + Ok(()) + } + + /// Computes the total amount of heap used by this NFA in bytes. + fn calculate_size(&mut self) { + let mut size = (self.trans.len() * size_of::()) + + (self.matches.len() + * size_of::>()); + for state_matches in &self.matches { + size += + state_matches.len() * size_of::<(PatternID, PatternLength)>(); + } + size += self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()); + self.heap_bytes = size; + } +} + +/// A builder for configuring the determinization of an NFA into a DFA. +#[derive(Clone, Debug)] +pub struct Builder { + premultiply: bool, + byte_classes: bool, +} + +impl Builder { + /// Create a new builder for a DFA. + pub fn new() -> Builder { + Builder { premultiply: true, byte_classes: true } + } + + /// Build a DFA from the given NFA. + /// + /// This returns an error if the state identifiers exceed their + /// representation size. This can only happen when state ids are + /// premultiplied (which is enabled by default). 
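+    ///
+    /// As a sketch of what premultiplication does (illustrative numbers):
+    /// with an alphabet length of 256, state `3` is renumbered to
+    /// `3 * 256 = 768`, so a transition can be looked up as
+    /// `trans[state + byte]` instead of `trans[state * 256 + byte]`. The
+    /// renumbered IDs are larger, which is why overflow of the chosen state
+    /// ID representation becomes possible here.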
+ pub fn build(&self, nfa: &NFA) -> Result> { + let byte_classes = if self.byte_classes { + nfa.byte_classes().clone() + } else { + ByteClasses::singletons() + }; + let alphabet_len = byte_classes.alphabet_len(); + let trans = vec![fail_id(); alphabet_len * nfa.state_len()]; + let matches = vec![vec![]; nfa.state_len()]; + let mut repr = Repr { + match_kind: nfa.match_kind().clone(), + anchored: nfa.anchored(), + premultiplied: false, + start_id: nfa.start_state(), + max_pattern_len: nfa.max_pattern_len(), + pattern_count: nfa.pattern_count(), + state_count: nfa.state_len(), + max_match: fail_id(), + heap_bytes: 0, + prefilter: nfa.prefilter_obj().map(|p| p.clone()), + byte_classes: byte_classes.clone(), + trans, + matches, + }; + for id in (0..nfa.state_len()).map(S::from_usize) { + repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id)); + + let fail = nfa.failure_transition(id); + nfa.iter_all_transitions(&byte_classes, id, |b, mut next| { + if next == fail_id() { + next = nfa_next_state_memoized(nfa, &repr, id, fail, b); + } + repr.set_next_state(id, b, next); + }); + } + repr.shuffle_match_states(); + repr.calculate_size(); + if self.premultiply { + repr.premultiply()?; + if byte_classes.is_singleton() { + Ok(DFA::Premultiplied(Premultiplied(repr))) + } else { + Ok(DFA::PremultipliedByteClass(PremultipliedByteClass(repr))) + } + } else { + if byte_classes.is_singleton() { + Ok(DFA::Standard(Standard(repr))) + } else { + Ok(DFA::ByteClass(ByteClass(repr))) + } + } + } + + /// Whether to use byte classes or in the DFA. + pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { + self.byte_classes = yes; + self + } + + /// Whether to premultiply state identifier in the DFA. + pub fn premultiply(&mut self, yes: bool) -> &mut Builder { + self.premultiply = yes; + self + } +} + +/// This returns the next NFA transition (including resolving failure +/// transitions), except once it sees a state id less than the id of the DFA +/// state that is currently being populated, then we no longer need to follow +/// failure transitions and can instead query the pre-computed state id from +/// the DFA itself. +/// +/// In general, this should only be called when a failure transition is seen. +fn nfa_next_state_memoized( + nfa: &NFA, + dfa: &Repr, + populating: S, + mut current: S, + input: u8, +) -> S { + loop { + if current < populating { + return dfa.next_state(current, input); + } + let next = nfa.next_state(current, input); + if next != fail_id() { + return next; + } + current = nfa.failure_transition(current); + } +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..a57a777 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,101 @@ +use std::error; +use std::fmt; +use std::result; + +pub type Result = result::Result; + +/// An error that occurred during the construction of an Aho-Corasick +/// automaton. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +/// The kind of error that occurred. +#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurs when constructing an automaton would require the + /// use of a state ID that overflows the chosen state ID representation. + /// For example, if one is using `u8` for state IDs and builds a DFA with + /// 257 states, then the last state's ID will be `256` which cannot be + /// represented with `u8`. + StateIDOverflow { + /// The maximum possible state ID. 
+ max: usize, + }, + /// An error that occurs when premultiplication of state IDs is requested + /// when constructing an Aho-Corasick DFA, but doing so would overflow the + /// chosen state ID representation. + /// + /// When `max == requested_max`, then the state ID would overflow `usize`. + PremultiplyOverflow { + /// The maximum possible state id. + max: usize, + /// The maximum ID required by premultiplication. + requested_max: usize, + }, +} + +impl Error { + /// Return the kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + pub(crate) fn state_id_overflow(max: usize) -> Error { + Error { kind: ErrorKind::StateIDOverflow { max } } + } + + pub(crate) fn premultiply_overflow( + max: usize, + requested_max: usize, + ) -> Error { + Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } } + } +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::StateIDOverflow { .. } => { + "state id representation too small" + } + ErrorKind::PremultiplyOverflow { .. } => { + "state id representation too small for premultiplication" + } + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.kind { + ErrorKind::StateIDOverflow { max } => write!( + f, + "building the automaton failed because it required \ + building more states that can be identified, where the \ + maximum ID for the chosen representation is {}", + max, + ), + ErrorKind::PremultiplyOverflow { max, requested_max } => { + if max == requested_max { + write!( + f, + "premultiplication of states requires the ability to \ + represent a state ID greater than what can fit on \ + this platform's usize, which is {}", + ::std::usize::MAX, + ) + } else { + write!( + f, + "premultiplication of states requires the ability to \ + represent at least a state ID of {}, but the chosen \ + representation only permits a maximum state ID of {}", + requested_max, max, + ) + } + } + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..4465a56 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,303 @@ +/*! +A library for finding occurrences of many patterns at once. This library +provides multiple pattern search principally through an implementation of the +[Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), +which builds a fast finite state machine for executing searches in linear time. + +Additionally, this library provides a number of configuration options for +building the automaton that permit controlling the space versus time trade +off. Other features include simple ASCII case insensitive matching, finding +overlapping matches, replacements, searching streams and even searching and +replacing text in streams. + +Finally, unlike all other (known) Aho-Corasick implementations, this one +supports enabling +[leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) +or +[leftmost-longest](enum.MatchKind.html#variant.LeftmostFirst) +match semantics, using a (seemingly) novel alternative construction algorithm. +For more details on what match semantics means, see the +[`MatchKind`](enum.MatchKind.html) +type. + +# Overview + +This section gives a brief overview of the primary types in this crate: + +* [`AhoCorasick`](struct.AhoCorasick.html) is the primary type and represents + an Aho-Corasick automaton. This is the type you use to execute searches. 
+* [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) can be used to build + an Aho-Corasick automaton, and supports configuring a number of options. +* [`Match`](struct.Match.html) represents a single match reported by an + Aho-Corasick automaton. Each match has two pieces of information: the pattern + that matched and the start and end byte offsets corresponding to the position + in the haystack at which it matched. + +Additionally, the [`packed`](packed/index.html) sub-module contains a lower +level API for using fast vectorized routines for finding a small number of +patterns in a haystack. + +# Example: basic searching + +This example shows how to search for occurrences of multiple patterns +simultaneously. Each match includes the pattern that matched along with the +byte offsets of the match. + +``` +use aho_corasick::AhoCorasick; + +let patterns = &["apple", "maple", "Snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasick::new(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + +# Example: case insensitivity + +This is like the previous example, but matches `Snapple` case insensitively +using `AhoCorasickBuilder`: + +``` +use aho_corasick::AhoCorasickBuilder; + +let patterns = &["apple", "maple", "snapple"]; +let haystack = "Nobody likes maple in their apple flavored Snapple."; + +let ac = AhoCorasickBuilder::new() + .ascii_case_insensitive(true) + .build(patterns); +let mut matches = vec![]; +for mat in ac.find_iter(haystack) { + matches.push((mat.pattern(), mat.start(), mat.end())); +} +assert_eq!(matches, vec![ + (1, 13, 18), + (0, 28, 33), + (2, 43, 50), +]); +``` + +# Example: replacing matches in a stream + +This example shows how to execute a search and replace on a stream without +loading the entire stream into memory first. + +``` +use aho_corasick::AhoCorasick; + +# fn example() -> Result<(), ::std::io::Error> { +let patterns = &["fox", "brown", "quick"]; +let replace_with = &["sloth", "grey", "slow"]; + +// In a real example, these might be `std::fs::File`s instead. All you need to +// do is supply a pair of `std::io::Read` and `std::io::Write` implementations. +let rdr = "The quick brown fox."; +let mut wtr = vec![]; + +let ac = AhoCorasick::new(patterns); +ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; +assert_eq!(b"The slow grey sloth.".to_vec(), wtr); +# Ok(()) }; example().unwrap() +``` + +# Example: finding the leftmost first match + +In the textbook description of Aho-Corasick, its formulation is typically +structured such that it reports all possible matches, even when they overlap +with another. In many cases, overlapping matches may not be desired, such as +the case of finding all successive non-overlapping matches like you might with +a standard regular expression. + +Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do +this doesn't always work in the expected way, since it will report matches as +soon as they are seen. For example, consider matching the regex `Samwise|Sam` +against the text `Samwise`. Most regex engines (that are Perl-like, or +non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick +algorithm modified for reporting non-overlapping matches will report `Sam`. 
+ +A novel contribution of this library is the ability to change the match +semantics of Aho-Corasick (without additional search time overhead) such that +`Samwise` is reported instead. For example, here's the standard approach: + +``` +use aho_corasick::AhoCorasick; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasick::new(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Sam", &haystack[mat.start()..mat.end()]); +``` + +And now here's the leftmost-first version, which matches how a Perl-like +regex will work: + +``` +use aho_corasick::{AhoCorasickBuilder, MatchKind}; + +let patterns = &["Samwise", "Sam"]; +let haystack = "Samwise"; + +let ac = AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .build(patterns); +let mat = ac.find(haystack).expect("should have a match"); +assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); +``` + +In addition to leftmost-first semantics, this library also supports +leftmost-longest semantics, which match the POSIX behavior of a regular +expression alternation. See +[`MatchKind`](enum.MatchKind.html) +for more details. + +# Prefilters + +While an Aho-Corasick automaton can perform admirably when compared to more +naive solutions, it is generally slower than more specialized algorithms that +are accelerated using vector instructions such as SIMD. + +For that reason, this library will internally use a "prefilter" to attempt +to accelerate searches when possible. Currently, this library has several +different algorithms it might use depending on the patterns provided. Once the +number of patterns gets too big, prefilters are no longer used. + +While a prefilter is generally good to have on by default since it works +well in the common case, it can lead to less predictable or even sub-optimal +performance in some cases. For that reason, prefilters can be explicitly +disabled via +[`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter). +*/ + +#![deny(missing_docs)] + +// We can never be truly no_std, but we could be alloc-only some day, so +// require the std feature for now. +#[cfg(not(feature = "std"))] +compile_error!("`std` feature is currently required to build this crate"); + +// #[cfg(doctest)] +// #[macro_use] +// extern crate doc_comment; + +// #[cfg(doctest)] +// doctest!("../README.md"); + +pub use crate::ahocorasick::{ + AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind, + StreamFindIter, +}; +pub use crate::error::{Error, ErrorKind}; +pub use crate::state_id::StateID; + +mod ahocorasick; +mod automaton; +mod buffer; +mod byte_frequencies; +mod classes; +mod dfa; +mod error; +mod nfa; +pub mod packed; +mod prefilter; +mod state_id; +#[cfg(test)] +mod tests; + +/// A representation of a match reported by an Aho-Corasick automaton. +/// +/// A match has two essential pieces of information: the identifier of the +/// pattern that matched, along with the start and end offsets of the match +/// in the haystack. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use aho_corasick::AhoCorasick; +/// +/// let ac = AhoCorasick::new(&[ +/// "foo", "bar", "baz", +/// ]); +/// let mat = ac.find("xxx bar xxx").expect("should have a match"); +/// assert_eq!(1, mat.pattern()); +/// assert_eq!(4, mat.start()); +/// assert_eq!(7, mat.end()); +/// ``` +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct Match { + /// The pattern id. 
+ pattern: usize, + /// The length of this match, such that the starting position of the match + /// is `end - len`. + /// + /// We use length here because, other than the pattern id, the only + /// information about each pattern that the automaton stores is its length. + /// So using the length here is just a bit more natural. But it isn't + /// technically required. + len: usize, + /// The end offset of the match, exclusive. + end: usize, +} + +impl Match { + /// Returns the identifier of the pattern that matched. + /// + /// The identifier of a pattern is derived from the position in which it + /// was originally inserted into the corresponding automaton. The first + /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` + /// and so on. + #[inline] + pub fn pattern(&self) -> usize { + self.pattern + } + + /// The starting position of the match. + #[inline] + pub fn start(&self) -> usize { + self.end - self.len + } + + /// The ending position of the match. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// The length, in bytes, of the match. + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Returns true if and only if this match is empty. That is, when + /// `start() == end()`. + /// + /// An empty match can only be returned when the empty string was among + /// the patterns used to build the Aho-Corasick automaton. + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + #[inline] + fn increment(&self, by: usize) -> Match { + Match { pattern: self.pattern, len: self.len, end: self.end + by } + } + + #[inline] + fn from_span(id: usize, start: usize, end: usize) -> Match { + Match { pattern: id, len: end - start, end } + } +} diff --git a/src/nfa.rs b/src/nfa.rs new file mode 100644 index 0000000..05c5cfb --- /dev/null +++ b/src/nfa.rs @@ -0,0 +1,1214 @@ +use std::cmp; +use std::collections::{BTreeSet, VecDeque}; +use std::fmt; +use std::mem::size_of; +use std::ops::{Index, IndexMut}; + +use crate::ahocorasick::MatchKind; +use crate::automaton::Automaton; +use crate::classes::{ByteClassBuilder, ByteClasses}; +use crate::error::Result; +use crate::prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj}; +use crate::state_id::{dead_id, fail_id, usize_to_state_id, StateID}; +use crate::Match; + +/// The identifier for a pattern, which is simply the position of the pattern +/// in the sequence of patterns given by the caller. +pub type PatternID = usize; + +/// The length of a pattern, in bytes. +pub type PatternLength = usize; + +/// An Aho-Corasick automaton, represented as an NFA. +/// +/// This is the classical formulation of Aho-Corasick, which involves building +/// up a prefix trie of a given set of patterns, and then wiring up failure +/// transitions between states in order to guarantee linear time matching. The +/// standard formulation is, technically, an NFA because of these failure +/// transitions. That is, one can see them as enabling the automaton to be in +/// multiple states at once. Indeed, during search, it is possible to check +/// the transitions on multiple states for a single input byte. +/// +/// This particular implementation not only supports the standard style of +/// matching, but also provides a mode for choosing leftmost-first or +/// leftmost-longest match semantics. When a leftmost mode is chosen, some +/// failure transitions that would otherwise be added are elided. See +/// the documentation of `MatchKind` for more details and examples on how the +/// match semantics may differ. 
+/// +/// If one wants a DFA, then it is necessary to first build an NFA and convert +/// it into a DFA. Note, however, that because we've constrained ourselves to +/// matching literal patterns, this does not need to use subset construction +/// for determinization. Instead, the DFA has at most a number of states +/// equivalent to the number of NFA states. The only real difference between +/// them is that all failure transitions are followed and pre-computed. This +/// uses much more memory, but also executes searches more quickly. +#[derive(Clone)] +pub struct NFA { + /// The match semantics built into this NFA. + match_kind: MatchKind, + /// The start state id as an index into `states`. + start_id: S, + /// The length, in bytes, of the longest pattern in this automaton. This + /// information is useful for keeping correct buffer sizes when searching + /// on streams. + max_pattern_len: usize, + /// The total number of patterns added to this automaton, including + /// patterns that may never be matched. + pattern_count: usize, + /// The number of bytes of heap used by this NFA's transition table. + heap_bytes: usize, + /// A prefilter for quickly skipping to candidate matches, if pertinent. + prefilter: Option, + /// Whether this automaton anchors all matches to the start of input. + anchored: bool, + /// A set of equivalence classes in terms of bytes. We compute this while + /// building the NFA, but don't use it in the NFA's states. Instead, we + /// use this for building the DFA. We store it on the NFA since it's easy + /// to compute while visiting the patterns. + byte_classes: ByteClasses, + /// A set of states. Each state defines its own transitions, a fail + /// transition and a set of indices corresponding to matches. + /// + /// The first state is always the fail state, which is used only as a + /// sentinel. Namely, in the final NFA, no transition into the fail state + /// exists. (Well, they do, but they aren't followed. Instead, the state's + /// failure transition is followed.) + /// + /// The second state (index 1) is always the dead state. Dead states are + /// in every automaton, but only used when leftmost-{first,longest} match + /// semantics are enabled. Specifically, they instruct search to stop + /// at specific points in order to report the correct match location. In + /// the standard Aho-Corasick construction, there are no transitions to + /// the dead state. + /// + /// The third state (index 2) is generally intended to be the starting or + /// "root" state. + states: Vec>, +} + +impl NFA { + /// Returns the equivalence classes of bytes found while constructing + /// this NFA. + /// + /// Note that the NFA doesn't actually make use of these equivalence + /// classes. Instead, these are useful for building the DFA when desired. + pub fn byte_classes(&self) -> &ByteClasses { + &self.byte_classes + } + + /// Returns a prefilter, if one exists. + pub fn prefilter_obj(&self) -> Option<&PrefilterObj> { + self.prefilter.as_ref() + } + + /// Returns the total number of heap bytes used by this NFA's transition + /// table. + pub fn heap_bytes(&self) -> usize { + self.heap_bytes + + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()) + } + + /// Return the length of the longest pattern in this automaton. + pub fn max_pattern_len(&self) -> usize { + self.max_pattern_len + } + + /// Return the total number of patterns added to this automaton. + pub fn pattern_count(&self) -> usize { + self.pattern_count + } + + /// Returns the total number of states in this NFA. 
+ pub fn state_len(&self) -> usize { + self.states.len() + } + + /// Returns the matches for the given state. + pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] { + &self.states[id.to_usize()].matches + } + + /// Returns an iterator over all transitions in the given state according + /// to the given equivalence classes, including transitions to `fail_id()`. + /// The number of transitions returned is always equivalent to the number + /// of equivalence classes. + pub fn iter_all_transitions( + &self, + byte_classes: &ByteClasses, + id: S, + f: F, + ) { + self.states[id.to_usize()].trans.iter_all(byte_classes, f); + } + + /// Returns the failure transition for the given state. + pub fn failure_transition(&self, id: S) -> S { + self.states[id.to_usize()].fail + } + + /// Returns the next state for the given state and input byte. + /// + /// Note that this does not follow failure transitions. As such, the id + /// returned may be `fail_id`. + pub fn next_state(&self, current: S, input: u8) -> S { + self.states[current.to_usize()].next_state(input) + } + + fn state(&self, id: S) -> &State { + &self.states[id.to_usize()] + } + + fn state_mut(&mut self, id: S) -> &mut State { + &mut self.states[id.to_usize()] + } + + fn start(&self) -> &State { + self.state(self.start_id) + } + + fn start_mut(&mut self) -> &mut State { + let id = self.start_id; + self.state_mut(id) + } + + fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut<'_, S> { + IterTransitionsMut::new(self, id) + } + + fn copy_matches(&mut self, src: S, dst: S) { + let (src, dst) = + get_two_mut(&mut self.states, src.to_usize(), dst.to_usize()); + dst.matches.extend_from_slice(&src.matches); + } + + fn copy_empty_matches(&mut self, dst: S) { + let start_id = self.start_id; + self.copy_matches(start_id, dst); + } + + fn add_dense_state(&mut self, depth: usize) -> Result { + let trans = Transitions::Dense(Dense::new()); + let id = usize_to_state_id(self.states.len())?; + self.states.push(State { + trans, + // Anchored automatons do not have any failure transitions. + fail: if self.anchored { dead_id() } else { self.start_id }, + depth, + matches: vec![], + }); + Ok(id) + } + + fn add_sparse_state(&mut self, depth: usize) -> Result { + let trans = Transitions::Sparse(vec![]); + let id = usize_to_state_id(self.states.len())?; + self.states.push(State { + trans, + // Anchored automatons do not have any failure transitions. + fail: if self.anchored { dead_id() } else { self.start_id }, + depth, + matches: vec![], + }); + Ok(id) + } +} + +impl Automaton for NFA { + type ID = S; + + fn match_kind(&self) -> &MatchKind { + &self.match_kind + } + + fn anchored(&self) -> bool { + self.anchored + } + + fn prefilter(&self) -> Option<&dyn Prefilter> { + self.prefilter.as_ref().map(|p| p.as_ref()) + } + + fn start_state(&self) -> S { + self.start_id + } + + fn is_valid(&self, id: S) -> bool { + id.to_usize() < self.states.len() + } + + fn is_match_state(&self, id: S) -> bool { + self.states[id.to_usize()].is_match() + } + + fn get_match( + &self, + id: S, + match_index: usize, + end: usize, + ) -> Option { + let state = match self.states.get(id.to_usize()) { + None => return None, + Some(state) => state, + }; + state.matches.get(match_index).map(|&(id, len)| Match { + pattern: id, + len, + end, + }) + } + + fn match_count(&self, id: S) -> usize { + self.states[id.to_usize()].matches.len() + } + + fn next_state(&self, mut current: S, input: u8) -> S { + // This terminates since: + // + // 1. 
`State.fail` never points to fail_id(). + // 2. All `State.fail` values point to a state closer to `start`. + // 3. The start state has no transitions to fail_id(). + loop { + let state = &self.states[current.to_usize()]; + let next = state.next_state(input); + if next != fail_id() { + return next; + } + current = state.fail; + } + } +} + +/// A representation of an NFA state for an Aho-Corasick automaton. +/// +/// It contains the transitions to the next state, a failure transition for +/// cases where there exists no other transition for the current input byte, +/// the matches implied by visiting this state (if any) and the depth of this +/// state. The depth of a state is simply the distance from it to the start +/// state in the automaton, where the depth of the start state is 0. +#[derive(Clone, Debug)] +pub struct State { + trans: Transitions, + fail: S, + matches: Vec<(PatternID, PatternLength)>, + // TODO: Strictly speaking, this isn't needed for searching. It's only + // used when building an NFA that supports leftmost match semantics. We + // could drop this from the state and dynamically build a map only when + // computing failure transitions, but it's not clear which is better. + // Benchmark this. + depth: usize, +} + +impl State { + fn heap_bytes(&self) -> usize { + self.trans.heap_bytes() + + (self.matches.len() * size_of::<(PatternID, PatternLength)>()) + } + + fn add_match(&mut self, i: PatternID, len: PatternLength) { + self.matches.push((i, len)); + } + + fn is_match(&self) -> bool { + !self.matches.is_empty() + } + + fn next_state(&self, input: u8) -> S { + self.trans.next_state(input) + } + + fn set_next_state(&mut self, input: u8, next: S) { + self.trans.set_next_state(input, next); + } +} + +/// Represents the transitions for a single dense state. +/// +/// The primary purpose here is to encapsulate index access. Namely, since a +/// dense representation always contains 256 elements, all values of `u8` are +/// valid indices. +#[derive(Clone, Debug)] +struct Dense(Vec); + +impl Dense +where + S: StateID, +{ + fn new() -> Self { + Dense(vec![fail_id(); 256]) + } + + #[inline] + fn len(&self) -> usize { + self.0.len() + } +} + +impl Index for Dense { + type Output = S; + + #[inline] + fn index(&self, i: u8) -> &S { + // SAFETY: This is safe because all dense transitions have + // exactly 256 elements, so all u8 values are valid indices. + &self.0[i as usize] + } +} + +impl IndexMut for Dense { + #[inline] + fn index_mut(&mut self, i: u8) -> &mut S { + // SAFETY: This is safe because all dense transitions have + // exactly 256 elements, so all u8 values are valid indices. + &mut self.0[i as usize] + } +} + +/// A representation of transitions in an NFA. +/// +/// Transitions have either a sparse representation, which is slower for +/// lookups but uses less memory, or a dense representation, which is faster +/// for lookups but uses more memory. In the sparse representation, the absence +/// of a state implies a transition to `fail_id()`. Transitions to `dead_id()` +/// are still explicitly represented. +/// +/// For the NFA, by default, we use a dense representation for transitions for +/// states close to the start state because it's likely these are the states +/// that will be most frequently visited. 
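+///
+/// As an illustration only (the state IDs `s3` and `s5` here are
+/// hypothetical and not taken from a real automaton), a state whose only
+/// explicit transitions are on the bytes `a` and `c` could be stored in
+/// either representation:
+///
+/// ```ignore
+/// // Sparse: only the defined transitions are stored; any byte that is
+/// // absent implicitly transitions to fail_id().
+/// Transitions::Sparse(vec![(b'a', s3), (b'c', s5)])
+///
+/// // Dense: all 256 entries are stored, so a lookup is a single index
+/// // operation; undefined bytes explicitly map to fail_id().
+/// Transitions::Dense(dense) // where dense[b'a'] == s3 and dense[b'c'] == s5
+/// ```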
+#[derive(Clone, Debug)] +enum Transitions { + Sparse(Vec<(u8, S)>), + Dense(Dense), +} + +impl Transitions { + fn heap_bytes(&self) -> usize { + match *self { + Transitions::Sparse(ref sparse) => { + sparse.len() * size_of::<(u8, S)>() + } + Transitions::Dense(ref dense) => dense.len() * size_of::(), + } + } + + fn next_state(&self, input: u8) -> S { + match *self { + Transitions::Sparse(ref sparse) => { + for &(b, id) in sparse { + if b == input { + return id; + } + } + fail_id() + } + Transitions::Dense(ref dense) => dense[input], + } + } + + fn set_next_state(&mut self, input: u8, next: S) { + match *self { + Transitions::Sparse(ref mut sparse) => { + match sparse.binary_search_by_key(&input, |&(b, _)| b) { + Ok(i) => sparse[i] = (input, next), + Err(i) => sparse.insert(i, (input, next)), + } + } + Transitions::Dense(ref mut dense) => { + dense[input] = next; + } + } + } + + /// Iterate over transitions in this state while skipping over transitions + /// to `fail_id()`. + fn iter(&self, mut f: F) { + match *self { + Transitions::Sparse(ref sparse) => { + for &(b, id) in sparse { + f(b, id); + } + } + Transitions::Dense(ref dense) => { + for b in AllBytesIter::new() { + let id = dense[b]; + if id != fail_id() { + f(b, id); + } + } + } + } + } + + /// Iterate over all transitions in this state according to the given + /// equivalence classes, including transitions to `fail_id()`. + fn iter_all(&self, classes: &ByteClasses, mut f: F) { + if classes.is_singleton() { + match *self { + Transitions::Sparse(ref sparse) => { + sparse_iter(sparse, f); + } + Transitions::Dense(ref dense) => { + for b in AllBytesIter::new() { + f(b, dense[b]); + } + } + } + } else { + // In this case, we only want to yield a single byte for each + // equivalence class. + match *self { + Transitions::Sparse(ref sparse) => { + let mut last_class = None; + sparse_iter(sparse, |b, next| { + let class = classes.get(b); + if last_class != Some(class) { + last_class = Some(class); + f(b, next); + } + }) + } + Transitions::Dense(ref dense) => { + for b in classes.representatives() { + f(b, dense[b]); + } + } + } + } + } +} + +/// Iterator over transitions in a state, skipping transitions to `fail_id()`. +/// +/// This abstracts over the representation of NFA transitions, which may be +/// either in a sparse or dense representation. +/// +/// This somewhat idiosyncratically borrows the NFA mutably, so that when one +/// is iterating over transitions, the caller can still mutate the NFA. This +/// is useful when creating failure transitions. +#[derive(Debug)] +struct IterTransitionsMut<'a, S: StateID> { + nfa: &'a mut NFA, + state_id: S, + cur: usize, +} + +impl<'a, S: StateID> IterTransitionsMut<'a, S> { + fn new(nfa: &'a mut NFA, state_id: S) -> IterTransitionsMut<'a, S> { + IterTransitionsMut { nfa, state_id, cur: 0 } + } + + fn nfa(&mut self) -> &mut NFA { + self.nfa + } +} + +impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> { + type Item = (u8, S); + + fn next(&mut self) -> Option<(u8, S)> { + match self.nfa.states[self.state_id.to_usize()].trans { + Transitions::Sparse(ref sparse) => { + if self.cur >= sparse.len() { + return None; + } + let i = self.cur; + self.cur += 1; + Some(sparse[i]) + } + Transitions::Dense(ref dense) => { + while self.cur < dense.len() { + // There are always exactly 255 transitions in dense repr. 
+ debug_assert!(self.cur < 256); + + let b = self.cur as u8; + let id = dense[b]; + self.cur += 1; + if id != fail_id() { + return Some((b, id)); + } + } + None + } + } + } +} + +/// A simple builder for configuring the NFA construction of Aho-Corasick. +#[derive(Clone, Debug)] +pub struct Builder { + dense_depth: usize, + match_kind: MatchKind, + prefilter: bool, + anchored: bool, + ascii_case_insensitive: bool, +} + +impl Default for Builder { + fn default() -> Builder { + Builder { + dense_depth: 2, + match_kind: MatchKind::default(), + prefilter: true, + anchored: false, + ascii_case_insensitive: false, + } + } +} + +impl Builder { + pub fn new() -> Builder { + Builder::default() + } + + pub fn build(&self, patterns: I) -> Result> + where + I: IntoIterator, + P: AsRef<[u8]>, + { + Compiler::new(self)?.compile(patterns) + } + + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { + self.match_kind = kind; + self + } + + pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { + self.dense_depth = depth; + self + } + + pub fn prefilter(&mut self, yes: bool) -> &mut Builder { + self.prefilter = yes; + self + } + + pub fn anchored(&mut self, yes: bool) -> &mut Builder { + self.anchored = yes; + self + } + + pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { + self.ascii_case_insensitive = yes; + self + } +} + +/// A compiler uses a builder configuration and builds up the NFA formulation +/// of an Aho-Corasick automaton. This roughly corresponds to the standard +/// formulation described in textbooks. +#[derive(Debug)] +struct Compiler<'a, S: StateID> { + builder: &'a Builder, + prefilter: prefilter::Builder, + nfa: NFA, + byte_classes: ByteClassBuilder, +} + +impl<'a, S: StateID> Compiler<'a, S> { + fn new(builder: &'a Builder) -> Result> { + Ok(Compiler { + builder, + prefilter: prefilter::Builder::new(builder.match_kind) + .ascii_case_insensitive(builder.ascii_case_insensitive), + nfa: NFA { + match_kind: builder.match_kind, + start_id: usize_to_state_id(2)?, + max_pattern_len: 0, + pattern_count: 0, + heap_bytes: 0, + prefilter: None, + anchored: builder.anchored, + byte_classes: ByteClasses::singletons(), + states: vec![], + }, + byte_classes: ByteClassBuilder::new(), + }) + } + + fn compile(mut self, patterns: I) -> Result> + where + I: IntoIterator, + P: AsRef<[u8]>, + { + self.add_state(0)?; // the fail state, which is never entered + self.add_state(0)?; // the dead state, only used for leftmost + self.add_state(0)?; // the start state + self.build_trie(patterns)?; + self.add_start_state_loop(); + self.add_dead_state_loop(); + if !self.builder.anchored { + self.fill_failure_transitions(); + } + self.close_start_state_loop(); + self.nfa.byte_classes = self.byte_classes.build(); + if !self.builder.anchored { + self.nfa.prefilter = self.prefilter.build(); + } + self.calculate_size(); + Ok(self.nfa) + } + + /// This sets up the initial prefix trie that makes up the Aho-Corasick + /// automaton. Effectively, it creates the basic structure of the + /// automaton, where every pattern given has a path from the start state to + /// the end of the pattern. 
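+ ///
+ /// As a rough sketch (state numbers here are illustrative only; the real
+ /// automaton also contains the fail and dead states described above), the
+ /// patterns `ab` and `ac` yield a trie along these lines, with match
+ /// states marked by `*`:
+ ///
+ /// ```ignore
+ ///               b - S2*
+ ///              /
+ ///  S0 - a - S1
+ ///              \
+ ///               c - S3*
+ /// ```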
+ fn build_trie(&mut self, patterns: I) -> Result<()> + where + I: IntoIterator, + P: AsRef<[u8]>, + { + 'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() { + let pat = pat.as_ref(); + self.nfa.max_pattern_len = + cmp::max(self.nfa.max_pattern_len, pat.len()); + self.nfa.pattern_count += 1; + + let mut prev = self.nfa.start_id; + let mut saw_match = false; + for (depth, &b) in pat.iter().enumerate() { + // When leftmost-first match semantics are requested, we + // specifically stop adding patterns when a previously added + // pattern is a prefix of it. We avoid adding it because + // leftmost-first semantics imply that the pattern can never + // match. This is not just an optimization to save space! It + // is necessary for correctness. In fact, this is the only + // difference in the automaton between the implementations for + // leftmost-first and leftmost-longest. + saw_match = saw_match || self.nfa.state(prev).is_match(); + if self.builder.match_kind.is_leftmost_first() && saw_match { + // Skip to the next pattern immediately. This avoids + // incorrectly adding a match after this loop terminates. + continue 'PATTERNS; + } + + // Add this byte to our equivalence classes. We don't use these + // for NFA construction. These are instead used only if we're + // building a DFA. They would technically be useful for the + // NFA, but it would require a second pass over the patterns. + self.byte_classes.set_range(b, b); + if self.builder.ascii_case_insensitive { + let b = opposite_ascii_case(b); + self.byte_classes.set_range(b, b); + } + + // If the transition from prev using the current byte already + // exists, then just move through it. Otherwise, add a new + // state. We track the depth here so that we can determine + // how to represent transitions. States near the start state + // use a dense representation that uses more memory but is + // faster. Other states use a sparse representation that uses + // less memory but is slower. + let next = self.nfa.state(prev).next_state(b); + if next != fail_id() { + prev = next; + } else { + let next = self.add_state(depth + 1)?; + self.nfa.state_mut(prev).set_next_state(b, next); + if self.builder.ascii_case_insensitive { + let b = opposite_ascii_case(b); + self.nfa.state_mut(prev).set_next_state(b, next); + } + prev = next; + } + } + // Once the pattern has been added, log the match in the final + // state that it reached. + self.nfa.state_mut(prev).add_match(pati, pat.len()); + // ... and hand it to the prefilter builder, if applicable. + if self.builder.prefilter { + self.prefilter.add(pat); + } + } + Ok(()) + } + + /// This routine creates failure transitions according to the standard + /// textbook formulation of the Aho-Corasick algorithm, with a couple small + /// tweaks to support "leftmost" semantics. + /// + /// Building failure transitions is the most interesting part of building + /// the Aho-Corasick automaton, because they are what allow searches to + /// be performed in linear time. Specifically, a failure transition is + /// a single transition associated with each state that points back to + /// the longest proper suffix of the pattern being searched. The failure + /// transition is followed whenever there exists no transition on the + /// current state for the current input byte. If there is no other proper + /// suffix, then the failure transition points back to the starting state. + /// + /// For example, let's say we built an Aho-Corasick automaton with the + /// following patterns: 'abcd' and 'cef'. 
The trie looks like this: + /// + /// ```ignore + /// a - S1 - b - S2 - c - S3 - d - S4* + /// / + /// S0 - c - S5 - e - S6 - f - S7* + /// ``` + /// + /// At this point, it should be fairly straight-forward to see how this + /// trie can be used in a simplistic way. At any given position in the + /// text we're searching (called the "subject" string), all we need to do + /// is follow the transitions in the trie by consuming one transition for + /// each byte in the subject string. If we reach a match state, then we can + /// report that location as a match. + /// + /// The trick comes when searching a subject string like 'abcef'. We'll + /// initially follow the transition from S0 to S1 and wind up in S3 after + /// observng the 'c' byte. At this point, the next byte is 'e' but state + /// S3 has no transition for 'e', so the search fails. We then would need + /// to restart the search at the next position in 'abcef', which + /// corresponds to 'b'. The match would fail, but the next search starting + /// at 'c' would finally succeed. The problem with this approach is that + /// we wind up searching the subject string potentially many times. In + /// effect, this makes the algorithm have worst case `O(n * m)` complexity, + /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead + /// like to achieve a `O(n + m)` worst case complexity. + /// + /// This is where failure transitions come in. Instead of dying at S3 in + /// the first search, the automaton can instruct the search to move to + /// another part of the automaton that corresponds to a suffix of what + /// we've seen so far. Recall that we've seen 'abc' in the subject string, + /// and the automaton does indeed have a non-empty suffix, 'c', that could + /// potentially lead to another match. Thus, the actual Aho-Corasick + /// automaton for our patterns in this case looks like this: + /// + /// ```ignore + /// a - S1 - b - S2 - c - S3 - d - S4* + /// / / + /// / ---------------- + /// / / + /// S0 - c - S5 - e - S6 - f - S7* + /// ``` + /// + /// That is, we have a failure transition from S3 to S5, which is followed + /// exactly in cases when we are in state S3 but see any byte other than + /// 'd' (that is, we've "failed" to find a match in this portion of our + /// trie). We know we can transition back to S5 because we've already seen + /// a 'c' byte, so we don't need to re-scan it. We can then pick back up + /// with the search starting at S5 and complete our match. + /// + /// Adding failure transitions to a trie is fairly simple, but subtle. The + /// key issue is that you might have multiple failure transition that you + /// need to follow. For example, look at the trie for the patterns + /// 'abcd', 'b', 'bcd' and 'cd': + /// + /// ```ignore + /// - a - S1 - b - S2* - c - S3 - d - S4* + /// / / / + /// / ------- ------- + /// / / / + /// S0 --- b - S5* - c - S6 - d - S7* + /// \ / + /// \ -------- + /// \ / + /// - c - S8 - d - S9* + /// ``` + /// + /// The failure transitions for this trie are defined from S2 to S5, + /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it + /// corresponds to a match, since its failure transition to S5 is itself + /// a match state. + /// + /// Perhaps simplest way to think about adding these failure transitions + /// is recursively. 
That is, if you know the failure transitions for every + /// possible previous state that could be visited (e.g., when computing the + /// failure transition for S3, you already know the failure transitions + /// for S0, S1 and S2), then you can simply follow the failure transition + /// of the previous state and check whether the incoming transition is + /// defined after following the failure transition. + /// + /// For example, when determining the failure state for S3, by our + /// assumptions, we already know that there is a failure transition from + /// S2 (the previous state) to S5. So we follow that transition and check + /// whether the transition connecting S2 to S3 is defined. Indeed, it is, + /// as there is a transition from S5 to S6 for the byte 'c'. If no such + /// transition existed, we could keep following the failure transitions + /// until we reach the start state, which is the failure transition for + /// every state that has no corresponding proper suffix. + /// + /// We don't actually use recursion to implement this, but instead, use a + /// breadth first search of the automaton. Our base case is the start + /// state, whose failure transition is just a transition to itself. + /// + /// When building a leftmost automaton, we proceed as above, but only + /// include a subset of failure transitions. Namely, we omit any failure + /// transitions that appear after a match state in the trie. This is + /// because failure transitions always point back to a proper suffix of + /// what has been seen so far. Thus, following a failure transition after + /// a match implies looking for a match that starts after the one that has + /// already been seen, which is of course therefore not the leftmost match. + /// + /// N.B. I came up with this algorithm on my own, and after scouring all of + /// the other AC implementations I know of (Perl, Snort, many on GitHub). + /// I couldn't find any that implement leftmost semantics like this. + /// Perl of course needs leftmost-first semantics, but they implement it + /// with a seeming hack at *search* time instead of encoding it into the + /// automaton. There are also a couple Java libraries that support leftmost + /// longest semantics, but they do it by building a queue of matches at + /// search time, which is even worse than what Perl is doing. ---AG + fn fill_failure_transitions(&mut self) { + let kind = self.match_kind(); + // Initialize the queue for breadth first search with all transitions + // out of the start state. We handle the start state specially because + // we only want to follow non-self transitions. If we followed self + // transitions, then this would never terminate. + let mut queue = VecDeque::new(); + let mut seen = self.queued_set(); + let mut it = self.nfa.iter_transitions_mut(self.nfa.start_id); + while let Some((_, next)) = it.next() { + // Skip anything we've seen before and any self-transitions on the + // start state. + if next == it.nfa().start_id || seen.contains(next) { + continue; + } + queue.push_back(next); + seen.insert(next); + // Under leftmost semantics, if a state immediately following + // the start state is a match state, then we never want to + // follow its failure transition since the failure transition + // necessarily leads back to the start state, which we never + // want to do for leftmost matching after a match has been + // found. + // + // We apply the same logic to non-start states below as well. 
+ if kind.is_leftmost() && it.nfa().state(next).is_match() { + it.nfa().state_mut(next).fail = dead_id(); + } + } + while let Some(id) = queue.pop_front() { + let mut it = self.nfa.iter_transitions_mut(id); + while let Some((b, next)) = it.next() { + if seen.contains(next) { + // The only way to visit a duplicate state in a transition + // list is when ASCII case insensitivity is enabled. In + // this case, we want to skip it since it's redundant work. + // But it would also end up duplicating matches, which + // results in reporting duplicate matches in some cases. + // See the 'acasei010' regression test. + continue; + } + queue.push_back(next); + seen.insert(next); + + // As above for start states, under leftmost semantics, once + // we see a match all subsequent states should have no failure + // transitions because failure transitions always imply looking + // for a match that is a suffix of what has been seen so far + // (where "seen so far" corresponds to the string formed by + // following the transitions from the start state to the + // current state). Under leftmost semantics, we specifically do + // not want to allow this to happen because we always want to + // report the match found at the leftmost position. + // + // The difference between leftmost-first and leftmost-longest + // occurs previously while we build the trie. For + // leftmost-first, we simply omit any entries that would + // otherwise require passing through a match state. + // + // Note that for correctness, the failure transition has to be + // set to the dead state for ALL states following a match, not + // just the match state itself. However, by setting the failure + // transition to the dead state on all match states, the dead + // state will automatically propagate to all subsequent states + // via the failure state computation below. + if kind.is_leftmost() && it.nfa().state(next).is_match() { + it.nfa().state_mut(next).fail = dead_id(); + continue; + } + let mut fail = it.nfa().state(id).fail; + while it.nfa().state(fail).next_state(b) == fail_id() { + fail = it.nfa().state(fail).fail; + } + fail = it.nfa().state(fail).next_state(b); + it.nfa().state_mut(next).fail = fail; + it.nfa().copy_matches(fail, next); + } + // If the start state is a match state, then this automaton can + // match the empty string. This implies all states are match states + // since every position matches the empty string, so copy the + // matches from the start state to every state. Strictly speaking, + // this is only necessary for overlapping matches since each + // non-empty non-start match state needs to report empty matches + // in addition to its own. For the non-overlapping case, such + // states only report the first match, which is never empty since + // it isn't a start state. + if !kind.is_leftmost() { + it.nfa().copy_empty_matches(id); + } + } + } + + /// Returns a set that tracked queued states. + /// + /// This is only necessary when ASCII case insensitivity is enabled, since + /// it is the only way to visit the same state twice. Otherwise, this + /// returns an inert set that nevers adds anything and always reports + /// `false` for every member test. + fn queued_set(&self) -> QueuedSet { + if self.builder.ascii_case_insensitive { + QueuedSet::active() + } else { + QueuedSet::inert() + } + } + + /// Set the failure transitions on the start state to loop back to the + /// start state. This effectively permits the Aho-Corasick automaton to + /// match at any position. 
This is also required for finding the next + /// state to terminate, namely, finding the next state should never return + /// a fail_id. + /// + /// This must be done after building the initial trie, since trie + /// construction depends on transitions to `fail_id` to determine whether a + /// state already exists or not. + fn add_start_state_loop(&mut self) { + let start_id = self.nfa.start_id; + let start = self.nfa.start_mut(); + for b in AllBytesIter::new() { + if start.next_state(b) == fail_id() { + start.set_next_state(b, start_id); + } + } + } + + /// Remove the start state loop by rewriting any transitions on the start + /// state back to the start state with transitions to the dead state. + /// + /// The loop is only closed when two conditions are met: the start state + /// is a match state and the match kind is leftmost-first or + /// leftmost-longest. (Alternatively, if this is an anchored automaton, + /// then the start state is always closed, regardless of aforementioned + /// conditions.) + /// + /// The reason for this is that under leftmost semantics, a start state + /// that is also a match implies that we should never restart the search + /// process. We allow normal transitions out of the start state, but if + /// none exist, we transition to the dead state, which signals that + /// searching should stop. + fn close_start_state_loop(&mut self) { + if self.builder.anchored + || (self.match_kind().is_leftmost() && self.nfa.start().is_match()) + { + let start_id = self.nfa.start_id; + let start = self.nfa.start_mut(); + for b in AllBytesIter::new() { + if start.next_state(b) == start_id { + start.set_next_state(b, dead_id()); + } + } + } + } + + /// Sets all transitions on the dead state to point back to the dead state. + /// Normally, missing transitions map back to the failure state, but the + /// point of the dead state is to act as a sink that can never be escaped. + fn add_dead_state_loop(&mut self) { + let dead = self.nfa.state_mut(dead_id()); + for b in AllBytesIter::new() { + dead.set_next_state(b, dead_id()); + } + } + + /// Computes the total amount of heap used by this NFA in bytes. + fn calculate_size(&mut self) { + let mut size = 0; + for state in &self.nfa.states { + size += size_of::>() + state.heap_bytes(); + } + self.nfa.heap_bytes = size; + } + + /// Add a new state to the underlying NFA with the given depth. The depth + /// is used to determine how to represent the transitions. + /// + /// If adding the new state would overflow the chosen state ID + /// representation, then this returns an error. + fn add_state(&mut self, depth: usize) -> Result { + if depth < self.builder.dense_depth { + self.nfa.add_dense_state(depth) + } else { + self.nfa.add_sparse_state(depth) + } + } + + /// Returns the match kind configured on the underlying builder. + fn match_kind(&self) -> MatchKind { + self.builder.match_kind + } +} + +/// A set of state identifiers used to avoid revisiting the same state multiple +/// times when filling in failure transitions. +/// +/// This set has an "inert" and an "active" mode. When inert, the set never +/// stores anything and always returns `false` for every member test. This is +/// useful to avoid the performance and memory overhead of maintaining this +/// set when it is not needed. +#[derive(Debug)] +struct QueuedSet { + set: Option>, +} + +impl QueuedSet { + /// Return an inert set that returns `false` for every state ID membership + /// test. 
+ fn inert() -> QueuedSet { + QueuedSet { set: None } + } + + /// Return an active set that tracks state ID membership. + fn active() -> QueuedSet { + QueuedSet { set: Some(BTreeSet::new()) } + } + + /// Inserts the given state ID into this set. (If the set is inert, then + /// this is a no-op.) + fn insert(&mut self, state_id: S) { + if let Some(ref mut set) = self.set { + set.insert(state_id); + } + } + + /// Returns true if and only if the given state ID is in this set. If the + /// set is inert, this always returns false. + fn contains(&self, state_id: S) -> bool { + match self.set { + None => false, + Some(ref set) => set.contains(&state_id), + } + } +} + +/// An iterator over every byte value. +/// +/// We use this instead of (0..256).map(|b| b as u8) because this optimizes +/// better in debug builds. +/// +/// We also use this instead of 0..=255 because we're targeting Rust 1.24 and +/// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this +/// once our MSRV is Rust 1.26 or newer. +#[derive(Debug)] +struct AllBytesIter(u16); + +impl AllBytesIter { + fn new() -> AllBytesIter { + AllBytesIter(0) + } +} + +impl Iterator for AllBytesIter { + type Item = u8; + + fn next(&mut self) -> Option { + if self.0 >= 256 { + None + } else { + let b = self.0 as u8; + self.0 += 1; + Some(b) + } + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "NFA(")?; + writeln!(f, "match_kind: {:?}", self.match_kind)?; + writeln!(f, "prefilter: {:?}", self.prefilter)?; + writeln!(f, "{}", "-".repeat(79))?; + for (id, s) in self.states.iter().enumerate() { + let mut trans = vec![]; + s.trans.iter(|byte, next| { + // The start state has a bunch of uninteresting transitions + // back into itself. It's questionable to hide them since they + // are critical to understanding the automaton, but they are + // very noisy without better formatting for contiugous ranges + // to the same state. + if id == self.start_id.to_usize() && next == self.start_id { + return; + } + // Similarly, the dead state has a bunch of uninteresting + // transitions too. + if id == dead_id() { + return; + } + trans.push(format!("{} => {}", escape(byte), next.to_usize())); + }); + writeln!(f, "{:04}: {}", id, trans.join(", "))?; + + let matches: Vec = s + .matches + .iter() + .map(|&(pattern_id, _)| pattern_id.to_string()) + .collect(); + writeln!(f, " matches: {}", matches.join(", "))?; + writeln!(f, " fail: {}", s.fail.to_usize())?; + writeln!(f, " depth: {}", s.depth)?; + } + writeln!(f, "{}", "-".repeat(79))?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// Iterate over all possible byte transitions given a sparse set. +fn sparse_iter(trans: &[(u8, S)], mut f: F) { + let mut byte = 0u16; + for &(b, id) in trans { + while byte < (b as u16) { + f(byte as u8, fail_id()); + byte += 1; + } + f(b, id); + byte += 1; + } + for b in byte..256 { + f(b as u8, fail_id()); + } +} + +/// Safely return two mutable borrows to two different locations in the given +/// slice. +/// +/// This panics if i == j. +fn get_two_mut(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) { + assert!(i != j, "{} must not be equal to {}", i, j); + if i < j { + let (before, after) = xs.split_at_mut(j); + (&mut before[i], &mut after[0]) + } else { + let (before, after) = xs.split_at_mut(i); + (&mut after[0], &mut before[j]) + } +} + +/// Return the given byte as its escaped string form. 
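+///
+/// For example, `escape(b'a')` returns `"a"`, while `escape(b'\n')` returns
+/// the two-character string `\n` (a backslash followed by `n`), matching the
+/// behavior of `std::ascii::escape_default`.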
+fn escape(b: u8) -> String { + use std::ascii; + + String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scratch() { + let nfa: NFA = Builder::new() + .dense_depth(0) + // .match_kind(MatchKind::LeftmostShortest) + // .match_kind(MatchKind::LeftmostLongest) + .match_kind(MatchKind::LeftmostFirst) + // .build(&["abcd", "ce", "b"]) + // .build(&["ab", "bc"]) + // .build(&["b", "bcd", "ce"]) + // .build(&["abc", "bx"]) + // .build(&["abc", "bd", "ab"]) + // .build(&["abcdefghi", "hz", "abcdefgh"]) + // .build(&["abcd", "bce", "b"]) + .build(&["abcdefg", "bcde", "bcdef"]) + .unwrap(); + println!("{:?}", nfa); + } +} diff --git a/src/packed/api.rs b/src/packed/api.rs new file mode 100644 index 0000000..51703d0 --- /dev/null +++ b/src/packed/api.rs @@ -0,0 +1,625 @@ +use std::u16; + +use crate::packed::pattern::Patterns; +use crate::packed::rabinkarp::RabinKarp; +use crate::packed::teddy::{self, Teddy}; +use crate::Match; + +/// This is a limit placed on the total number of patterns we're willing to try +/// and match at once. As more sophisticated algorithms are added, this number +/// may be increased. +const PATTERN_LIMIT: usize = 128; + +/// A knob for controlling the match semantics of a packed multiple string +/// searcher. +/// +/// This differs from the +/// [`MatchKind`](../enum.MatchKind.html) +/// type in the top-level crate module in that it doesn't support +/// "standard" match semantics, and instead only supports leftmost-first or +/// leftmost-longest. Namely, "standard" semantics cannot be easily supported +/// by packed searchers. +/// +/// For more information on the distinction between leftmost-first and +/// leftmost-longest, see the docs on the top-level `MatchKind` type. +/// +/// Unlike the top-level `MatchKind` type, the default match semantics for this +/// type are leftmost-first. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum MatchKind { + /// Use leftmost-first match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the match + /// corresponding to the pattern that appeared earlier when constructing + /// the automaton is reported. + /// + /// This is the default. + LeftmostFirst, + /// Use leftmost-longest match semantics, which reports leftmost matches. + /// When there are multiple possible leftmost matches, the longest match + /// is chosen. + LeftmostLongest, + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl Default for MatchKind { + fn default() -> MatchKind { + MatchKind::LeftmostFirst + } +} + +/// The configuration for a packed multiple pattern searcher. +/// +/// The configuration is currently limited only to being able to select the +/// match semantics (leftmost-first or leftmost-longest) of a searcher. In the +/// future, more knobs may be made available. +/// +/// A configuration produces a [`packed::Builder`](struct.Builder.html), which +/// in turn can be used to construct a +/// [`packed::Searcher`](struct.Searcher.html) for searching. +/// +/// # Example +/// +/// This example shows how to use leftmost-longest semantics instead of the +/// default (leftmost-first). 
+/// +/// ``` +/// use aho_corasick::packed::{Config, MatchKind}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Config::new() +/// .match_kind(MatchKind::LeftmostLongest) +/// .builder() +/// .add("foo") +/// .add("foobar") +/// .build()?; +/// let matches: Vec = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![1], matches); +/// # Some(()) } +/// # if cfg!(target_arch = "x86_64") { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + kind: MatchKind, + force: Option, + force_teddy_fat: Option, + force_avx: Option, +} + +/// An internal option for forcing the use of a particular packed algorithm. +/// +/// When an algorithm is forced, if a searcher could not be constructed for it, +/// then no searcher will be returned even if an alternative algorithm would +/// work. +#[derive(Clone, Debug)] +enum ForceAlgorithm { + Teddy, + RabinKarp, +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} + +impl Config { + /// Create a new default configuration. A default configuration uses + /// leftmost-first match semantics. + pub fn new() -> Config { + Config { + kind: MatchKind::LeftmostFirst, + force: None, + force_teddy_fat: None, + force_avx: None, + } + } + + /// Create a packed builder from this configuration. The builder can be + /// used to accumulate patterns and create a + /// [`Searcher`](struct.Searcher.html) + /// from them. + pub fn builder(&self) -> Builder { + Builder::from_config(self.clone()) + } + + /// Set the match semantics for this configuration. + pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { + self.kind = kind; + self + } + + /// An undocumented method for forcing the use of the Teddy algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_teddy(&mut self, yes: bool) -> &mut Config { + if yes { + self.force = Some(ForceAlgorithm::Teddy); + } else { + self.force = None; + } + self + } + + /// An undocumented method for forcing the use of the Fat Teddy algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_teddy_fat(&mut self, yes: Option) -> &mut Config { + self.force_teddy_fat = yes; + self + } + + /// An undocumented method for forcing the use of SSE (`Some(false)`) or + /// AVX (`Some(true)`) algorithms. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_avx(&mut self, yes: Option) -> &mut Config { + self.force_avx = yes; + self + } + + /// An undocumented method for forcing the use of the Rabin-Karp algorithm. + /// + /// This is only exposed for more precise testing and benchmarks. Callers + /// should not use it as it is not part of the API stability guarantees of + /// this crate. + #[doc(hidden)] + pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config { + if yes { + self.force = Some(ForceAlgorithm::RabinKarp); + } else { + self.force = None; + } + self + } +} + +/// A builder for constructing a packed searcher from a collection of patterns. 
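+///
+/// Patterns are added with [`add`](struct.Builder.html#method.add) or
+/// [`extend`](struct.Builder.html#method.extend), and a searcher is created
+/// with [`build`](struct.Builder.html#method.build), which returns `None`
+/// when no packed searcher can be constructed for the given patterns (for
+/// example, when there are too many of them).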
+/// +/// # Example +/// +/// This example shows how to use a builder to construct a searcher. By +/// default, leftmost-first match semantics are used. +/// +/// ``` +/// use aho_corasick::packed::{Builder, MatchKind}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Builder::new() +/// .add("foobar") +/// .add("foo") +/// .build()?; +/// let matches: Vec = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![0], matches); +/// # Some(()) } +/// # if cfg!(target_arch = "x86_64") { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + /// The configuration of this builder and subsequent matcher. + config: Config, + /// Set to true if the builder detects that a matcher cannot be built. + inert: bool, + /// The patterns provided by the caller. + patterns: Patterns, +} + +impl Builder { + /// Create a new builder for constructing a multi-pattern searcher. This + /// constructor uses the default configuration. + pub fn new() -> Builder { + Builder::from_config(Config::new()) + } + + fn from_config(config: Config) -> Builder { + Builder { config, inert: false, patterns: Patterns::new() } + } + + /// Build a searcher from the patterns added to this builder so far. + pub fn build(&self) -> Option { + if self.inert || self.patterns.is_empty() { + return None; + } + let mut patterns = self.patterns.clone(); + patterns.set_match_kind(self.config.kind); + let rabinkarp = RabinKarp::new(&patterns); + // Effectively, we only want to return a searcher if we can use Teddy, + // since Teddy is our only fast packed searcher at the moment. + // Rabin-Karp is only used when searching haystacks smaller than what + // Teddy can support. Thus, the only way to get a Rabin-Karp searcher + // is to force it using undocumented APIs (for tests/benchmarks). + let (search_kind, minimum_len) = match self.config.force { + None | Some(ForceAlgorithm::Teddy) => { + let teddy = match self.build_teddy(&patterns) { + None => return None, + Some(teddy) => teddy, + }; + let minimum_len = teddy.minimum_len(); + (SearchKind::Teddy(teddy), minimum_len) + } + Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0), + }; + Some(Searcher { patterns, rabinkarp, search_kind, minimum_len }) + } + + fn build_teddy(&self, patterns: &Patterns) -> Option { + teddy::Builder::new() + .avx(self.config.force_avx) + .fat(self.config.force_teddy_fat) + .build(&patterns) + } + + /// Add the given pattern to this set to match. + /// + /// The order in which patterns are added is significant. Namely, when + /// using leftmost-first match semantics, then when multiple patterns can + /// match at a particular location, the pattern that was added first is + /// used as the match. + /// + /// If the number of patterns added exceeds the amount supported by packed + /// searchers, then the builder will stop accumulating patterns and render + /// itself inert. At this point, constructing a searcher will always return + /// `None`. + pub fn add>(&mut self, pattern: P) -> &mut Builder { + if self.inert { + return self; + } else if self.patterns.len() >= PATTERN_LIMIT { + self.inert = true; + self.patterns.reset(); + return self; + } + // Just in case PATTERN_LIMIT increases beyond u16::MAX. 
+ assert!(self.patterns.len() <= u16::MAX as usize); + + let pattern = pattern.as_ref(); + if pattern.is_empty() { + self.inert = true; + self.patterns.reset(); + return self; + } + self.patterns.add(pattern); + self + } + + /// Add the given iterator of patterns to this set to match. + /// + /// The iterator must yield elements that can be converted into a `&[u8]`. + /// + /// The order in which patterns are added is significant. Namely, when + /// using leftmost-first match semantics, then when multiple patterns can + /// match at a particular location, the pattern that was added first is + /// used as the match. + /// + /// If the number of patterns added exceeds the amount supported by packed + /// searchers, then the builder will stop accumulating patterns and render + /// itself inert. At this point, constructing a searcher will always return + /// `None`. + pub fn extend(&mut self, patterns: I) -> &mut Builder + where + I: IntoIterator, + P: AsRef<[u8]>, + { + for p in patterns { + self.add(p); + } + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +/// A packed searcher for quickly finding occurrences of multiple patterns. +/// +/// If callers need more flexible construction, or if one wants to change the +/// match semantics (either leftmost-first or leftmost-longest), then one can +/// use the [`Config`](struct.Config.html) and/or +/// [`Builder`](struct.Builder.html) types for more fine grained control. +/// +/// # Example +/// +/// This example shows how to create a searcher from an iterator of patterns. +/// By default, leftmost-first match semantics are used. +/// +/// ``` +/// use aho_corasick::packed::{MatchKind, Searcher}; +/// +/// # fn example() -> Option<()> { +/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; +/// let matches: Vec = searcher +/// .find_iter("foobar") +/// .map(|mat| mat.pattern()) +/// .collect(); +/// assert_eq!(vec![0], matches); +/// # Some(()) } +/// # if cfg!(target_arch = "x86_64") { +/// # example().unwrap() +/// # } else { +/// # assert!(example().is_none()); +/// # } +/// ``` +#[derive(Clone, Debug)] +pub struct Searcher { + patterns: Patterns, + rabinkarp: RabinKarp, + search_kind: SearchKind, + minimum_len: usize, +} + +#[derive(Clone, Debug)] +enum SearchKind { + Teddy(Teddy), + RabinKarp, +} + +impl Searcher { + /// A convenience function for constructing a searcher from an iterator + /// of things that can be converted to a `&[u8]`. + /// + /// If a searcher could not be constructed (either because of an + /// unsupported CPU or because there are too many patterns), then `None` + /// is returned. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let matches: Vec = searcher + /// .find_iter("foobar") + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0], matches); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn new(patterns: I) -> Option + where + I: IntoIterator, + P: AsRef<[u8]>, + { + Builder::new().extend(patterns).build() + } + + /// Return the first occurrence of any of the patterns in this searcher, + /// according to its match semantics, in the given haystack. 
The `Match` + /// returned will include the identifier of the pattern that matched, which + /// corresponds to the index of the pattern (starting from `0`) in which it + /// was added. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let mat = searcher.find("foobar")?; + /// assert_eq!(0, mat.pattern()); + /// assert_eq!(0, mat.start()); + /// assert_eq!(6, mat.end()); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn find>(&self, haystack: B) -> Option { + self.find_at(haystack, 0) + } + + /// Return the first occurrence of any of the patterns in this searcher, + /// according to its match semantics, in the given haystack starting from + /// the given position. + /// + /// The `Match` returned will include the identifier of the pattern that + /// matched, which corresponds to the index of the pattern (starting from + /// `0`) in which it was added. The offsets in the `Match` will be relative + /// to the start of `haystack` (and not `at`). + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let mat = searcher.find_at("foofoobar", 3)?; + /// assert_eq!(0, mat.pattern()); + /// assert_eq!(3, mat.start()); + /// assert_eq!(9, mat.end()); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn find_at>( + &self, + haystack: B, + at: usize, + ) -> Option { + let haystack = haystack.as_ref(); + match self.search_kind { + SearchKind::Teddy(ref teddy) => { + if haystack[at..].len() < teddy.minimum_len() { + return self.slow_at(haystack, at); + } + teddy.find_at(&self.patterns, haystack, at) + } + SearchKind::RabinKarp => { + self.rabinkarp.find_at(&self.patterns, haystack, at) + } + } + } + + /// Return an iterator of non-overlapping occurrences of the patterns in + /// this searcher, according to its match semantics, in the given haystack. + /// + /// # Example + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// let matches: Vec = searcher + /// .find_iter("foobar fooba foofoo") + /// .map(|mat| mat.pattern()) + /// .collect(); + /// assert_eq!(vec![0, 1, 1, 1], matches); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( + &'a self, + haystack: &'b B, + ) -> FindIter<'a, 'b> { + FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 } + } + + /// Returns the match kind used by this packed searcher. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use aho_corasick::packed::{MatchKind, Searcher}; + /// + /// # fn example() -> Option<()> { + /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; + /// // leftmost-first is the default. 
+ /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); + /// # Some(()) } + /// # if cfg!(target_arch = "x86_64") { + /// # example().unwrap() + /// # } else { + /// # assert!(example().is_none()); + /// # } + /// ``` + pub fn match_kind(&self) -> &MatchKind { + self.patterns.match_kind() + } + + /// Returns the minimum length of a haystack that is required in order for + /// packed searching to be effective. + /// + /// In some cases, the underlying packed searcher may not be able to search + /// very short haystacks. When that occurs, the implementation will defer + /// to a slower non-packed searcher (which is still generally faster than + /// Aho-Corasick for a small number of patterns). However, callers may + /// want to avoid ever using the slower variant, which one can do by + /// never passing a haystack shorter than the minimum length returned by + /// this method. + pub fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the approximate total amount of heap used by this searcher, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + self.patterns.heap_bytes() + + self.rabinkarp.heap_bytes() + + self.search_kind.heap_bytes() + } + + /// Use a slow (non-packed) searcher. + /// + /// This is useful when a packed searcher could be constructed, but could + /// not be used to search a specific haystack. For example, if Teddy was + /// built but the haystack is smaller than ~34 bytes, then Teddy might not + /// be able to run. + fn slow_at(&self, haystack: &[u8], at: usize) -> Option { + self.rabinkarp.find_at(&self.patterns, haystack, at) + } +} + +impl SearchKind { + fn heap_bytes(&self) -> usize { + match *self { + SearchKind::Teddy(ref ted) => ted.heap_bytes(), + SearchKind::RabinKarp => 0, + } + } +} + +/// An iterator over non-overlapping matches from a packed searcher. +/// +/// The lifetime `'s` refers to the lifetime of the underlying +/// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the +/// lifetime of the haystack being searched. +#[derive(Debug)] +pub struct FindIter<'s, 'h> { + searcher: &'s Searcher, + haystack: &'h [u8], + at: usize, +} + +impl<'s, 'h> Iterator for FindIter<'s, 'h> { + type Item = Match; + + fn next(&mut self) -> Option { + if self.at > self.haystack.len() { + return None; + } + match self.searcher.find_at(&self.haystack, self.at) { + None => None, + Some(c) => { + self.at = c.end; + Some(c) + } + } + } +} diff --git a/src/packed/mod.rs b/src/packed/mod.rs new file mode 100644 index 0000000..97c40ff --- /dev/null +++ b/src/packed/mod.rs @@ -0,0 +1,117 @@ +/*! +A lower level API for packed multiple substring search, principally for a small +number of patterns. + +This sub-module provides vectorized routines for quickly finding matches of a +small number of patterns. In general, users of this crate shouldn't need to +interface with this module directly, as the primary +[`AhoCorasick`](../struct.AhoCorasick.html) +searcher will use these routines automatically as a prefilter when applicable. +However, in some cases, callers may want to bypass the Aho-Corasick machinery +entirely and use this vectorized searcher directly. + +# Overview + +The primary types in this sub-module are: + +* [`Searcher`](struct.Searcher.html) executes the actual search algorithm to + report matches in a haystack. +* [`Builder`](struct.Builder.html) accumulates patterns incrementally and can + construct a `Searcher`. 
+* [`Config`](struct.Config.html) permits tuning the searcher, and itself will + produce a `Builder` (which can then be used to build a `Searcher`). + Currently, the only tuneable knob are the match semantics, but this may be + expanded in the future. + +# Examples + +This example shows how to create a searcher from an iterator of patterns. +By default, leftmost-first match semantics are used. (See the top-level +[`MatchKind`](../enum.MatchKind.html) type for more details about match +semantics, which apply similarly to packed substring search.) + +``` +use aho_corasick::packed::{MatchKind, Searcher}; + +# fn example() -> Option<()> { +let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; +let matches: Vec = searcher + .find_iter("foobar") + .map(|mat| mat.pattern()) + .collect(); +assert_eq!(vec![0], matches); +# Some(()) } +# if cfg!(target_arch = "x86_64") { +# example().unwrap() +# } else { +# assert!(example().is_none()); +# } +``` + +This example shows how to use [`Config`](struct.Config.html) to change the +match semantics to leftmost-longest: + +``` +use aho_corasick::packed::{Config, MatchKind}; + +# fn example() -> Option<()> { +let searcher = Config::new() + .match_kind(MatchKind::LeftmostLongest) + .builder() + .add("foo") + .add("foobar") + .build()?; +let matches: Vec = searcher + .find_iter("foobar") + .map(|mat| mat.pattern()) + .collect(); +assert_eq!(vec![1], matches); +# Some(()) } +# if cfg!(target_arch = "x86_64") { +# example().unwrap() +# } else { +# assert!(example().is_none()); +# } +``` + +# Packed substring searching + +Packed substring searching refers to the use of SIMD (Single Instruction, +Multiple Data) to accelerate the detection of matches in a haystack. Unlike +conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring +search tend to do better with a small number of patterns, where as Aho-Corasick +generally maintains reasonably consistent performance regardless of the number +of patterns you give it. Because of this, the vectorized searcher in this +sub-module cannot be used as a general purpose searcher, since building the +searcher may fail. However, in exchange, when searching for a small number of +patterns, searching can be quite a bit faster than Aho-Corasick (sometimes by +an order of magnitude). + +The key take away here is that constructing a searcher from a list of patterns +is a fallible operation. While the precise conditions under which building a +searcher can fail is specifically an implementation detail, here are some +common reasons: + +* Too many patterns were given. Typically, the limit is on the order of 100 or + so, but this limit may fluctuate based on available CPU features. +* The available packed algorithms require CPU features that aren't available. + For example, currently, this crate only provides packed algorithms for + `x86_64`. Therefore, constructing a packed searcher on any other target + (e.g., ARM) will always fail. +* Zero patterns were given, or one of the patterns given was empty. Packed + searchers require at least one pattern and that all patterns are non-empty. +* Something else about the nature of the patterns (typically based on + heuristics) suggests that a packed searcher would perform very poorly, so + no searcher is built. 
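
Since construction is fallible, a common arrangement is to treat the packed
searcher as an opportunistic fast path and fall back to the general purpose
`AhoCorasick` searcher when `build` returns `None`. The sketch below is
illustrative only (the `find_all` helper is hypothetical and not part of this
crate); it pins the fallback to leftmost-first semantics so that both paths
report identical matches:

```rust
use aho_corasick::packed;
use aho_corasick::{AhoCorasickBuilder, MatchKind};

// Hypothetical helper: use the packed searcher when it can be built,
// otherwise fall back to the general purpose Aho-Corasick searcher.
fn find_all(patterns: &[&str], haystack: &str) -> Vec<(usize, usize, usize)> {
    if let Some(searcher) =
        packed::Config::new().builder().extend(patterns).build()
    {
        searcher
            .find_iter(haystack)
            .map(|m| (m.pattern(), m.start(), m.end()))
            .collect()
    } else {
        // The packed searcher defaults to leftmost-first, so request the
        // same semantics here to keep the two paths equivalent.
        AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostFirst)
            .build(patterns)
            .find_iter(haystack)
            .map(|m| (m.pattern(), m.start(), m.end()))
            .collect()
    }
}

fn main() {
    let hits = find_all(&["foobar", "foo"], "foo foobar");
    assert_eq!(hits, vec![(1, 0, 3), (0, 4, 10)]);
}
```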
+*/ + +pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; + +mod api; +mod pattern; +mod rabinkarp; +mod teddy; +#[cfg(test)] +mod tests; +#[cfg(target_arch = "x86_64")] +mod vector; diff --git a/src/packed/pattern.rs b/src/packed/pattern.rs new file mode 100644 index 0000000..f4c6756 --- /dev/null +++ b/src/packed/pattern.rs @@ -0,0 +1,318 @@ +use std::cmp; +use std::fmt; +use std::mem; +use std::u16; +use std::usize; + +use crate::packed::api::MatchKind; + +/// The type used for representing a pattern identifier. +/// +/// We don't use `usize` here because our packed searchers don't scale to +/// huge numbers of patterns, so we keep things a bit smaller. +pub type PatternID = u16; + +/// A non-empty collection of non-empty patterns to search for. +/// +/// This collection of patterns is what is passed around to both execute +/// searches and to construct the searchers themselves. Namely, this permits +/// searches to avoid copying all of the patterns, and allows us to keep only +/// one copy throughout all packed searchers. +/// +/// Note that this collection is not a set. The same pattern can appear more +/// than once. +#[derive(Clone, Debug)] +pub struct Patterns { + /// The match semantics supported by this collection of patterns. + /// + /// The match semantics determines the order of the iterator over patterns. + /// For leftmost-first, patterns are provided in the same order as were + /// provided by the caller. For leftmost-longest, patterns are provided in + /// descending order of length, with ties broken by the order in which they + /// were provided by the caller. + kind: MatchKind, + /// The collection of patterns, indexed by their identifier. + by_id: Vec>, + /// The order of patterns defined for iteration, given by pattern + /// identifiers. The order of `by_id` and `order` is always the same for + /// leftmost-first semantics, but may be different for leftmost-longest + /// semantics. + order: Vec, + /// The length of the smallest pattern, in bytes. + minimum_len: usize, + /// The largest pattern identifier. This should always be equivalent to + /// the number of patterns minus one in this collection. + max_pattern_id: PatternID, + /// The total number of pattern bytes across the entire collection. This + /// is used for reporting total heap usage in constant time. + total_pattern_bytes: usize, +} + +impl Patterns { + /// Create a new collection of patterns for the given match semantics. The + /// ID of each pattern is the index of the pattern at which it occurs in + /// the `by_id` slice. + /// + /// If any of the patterns in the slice given are empty, then this panics. + /// Similarly, if the number of patterns given is zero, then this also + /// panics. + pub fn new() -> Patterns { + Patterns { + kind: MatchKind::default(), + by_id: vec![], + order: vec![], + minimum_len: usize::MAX, + max_pattern_id: 0, + total_pattern_bytes: 0, + } + } + + /// Add a pattern to this collection. + /// + /// This panics if the pattern given is empty. + pub fn add(&mut self, bytes: &[u8]) { + assert!(!bytes.is_empty()); + assert!(self.by_id.len() <= u16::MAX as usize); + + let id = self.by_id.len() as u16; + self.max_pattern_id = id; + self.order.push(id); + self.by_id.push(bytes.to_vec()); + self.minimum_len = cmp::min(self.minimum_len, bytes.len()); + self.total_pattern_bytes += bytes.len(); + } + + /// Set the match kind semantics for this collection of patterns. + /// + /// If the kind is not set, then the default is leftmost-first. 
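+    ///
+    /// For example, if the patterns `foo` and `foobar` were added in that
+    /// order, then leftmost-first iteration visits `foo` and then `foobar`
+    /// (the caller's order), while leftmost-longest iteration visits
+    /// `foobar` and then `foo` (descending order of length).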
+ pub fn set_match_kind(&mut self, kind: MatchKind) { + match kind { + MatchKind::LeftmostFirst => { + self.order.sort(); + } + MatchKind::LeftmostLongest => { + let (order, by_id) = (&mut self.order, &mut self.by_id); + order.sort_by(|&id1, &id2| { + by_id[id1 as usize] + .len() + .cmp(&by_id[id2 as usize].len()) + .reverse() + }); + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } + + /// Return the number of patterns in this collection. + /// + /// This is guaranteed to be greater than zero. + pub fn len(&self) -> usize { + self.by_id.len() + } + + /// Returns true if and only if this collection of patterns is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the approximate total amount of heap used by these patterns, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + self.order.len() * mem::size_of::() + + self.by_id.len() * mem::size_of::>() + + self.total_pattern_bytes + } + + /// Clears all heap memory associated with this collection of patterns and + /// resets all state such that it is a valid empty collection. + pub fn reset(&mut self) { + self.kind = MatchKind::default(); + self.by_id.clear(); + self.order.clear(); + self.minimum_len = usize::MAX; + self.max_pattern_id = 0; + } + + /// Return the maximum pattern identifier in this collection. This can be + /// useful in searchers for ensuring that the collection of patterns they + /// are provided at search time and at build time have the same size. + pub fn max_pattern_id(&self) -> PatternID { + assert_eq!((self.max_pattern_id + 1) as usize, self.len()); + self.max_pattern_id + } + + /// Returns the length, in bytes, of the smallest pattern. + /// + /// This is guaranteed to be at least one. + pub fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the match semantics used by these patterns. + pub fn match_kind(&self) -> &MatchKind { + &self.kind + } + + /// Return the pattern with the given identifier. If such a pattern does + /// not exist, then this panics. + pub fn get(&self, id: PatternID) -> Pattern<'_> { + Pattern(&self.by_id[id as usize]) + } + + /// Return the pattern with the given identifier without performing bounds + /// checks. + /// + /// # Safety + /// + /// Callers must ensure that a pattern with the given identifier exists + /// before using this method. + #[cfg(target_arch = "x86_64")] + pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> { + Pattern(self.by_id.get_unchecked(id as usize)) + } + + /// Return an iterator over all the patterns in this collection, in the + /// order in which they should be matched. + /// + /// Specifically, in a naive multi-pattern matcher, the following is + /// guaranteed to satisfy the match semantics of this collection of + /// patterns: + /// + /// ```ignore + /// for i in 0..haystack.len(): + /// for p in patterns.iter(): + /// if haystack[i..].starts_with(p.bytes()): + /// return Match(p.id(), i, i + p.bytes().len()) + /// ``` + /// + /// Namely, among the patterns in a collection, if they are matched in + /// the order provided by this iterator, then the result is guaranteed + /// to satisfy the correct match semantics. (Either leftmost-first or + /// leftmost-longest.) + pub fn iter(&self) -> PatternIter<'_> { + PatternIter { patterns: self, i: 0 } + } +} + +/// An iterator over the patterns in the `Patterns` collection. +/// +/// The order of the patterns provided by this iterator is consistent with the +/// match semantics of the originating collection of patterns. 
+/// +/// The lifetime `'p` corresponds to the lifetime of the collection of patterns +/// this is iterating over. +#[derive(Debug)] +pub struct PatternIter<'p> { + patterns: &'p Patterns, + i: usize, +} + +impl<'p> Iterator for PatternIter<'p> { + type Item = (PatternID, Pattern<'p>); + + fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> { + if self.i >= self.patterns.len() { + return None; + } + let id = self.patterns.order[self.i]; + let p = self.patterns.get(id); + self.i += 1; + Some((id, p)) + } +} + +/// A pattern that is used in packed searching. +#[derive(Clone)] +pub struct Pattern<'a>(&'a [u8]); + +impl<'a> fmt::Debug for Pattern<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Pattern") + .field("lit", &String::from_utf8_lossy(&self.0)) + .finish() + } +} + +impl<'p> Pattern<'p> { + /// Returns the length of this pattern, in bytes. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Returns the bytes of this pattern. + pub fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Returns the first `len` low nybbles from this pattern. If this pattern + /// is shorter than `len`, then this panics. + #[cfg(target_arch = "x86_64")] + pub fn low_nybbles(&self, len: usize) -> Vec { + let mut nybs = vec![]; + for &b in self.bytes().iter().take(len) { + nybs.push(b & 0xF); + } + nybs + } + + /// Returns true if this pattern is a prefix of the given bytes. + #[inline(always)] + pub fn is_prefix(&self, bytes: &[u8]) -> bool { + self.len() <= bytes.len() && self.equals(&bytes[..self.len()]) + } + + /// Returns true if and only if this pattern equals the given bytes. + #[inline(always)] + pub fn equals(&self, bytes: &[u8]) -> bool { + // Why not just use memcmp for this? Well, memcmp requires calling out + // to libc, and this routine is called in fairly hot code paths. Other + // than just calling out to libc, it also seems to result in worse + // codegen. By rolling our own memcpy in pure Rust, it seems to appear + // more friendly to the optimizer. + // + // This results in an improvement in just about every benchmark. Some + // smaller than others, but in some cases, up to 30% faster. + + if self.len() != bytes.len() { + return false; + } + if self.len() < 8 { + for (&b1, &b2) in self.bytes().iter().zip(bytes) { + if b1 != b2 { + return false; + } + } + return true; + } + // When we have 8 or more bytes to compare, then proceed in chunks of + // 8 at a time using unaligned loads. + let mut p1 = self.bytes().as_ptr(); + let mut p2 = bytes.as_ptr(); + let p1end = self.bytes()[self.len() - 8..].as_ptr(); + let p2end = bytes[bytes.len() - 8..].as_ptr(); + // SAFETY: Via the conditional above, we know that both `p1` and `p2` + // have the same length, so `p1 < p1end` implies that `p2 < p2end`. + // Thus, derefencing both `p1` and `p2` in the loop below is safe. + // + // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual + // end of of `p1` and `p2`. Thus, the final dereference outside of the + // loop is guaranteed to be valid. + // + // Finally, we needn't worry about 64-bit alignment here, since we + // do unaligned loads. 
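+        //
+        // For example (illustrative): for two equal 13 byte strings, the loop
+        // below compares bytes 0..8 and the final loads compare bytes 5..13.
+        // The overlapping bytes 5..8 are simply compared twice, which is
+        // harmless.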
+ unsafe { + while p1 < p1end { + let v1 = (p1 as *const u64).read_unaligned(); + let v2 = (p2 as *const u64).read_unaligned(); + if v1 != v2 { + return false; + } + p1 = p1.add(8); + p2 = p2.add(8); + } + let v1 = (p1end as *const u64).read_unaligned(); + let v2 = (p2end as *const u64).read_unaligned(); + v1 == v2 + } + } +} diff --git a/src/packed/rabinkarp.rs b/src/packed/rabinkarp.rs new file mode 100644 index 0000000..c081f70 --- /dev/null +++ b/src/packed/rabinkarp.rs @@ -0,0 +1,185 @@ +use std::mem; + +use crate::packed::pattern::{PatternID, Patterns}; +use crate::Match; + +/// The type of the rolling hash used in the Rabin-Karp algorithm. +type Hash = usize; + +/// The number of buckets to store our patterns in. We don't want this to be +/// too big in order to avoid wasting memory, but we don't want it to be too +/// small either to avoid spending too much time confirming literals. +/// +/// The number of buckets MUST be a power of two. Otherwise, determining the +/// bucket from a hash will slow down the code considerably. Using a power +/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` +/// instruction. +const NUM_BUCKETS: usize = 64; + +/// An implementation of the Rabin-Karp algorithm. The main idea of this +/// algorithm is to maintain a rolling hash as it moves through the input, and +/// then check whether that hash corresponds to the same hash for any of the +/// patterns we're looking for. +/// +/// A draw back of naively scaling Rabin-Karp to multiple patterns is that +/// it requires all of the patterns to be the same length, which in turn +/// corresponds to the number of bytes to hash. We adapt this to work for +/// multiple patterns of varying size by fixing the number of bytes to hash +/// to be the length of the smallest pattern. We also split the patterns into +/// several buckets to hopefully make the confirmation step faster. +/// +/// Wikipedia has a decent explanation, if a bit heavy on the theory: +/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm +/// +/// But ESMAJ provides something a bit more concrete: +/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html +#[derive(Clone, Debug)] +pub struct RabinKarp { + /// The order of patterns in each bucket is significant. Namely, they are + /// arranged such that the first one to match is the correct match. This + /// may not necessarily correspond to the order provided by the caller. + /// For example, if leftmost-longest semantics are used, then the patterns + /// are sorted by their length in descending order. If leftmost-first + /// semantics are used, then the patterns are sorted by their pattern ID + /// in ascending order (which corresponds to the caller's order). + buckets: Vec>, + /// The length of the hashing window. Generally, this corresponds to the + /// length of the smallest pattern. + hash_len: usize, + /// The factor to subtract out of a hash before updating it with a new + /// byte. + hash_2pow: usize, + /// The maximum identifier of a pattern. This is used as a sanity check + /// to ensure that the patterns provided by the caller are the same as + /// the patterns that were used to compile the matcher. This sanity check + /// possibly permits safely eliminating bounds checks regardless of what + /// patterns are provided by the caller. + /// + /// (Currently, we don't use this to elide bounds checks since it doesn't + /// result in a measurable performance improvement, but we do use it for + /// better failure modes.) 
+ max_pattern_id: PatternID, +} + +impl RabinKarp { + /// Compile a new Rabin-Karp matcher from the patterns given. + /// + /// This panics if any of the patterns in the collection are empty, or if + /// the collection is itself empty. + pub fn new(patterns: &Patterns) -> RabinKarp { + assert!(patterns.len() >= 1); + let hash_len = patterns.minimum_len(); + assert!(hash_len >= 1); + + let mut hash_2pow = 1usize; + for _ in 1..hash_len { + hash_2pow = hash_2pow.wrapping_shl(1); + } + + let mut rk = RabinKarp { + buckets: vec![vec![]; NUM_BUCKETS], + hash_len, + hash_2pow, + max_pattern_id: patterns.max_pattern_id(), + }; + for (id, pat) in patterns.iter() { + let hash = rk.hash(&pat.bytes()[..rk.hash_len]); + let bucket = hash % NUM_BUCKETS; + rk.buckets[bucket].push((hash, id)); + } + rk + } + + /// Return the first matching pattern in the given haystack, begining the + /// search at `at`. + pub fn find_at( + &self, + patterns: &Patterns, + haystack: &[u8], + mut at: usize, + ) -> Option { + assert_eq!(NUM_BUCKETS, self.buckets.len()); + assert_eq!( + self.max_pattern_id, + patterns.max_pattern_id(), + "Rabin-Karp must be called with same patterns it was built with", + ); + + if at + self.hash_len > haystack.len() { + return None; + } + let mut hash = self.hash(&haystack[at..at + self.hash_len]); + loop { + let bucket = &self.buckets[hash % NUM_BUCKETS]; + for &(phash, pid) in bucket { + if phash == hash { + if let Some(c) = self.verify(patterns, pid, haystack, at) { + return Some(c); + } + } + } + if at + self.hash_len >= haystack.len() { + return None; + } + hash = self.update_hash( + hash, + haystack[at], + haystack[at + self.hash_len], + ); + at += 1; + } + } + + /// Returns the approximate total amount of heap used by this searcher, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + let num_patterns = self.max_pattern_id as usize + 1; + self.buckets.len() * mem::size_of::>() + + num_patterns * mem::size_of::<(Hash, PatternID)>() + } + + /// Verify whether the pattern with the given id matches at + /// `haystack[at..]`. + /// + /// We tag this function as `cold` because it helps improve codegen. + /// Intuitively, it would seem like inlining it would be better. However, + /// the only time this is called and a match is not found is when there + /// there is a hash collision, or when a prefix of a pattern matches but + /// the entire pattern doesn't match. This is hopefully fairly rare, and + /// if it does occur a lot, it's going to be slow no matter what we do. + #[cold] + fn verify( + &self, + patterns: &Patterns, + id: PatternID, + haystack: &[u8], + at: usize, + ) -> Option { + let pat = patterns.get(id); + if pat.is_prefix(&haystack[at..]) { + Some(Match::from_span(id as usize, at, at + pat.len())) + } else { + None + } + } + + /// Hash the given bytes. + fn hash(&self, bytes: &[u8]) -> Hash { + assert_eq!(self.hash_len, bytes.len()); + + let mut hash = 0usize; + for &b in bytes { + hash = hash.wrapping_shl(1).wrapping_add(b as usize); + } + hash + } + + /// Update the hash given based on removing `old_byte` at the beginning + /// of some byte string, and appending `new_byte` to the end of that same + /// byte string. 
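+    ///
+    /// For example (illustrative): with `hash_len = 3`, `hash_2pow` is `4`,
+    /// `hash(b"abc")` is `683` and `update_hash(683, b'a', b'd')` yields
+    /// `690`, which is exactly `hash(b"bcd")`.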
+ fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { + prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) + .wrapping_shl(1) + .wrapping_add(new_byte as usize) + } +} diff --git a/src/packed/teddy/README.md b/src/packed/teddy/README.md new file mode 100644 index 0000000..51b999b --- /dev/null +++ b/src/packed/teddy/README.md @@ -0,0 +1,386 @@ +Teddy is a SIMD accelerated multiple substring matching algorithm. The name +and the core ideas in the algorithm were learned from the [Hyperscan][1_u] +project. The implementation in this repository was mostly motivated for use in +accelerating regex searches by searching for small sets of required literals +extracted from the regex. + + +# Background + +The key idea of Teddy is to do *packed* substring matching. In the literature, +packed substring matching is the idea of examining multiple bytes in a haystack +at a time to detect matches. Implementations of, for example, memchr (which +detects matches of a single byte) have been doing this for years. Only +recently, with the introduction of various SIMD instructions, has this been +extended to substring matching. The PCMPESTRI instruction (and its relatives), +for example, implements substring matching in hardware. It is, however, limited +to substrings of length 16 bytes or fewer, but this restriction is fine in a +regex engine, since we rarely care about the performance difference between +searching for a 16 byte literal and a 16 + N literal; 16 is already long +enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs +at least, is its latency and throughput. As a result, it is often faster to +do substring search with a Boyer-Moore (or Two-Way) variant and a well placed +memchr to quickly skip through the haystack. + +There are fewer results from the literature on packed substring matching, +and even fewer for packed multiple substring matching. Ben-Kiki et al. [2] +describes use of PCMPESTRI for substring matching, but is mostly theoretical +and hand-waves performance. There is other theoretical work done by Bille [3] +as well. + +The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci +and is generally focused on multiple pattern search. Their first paper [4a] +introduces the concept of a fingerprint, which is computed for every block of +N bytes in every pattern. The haystack is then scanned N bytes at a time and +a fingerprint is computed in the same way it was computed for blocks in the +patterns. If the fingerprint corresponds to one that was found in a pattern, +then a verification step follows to confirm that one of the substrings with the +corresponding fingerprint actually matches at the current location. Various +implementation tricks are employed to make sure the fingerprint lookup is fast; +typically by truncating the fingerprint. (This may, of course, provoke more +steps in the verification process, so a balance must be struck.) + +The main downside of [4a] is that the minimum substring length is 32 bytes, +presumably because of how the algorithm uses certain SIMD instructions. This +essentially makes it useless for general purpose regex matching, where a small +number of short patterns is far more likely. + +Faro and Kulekci published another paper [4b] that is conceptually very similar +to [4a]. The key difference is that it uses the CRC32 instruction (introduced +as part of SSE 4.2) to compute fingerprint values. 
This also enables the +algorithm to work effectively on substrings as short as 7 bytes with 4 byte +windows. 7 bytes is unfortunately still too long. The window could be +technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the +small window size ends up negating most performance benefits—and it's likely +the common case in a general purpose regex engine. + +Faro and Kulekci also published [4c] that appears to be intended as a +replacement to using PCMPESTRI. In particular, it is specifically motivated by +the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD +instructions that are faster. While this approach works for short substrings, +I personally couldn't see a way to generalize it to multiple substring search. + +Faro and Kulekci have another paper [4d] that I haven't been able to read +because it is behind a paywall. + + +# Teddy + +Finally, we get to Teddy. If the above literature review is complete, then it +appears that Teddy is a novel algorithm. More than that, in my experience, it +completely blows away the competition for short substrings, which is exactly +what we want in a general purpose regex engine. Again, the algorithm appears +to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced +late 2015, and no earlier history could be found. Therefore, tracking the exact +provenance of the algorithm with respect to the published literature seems +difficult. + +At a high level, Teddy works somewhat similarly to the fingerprint algorithms +published by Faro and Kulekci, but Teddy does it in a way that scales a bit +better. Namely: + +1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX) + byte chunks. 16 (or 32) is significant because it corresponds to the number + of bytes in a SIMD vector. +2. Bitwise operations are performed on each chunk to discover if any region of + it matches a set of precomputed fingerprints from the patterns. If there are + matches, then a verification step is performed. In this implementation, our + verification step is naive. This can be improved upon. + +The details to make this work are quite clever. First, we must choose how to +pick our fingerprints. In Hyperscan's implementation, I *believe* they use the +last N bytes of each substring, where N must be at least the minimum length of +any substring in the set being searched. In this implementation, we use the +first N bytes of each substring. (The tradeoffs between these choices aren't +yet clear to me.) We then must figure out how to quickly test whether an +occurrence of any fingerprint from the set of patterns appears in a 16 byte +block from the haystack. To keep things simple, let's assume N = 1 and examine +some examples to motivate the approach. Here are our patterns: + +```ignore +foo +bar +baz +``` + +The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set +our 16 byte block to: + +```ignore +bat cat foo bump +xxxxxxxxxxxxxxxx +``` + +To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates +a mask that allows us to quickly compute membership of a fingerprint in a 16 +byte block that also tells which pattern the fingerprint corresponds to. 
In +this case, our fingerprint is a single byte, so an appropriate abstraction is +a map from a single byte to a list of patterns that contain that fingerprint: + +```ignore +f |--> foo +b |--> bar, baz +``` + +Now, all we need to do is figure out how to represent this map in vector space +and use normal SIMD operations to perform a lookup. The first simplification +we can make is to represent our patterns as bit fields occupying a single +byte. This is important, because a single SIMD vector can store 16 bytes. + +```ignore +f |--> 00000001 +b |--> 00000010, 00000100 +``` + +How do we perform lookup though? It turns out that SSSE3 introduced a very cool +instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, +and returns a third vector `C`. All vectors are treated as 16 8-bit integers. +`C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true +for the purposes of this algorithm. For full details, see [Intel's Intrinsics +Guide][5_u].) This essentially lets us use the values in `B` to lookup values +in `A`. + +If we could somehow cause `B` to contain our 16 byte block from the haystack, +and if `A` could contain our bitmasks, then we'd end up with something like +this for `A`: + +```ignore + 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF +A = 0 0 00000110 00000001 0 +``` + +And if `B` contains our window from our haystack, we could use shuffle to take +the values from `B` and use them to look up our bitsets in `A`. But of course, +we can't do this because `A` in the above example contains 256 bytes, which +is much larger than the size of a SIMD vector. + +Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of +our bitsets, we can use two masks, where one mask corresponds to the lower four +bits of our fingerprint and the other mask corresponds to the upper four bits. +So our map now looks like: + +```ignore +'f' & 0xF = 0x6 |--> 00000001 +'f' >> 4 = 0x6 |--> 00000111 +'b' & 0xF = 0x2 |--> 00000110 +'b' >> 4 = 0x6 |--> 00000111 +``` + +Notice that the bitsets for each nybble correspond to the union of all +fingerprints that contain that nybble. For example, both `f` and `b` have the +same upper 4 bits but differ on the lower 4 bits. Putting this together, we +have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is +our mask for the upper nybble and `B` is our 16 byte block from the haystack: + +```ignore + 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF +A0 = 0 0 00000110 0 00000001 0 +A1 = 0 0 0 0 00000111 0 +B = b a t _ t p +B = 0x62 0x61 0x74 0x20 0x74 0x70 +``` + +But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, +and we need indexes that are at most 4 bits (corresponding to one of 16 +values). We can apply the same transformation to split `B` into lower and upper +nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and +`B1` corresponds to the upper nybbles: + +```ignore + b a t _ c a t _ f o o _ b u m p +B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 +B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 +``` + +And now we have a nice correspondence. `B0` can index `A0` and `B1` can index +`A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: + +```ignore + b a ... f o ... p + A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] +C0 = 00000110 0 00000001 0 0 +``` + +And `C1 = PSHUFB(A1, B1)`: + +```ignore + b a ... f o ... 
p + A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] +C1 = 00000111 00000111 00000111 00000111 0 +``` + +Notice how neither one of `C0` or `C1` is guaranteed to report fully correct +results all on its own. For example, `C1` claims that `b` is a fingerprint for +the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint +for all of our patterns. But if we combined `C0` and `C1` with an `AND` +operation: + +```ignore + b a ... f o ... p +C = 00000110 0 00000001 0 0 +``` + +Then we now have that `C[i]` contains a bitset corresponding to the matching +fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that +block. + +Once we have that, we can look for the position of the least significant bit +in `C`. (Least significant because we only target `x86_64` here, which is +always little endian. Thus, the least significant bytes correspond to bytes +in our haystack at a lower address.) That position, modulo `8`, gives us +the pattern that the fingerprint matches. That position, integer divided by +`8`, also gives us the byte offset that the fingerprint occurs in inside the +16 byte haystack block. Using those two pieces of information, we can run a +verification procedure that tries to match all substrings containing that +fingerprint at that position in the haystack. + + +# Implementation notes + +The problem with the algorithm as described above is that it uses a single byte +for a fingerprint. This will work well if the fingerprints are rare in the +haystack (e.g., capital letters or special characters in normal English text), +but if the fingerprints are common, you'll wind up spending too much time in +the verification step, which effectively negates the performance benefits of +scanning 16 bytes at a time. Remember, the key to the performance of this +algorithm is to do as little work as possible per 16 (or 32) bytes. + +This algorithm can be extrapolated in a relatively straight-forward way to use +larger fingerprints. That is, instead of a single byte prefix, we might use a +two or three byte prefix. The implementation here implements N = {1, 2, 3} +and always picks the largest N possible. The rationale is that the bigger the +fingerprint, the fewer verification steps we'll do. Of course, if N is too +large, then we'll end up doing too much on each step. + +The way to extend it is: + +1. Add a mask for each byte in the fingerprint. (Remember that each mask is + composed of two SIMD vectors.) This results in a value of `C` for each byte + in the fingerprint while searching. +2. When testing each 16 (or 32) byte block, each value of `C` must be shifted + so that they are aligned. Once aligned, they should all be `AND`'d together. + This will give you only the bitsets corresponding to the full match of the + fingerprint. To do this, one needs to save the last byte (for N=2) or last + two bytes (for N=3) from the previous iteration, and then line them up with + the first one or two bytes of the next iteration. + +## Verification + +Verification generally follows the procedure outlined above. The tricky parts +are in the right formulation of operations to get our bits out of our vectors. +We have a limited set of operations available to us on SIMD vectors as 128-bit +or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers +from our vectors, and then run our verification step on each of those. The +verification step looks at the least significant bit set, and from its +position, we can derive the byte offset and bucket. 
(Again, as described +above.) Once we know the bucket, we do a fairly naive exhaustive search for +every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash +table, but I haven't had time to thoroughly explore that. A few initial +half-hearted attempts resulted in worse performance.) + +## AVX + +The AVX version of Teddy extrapolates almost perfectly from the SSE version. +The only hickup is that PALIGNR is used to align chunks in the 16-bit version, +and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it +only works within 128-bit lanes. So there's a bit of tomfoolery to get around +this by shuffling the vectors before calling VPALIGNR. + +The only other aspect to AVX is that since our masks are still fundamentally +16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to +32-byte chunks. + +## Fat Teddy + +In the version of Teddy described above, 8 buckets are used to group patterns +that we want to search for. However, when AVX is available, we can extend the +number of buckets to 16 by permitting each byte in our masks to use 16-bits +instead of 8-bits to represent the buckets it belongs to. (This variant is also +in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a +time, even though we're using AVX. Instead, we have to scan 16 bytes at a time. +What we gain, though, is (hopefully) less work in our verification routine. +It patterns are more spread out across more buckets, then there should overall +be fewer false positives. In general, Fat Teddy permits us to grow our capacity +a bit and search for more literals before Teddy gets overwhelmed. + +The tricky part of Fat Teddy is in how we adjust our masks and our verification +procedure. For the masks, we simply represent the first 8 buckets in each of +the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes. +Then, in the search loop, instead of loading 32 bytes from the haystack, we +load the same 16 bytes from the haystack into both the low and high 16 byte +portions of our 256-bit vector. So for example, a mask might look like this: + + bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000 + byte: 31 30 16 15 14 0 + offset: 15 14 0 15 14 0 + buckets: 8-15 8-15 8-15 0-7 0-7 0-7 + +Where `byte` is the position in the vector (higher numbers corresponding to +more significant bits), `offset` is the corresponding position in the haystack +chunk, and `buckets` corresponds to the bucket assignments for that particular +byte. + +In particular, notice that the bucket assignments for offset `0` are spread +out between bytes `0` and `16`. This works well for the chunk-by-chunk search +procedure, but verification really wants to process all bucket assignments for +each offset at once. Otherwise, we might wind up finding a match at offset +`1` in one the first 8 buckets, when we really should have reported a match +at offset `0` in one of the second 8 buckets. (Because we want the leftmost +match.) + +Thus, for verification, we rearrange the above vector such that it is a +sequence of 16-bit integers, where the least significant 16-bit integer +corresponds to all of the bucket assignments for offset `0`. So with the +above vector, the least significant 16-bit integer would be + + 11000000 000000 + +which was taken from bytes `16` and `0`. Then the verification step pretty much +runs as described, except with 16 buckets instead of 8. 
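
To make the mask construction and lookup described above concrete, here is a
small scalar model of the single byte fingerprint case. It is plain Rust with
no SIMD and no bucket grouping, and it is not the implementation in this
crate: the arrays `a0` and `a1` play the role of the `A0`/`A1` masks, and a
byte-at-a-time table lookup stands in for PSHUFB.

```rust
fn main() {
    // Three patterns, with one bit per pattern standing in for the bucket
    // bitsets described above.
    let patterns: &[&[u8]] = &[b"foo", b"bar", b"baz"];

    // a0 is indexed by the low nybble of a fingerprint byte, a1 by the high
    // nybble. Each entry is the set of patterns whose first byte has that
    // nybble.
    let (mut a0, mut a1) = ([0u8; 16], [0u8; 16]);
    for (i, pat) in patterns.iter().enumerate() {
        let f = pat[0];
        a0[(f & 0xF) as usize] |= 1u8 << i;
        a1[(f >> 4) as usize] |= 1u8 << i;
    }

    let haystack = b"bat cat foo bump";
    for (offset, &b) in haystack.iter().enumerate() {
        // Scalar stand-in for the two shuffles and the AND: the candidate
        // bitset for this haystack byte.
        let c = a0[(b & 0xF) as usize] & a1[(b >> 4) as usize];
        if c == 0 {
            continue;
        }
        // Verification: check each candidate pattern at this offset.
        for (i, pat) in patterns.iter().enumerate() {
            if c & (1u8 << i) != 0 && haystack[offset..].starts_with(pat) {
                println!("pattern {} matches at offset {}", i, offset);
            }
        }
    }
}
```

In the real search, the two lookups and the AND are performed for 16 (or 32)
haystack bytes at once with PSHUFB, each bit identifies a bucket of patterns
rather than a single pattern, and verification only examines the patterns in
the matching bucket.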
+ + +# References + +- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan), + [webpage](https://www.hyperscan.io/) +- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R., + & Weimann, O. (2011). + _Optimal packed string matching_. + In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13). + Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik. + DOI: 10.4230/LIPIcs.FSTTCS.2011.423. + [PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf). +- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R., + & Weimann, O. (2014). + _Towards optimal packed string matching_. + Theoretical Computer Science, 525, 111-129. + DOI: 10.1016/j.tcs.2013.06.013. + [PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf). +- **[3]** Bille, P. (2011). + _Fast searching in packed strings_. + Journal of Discrete Algorithms, 9(1), 49-56. + DOI: 10.1016/j.jda.2010.09.003. + [PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353). +- **[4a]** Faro, S., & Külekci, M. O. (2012, October). + _Fast multiple string matching using streaming SIMD extensions technology_. + In String Processing and Information Retrieval (pp. 217-228). + Springer Berlin Heidelberg. + DOI: 10.1007/978-3-642-34109-0_23. + [PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf). +- **[4b]** Faro, S., & Külekci, M. O. (2013, September). + _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_. + In Stringology (pp. 78-91). + [PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf). +- **[4c]** Faro, S., & Külekci, M. O. (2013, January). + _Fast packed string matching for short patterns_. + In Proceedings of the Meeting on Algorithm Engineering & Expermiments + (pp. 113-121). + Society for Industrial and Applied Mathematics. + [PDF](https://arxiv.org/pdf/1209.6449.pdf). +- **[4d]** Faro, S., & Külekci, M. O. (2014). + _Fast and flexible packed string matching_. + Journal of Discrete Algorithms, 28, 61-72. + DOI: 10.1016/j.jda.2014.07.003. + +[1_u]: https://github.com/intel/hyperscan +[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide diff --git a/src/packed/teddy/compile.rs b/src/packed/teddy/compile.rs new file mode 100644 index 0000000..741cb69 --- /dev/null +++ b/src/packed/teddy/compile.rs @@ -0,0 +1,414 @@ +// See the README in this directory for an explanation of the Teddy algorithm. + +use std::cmp; +use std::collections::BTreeMap; +use std::fmt; + +use crate::packed::pattern::{PatternID, Patterns}; +use crate::packed::teddy::Teddy; + +/// A builder for constructing a Teddy matcher. +/// +/// The builder primarily permits fine grained configuration of the Teddy +/// matcher. Most options are made only available for testing/benchmarking +/// purposes. In reality, options are automatically determined by the nature +/// and number of patterns given to the builder. +#[derive(Clone, Debug)] +pub struct Builder { + /// When none, this is automatically determined. Otherwise, `false` means + /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used + /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't + /// available and Fat Teddy was requested, no matcher will be built. + fat: Option, + /// When none, this is automatically determined. Otherwise, `false` means + /// that 128-bit vectors will be used (up to SSSE3 instructions) where as + /// `true` means that 256-bit vectors will be used. 
As with `fat`, if
+    /// 256-bit vectors are requested and they aren't available, then a
+    /// searcher will not be built.
+    avx: Option<bool>,
+}
+
+impl Default for Builder {
+    fn default() -> Builder {
+        Builder::new()
+    }
+}
+
+impl Builder {
+    /// Create a new builder for configuring a Teddy matcher.
+    pub fn new() -> Builder {
+        Builder { fat: None, avx: None }
+    }
+
+    /// Build a matcher for the set of patterns given. If a matcher could not
+    /// be built, then `None` is returned.
+    ///
+    /// Generally, a matcher isn't built if the necessary CPU features aren't
+    /// available, the target is unsupported, or the searcher is believed to
+    /// be slower than standard techniques (i.e., if there are too many
+    /// literals).
+    pub fn build(&self, patterns: &Patterns) -> Option<Teddy> {
+        self.build_imp(patterns)
+    }
+
+    /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
+    /// 16 buckets whereas Slim Teddy uses 8 buckets. More buckets are useful
+    /// for a larger set of literals.
+    ///
+    /// `None` is the default, which results in an automatic selection based
+    /// on the number of literals and available CPU features.
+    pub fn fat(&mut self, yes: Option<bool>) -> &mut Builder {
+        self.fat = yes;
+        self
+    }
+
+    /// Request the use of 256-bit vectors (true) or 128-bit vectors (false).
+    /// Generally, a larger vector size is better since it either permits
+    /// matching more patterns or matching more bytes in the haystack at once.
+    ///
+    /// `None` is the default, which results in an automatic selection based on
+    /// the number of literals and available CPU features.
+    pub fn avx(&mut self, yes: Option<bool>) -> &mut Builder {
+        self.avx = yes;
+        self
+    }
+
+    fn build_imp(&self, patterns: &Patterns) -> Option<Teddy> {
+        use crate::packed::teddy::runtime;
+
+        // Most of the logic here is just about selecting the optimal settings,
+        // or perhaps even rejecting construction altogether. The choices
+        // we have are: fat (avx only) or not, ssse3 or avx2, and how many
+        // patterns we allow ourselves to search. Additionally, for testing
+        // and benchmarking, we permit callers to try to "force" a setting,
+        // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't
+        // available), then we bail and return nothing.
+
+        if patterns.len() > 64 {
+            return None;
+        }
+        let has_ssse3 = is_x86_feature_detected!("ssse3");
+        let has_avx = is_x86_feature_detected!("avx2");
+        let avx = if self.avx == Some(true) {
+            if !has_avx {
+                return None;
+            }
+            true
+        } else if self.avx == Some(false) {
+            if !has_ssse3 {
+                return None;
+            }
+            false
+        } else if !has_ssse3 && !has_avx {
+            return None;
+        } else {
+            has_avx
+        };
+        let fat = match self.fat {
+            None => avx && patterns.len() > 32,
+            Some(false) => false,
+            Some(true) if !avx => return None,
+            Some(true) => true,
+        };
+
+        let mut compiler = Compiler::new(patterns, fat);
+        compiler.compile();
+        let Compiler { buckets, masks, .. } = compiler;
+        // SAFETY: It is required that the builder only produce Teddy matchers
+        // that are allowed to run on the current CPU, since we later assume
+        // that the presence of (for example) TeddySlim1Mask256 means it is
+        // safe to call functions marked with the `avx2` target feature.
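+        //
+        // The dispatch below selects one of nine concrete runtime variants:
+        // the number of masks (one per fingerprint byte, from 1 to 3) crossed
+        // with the 128-bit SSSE3, 256-bit slim AVX2 and 256-bit fat AVX2
+        // configurations.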
+ match (masks.len(), avx, fat) { + (1, false, _) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim1Mask128( + runtime::TeddySlim1Mask128 { + mask1: runtime::Mask128::new(masks[0]), + }, + ), + }), + (1, true, false) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim1Mask256( + runtime::TeddySlim1Mask256 { + mask1: runtime::Mask256::new(masks[0]), + }, + ), + }), + (1, true, true) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddyFat1Mask256( + runtime::TeddyFat1Mask256 { + mask1: runtime::Mask256::new(masks[0]), + }, + ), + }), + (2, false, _) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim2Mask128( + runtime::TeddySlim2Mask128 { + mask1: runtime::Mask128::new(masks[0]), + mask2: runtime::Mask128::new(masks[1]), + }, + ), + }), + (2, true, false) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim2Mask256( + runtime::TeddySlim2Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + }, + ), + }), + (2, true, true) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddyFat2Mask256( + runtime::TeddyFat2Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + }, + ), + }), + (3, false, _) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim3Mask128( + runtime::TeddySlim3Mask128 { + mask1: runtime::Mask128::new(masks[0]), + mask2: runtime::Mask128::new(masks[1]), + mask3: runtime::Mask128::new(masks[2]), + }, + ), + }), + (3, true, false) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddySlim3Mask256( + runtime::TeddySlim3Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + mask3: runtime::Mask256::new(masks[2]), + }, + ), + }), + (3, true, true) => Some(Teddy { + buckets, + max_pattern_id: patterns.max_pattern_id(), + exec: runtime::Exec::TeddyFat3Mask256( + runtime::TeddyFat3Mask256 { + mask1: runtime::Mask256::new(masks[0]), + mask2: runtime::Mask256::new(masks[1]), + mask3: runtime::Mask256::new(masks[2]), + }, + ), + }), + _ => unreachable!(), + } + } +} + +/// A compiler is in charge of allocating patterns into buckets and generating +/// the masks necessary for searching. +#[derive(Clone)] +struct Compiler<'p> { + patterns: &'p Patterns, + buckets: Vec>, + masks: Vec, +} + +impl<'p> Compiler<'p> { + /// Create a new Teddy compiler for the given patterns. If `fat` is true, + /// then 16 buckets will be used instead of 8. + /// + /// This panics if any of the patterns given are empty. + fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { + let mask_len = cmp::min(3, patterns.minimum_len()); + assert!(1 <= mask_len && mask_len <= 3); + + Compiler { + patterns, + buckets: vec![vec![]; if fat { 16 } else { 8 }], + masks: vec![Mask::default(); mask_len], + } + } + + /// Compile the patterns in this compiler into buckets and masks. + fn compile(&mut self) { + let mut lonibble_to_bucket: BTreeMap, usize> = BTreeMap::new(); + for (id, pattern) in self.patterns.iter() { + // We try to be slightly clever in how we assign patterns into + // buckets. 
Generally speaking, we want patterns with the same
+            // prefix to be in the same bucket, since it minimizes the amount
+            // of time we spend churning through buckets in the verification
+            // step.
+            //
+            // So we could assign patterns with the same N-prefix (where N
+            // is the size of the mask, which is one of {1, 2, 3}) to the
+            // same bucket. However, case insensitive searches are fairly
+            // common, so, for example, we'd ideally want to treat `abc` and
+            // `ABC` as if they shared the same prefix. ASCII has the nice
+            // property that the lower 4 bits of A and a are the same, so we
+            // therefore group patterns with the same low-nybble-N-prefix into
+            // the same bucket.
+            //
+            // MOREOVER, this is actually necessary for correctness! In
+            // particular, by grouping patterns with the same prefix into the
+            // same bucket, we ensure that we preserve correct leftmost-first
+            // and leftmost-longest match semantics. In addition to the fact
+            // that `patterns.iter()` iterates in the correct order, this
+            // guarantees that all possible ambiguous matches will occur in
+            // the same bucket. The verification routine could be adjusted to
+            // support correct leftmost match semantics regardless of bucket
+            // allocation, but that results in a performance hit. It's much
+            // nicer to be able to just stop as soon as a match is found.
+            let lonybs = pattern.low_nybbles(self.masks.len());
+            if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) {
+                self.buckets[bucket].push(id);
+            } else {
+                // N.B. We assign buckets in reverse because it shouldn't have
+                // any influence on performance, but it does make it harder to
+                // get leftmost match semantics accidentally correct.
+                let bucket = (self.buckets.len() - 1)
+                    - (id as usize % self.buckets.len());
+                self.buckets[bucket].push(id);
+                lonibble_to_bucket.insert(lonybs, bucket);
+            }
+        }
+        for (bucket_index, bucket) in self.buckets.iter().enumerate() {
+            for &pat_id in bucket {
+                let pat = self.patterns.get(pat_id);
+                for (i, mask) in self.masks.iter_mut().enumerate() {
+                    if self.buckets.len() == 8 {
+                        mask.add_slim(bucket_index as u8, pat.bytes()[i]);
+                    } else {
+                        mask.add_fat(bucket_index as u8, pat.bytes()[i]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<'p> fmt::Debug for Compiler<'p> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut buckets = vec![vec![]; self.buckets.len()];
+        for (i, bucket) in self.buckets.iter().enumerate() {
+            for &patid in bucket {
+                buckets[i].push(self.patterns.get(patid));
+            }
+        }
+        f.debug_struct("Compiler")
+            .field("buckets", &buckets)
+            .field("masks", &self.masks)
+            .finish()
+    }
+}
+
+/// Mask represents the low and high nybble masks that will be used during
+/// search. Each mask is 32 bytes wide, although only the first 16 bytes are
+/// used for the SSSE3 runtime.
+///
+/// Each byte in the mask corresponds to an 8-bit bitset, where bit `i` is set
+/// if and only if the corresponding nybble is in the ith bucket. The index of
+/// the byte (0-15, inclusive) corresponds to the nybble.
+///
+/// Each mask is used as the target of a shuffle, where the indices for the
+/// shuffle are taken from the haystack. AND'ing the shuffles for both the
+/// low and high masks together also results in 8-bit bitsets, but where bit
+/// `i` is set if and only if the corresponding *byte* is in the ith bucket.
+///
+/// During compilation, masks are just arrays. But during search, these masks
+/// are represented as 128-bit or 256-bit vectors.
+///
+/// (See the README in this directory for more details.)
+#[derive(Clone, Copy, Default)] +pub struct Mask { + lo: [u8; 32], + hi: [u8; 32], +} + +impl Mask { + /// Update this mask by adding the given byte to the given bucket. The + /// given bucket must be in the range 0-7. + /// + /// This is for "slim" Teddy, where there are only 8 buckets. + fn add_slim(&mut self, bucket: u8, byte: u8) { + assert!(bucket < 8); + + let byte_lo = (byte & 0xF) as usize; + let byte_hi = ((byte >> 4) & 0xF) as usize; + // When using 256-bit vectors, we need to set this bucket assignment in + // the low and high 128-bit portions of the mask. This allows us to + // process 32 bytes at a time. Namely, AVX2 shuffles operate on each + // of the 128-bit lanes, rather than the full 256-bit vector at once. + self.lo[byte_lo] |= 1 << bucket; + self.lo[byte_lo + 16] |= 1 << bucket; + self.hi[byte_hi] |= 1 << bucket; + self.hi[byte_hi + 16] |= 1 << bucket; + } + + /// Update this mask by adding the given byte to the given bucket. The + /// given bucket must be in the range 0-15. + /// + /// This is for "fat" Teddy, where there are 16 buckets. + fn add_fat(&mut self, bucket: u8, byte: u8) { + assert!(bucket < 16); + + let byte_lo = (byte & 0xF) as usize; + let byte_hi = ((byte >> 4) & 0xF) as usize; + // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, + // the high 128 bits of our mask correspond to buckets 8-15, while the + // low 128 bits correspond to buckets 0-7. + if bucket < 8 { + self.lo[byte_lo] |= 1 << bucket; + self.hi[byte_hi] |= 1 << bucket; + } else { + self.lo[byte_lo + 16] |= 1 << (bucket % 8); + self.hi[byte_hi + 16] |= 1 << (bucket % 8); + } + } + + /// Return the low 128 bits of the low-nybble mask. + pub fn lo128(&self) -> [u8; 16] { + let mut tmp = [0; 16]; + tmp.copy_from_slice(&self.lo[..16]); + tmp + } + + /// Return the full low-nybble mask. + pub fn lo256(&self) -> [u8; 32] { + self.lo + } + + /// Return the low 128 bits of the high-nybble mask. + pub fn hi128(&self) -> [u8; 16] { + let mut tmp = [0; 16]; + tmp.copy_from_slice(&self.hi[..16]); + tmp + } + + /// Return the full high-nybble mask. 
+    pub fn hi256(&self) -> [u8; 32] {
+        self.hi
+    }
+}
+
+impl fmt::Debug for Mask {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let (mut parts_lo, mut parts_hi) = (vec![], vec![]);
+        for i in 0..32 {
+            parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i]));
+            parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i]));
+        }
+        f.debug_struct("Mask")
+            .field("lo", &parts_lo)
+            .field("hi", &parts_hi)
+            .finish()
+    }
+}
diff --git a/src/packed/teddy/mod.rs b/src/packed/teddy/mod.rs
new file mode 100644
index 0000000..3268cdf
--- /dev/null
+++ b/src/packed/teddy/mod.rs
@@ -0,0 +1,62 @@
+#[cfg(target_arch = "x86_64")]
+pub use crate::packed::teddy::compile::Builder;
+#[cfg(not(target_arch = "x86_64"))]
+pub use crate::packed::teddy::fallback::Builder;
+#[cfg(not(target_arch = "x86_64"))]
+pub use crate::packed::teddy::fallback::Teddy;
+#[cfg(target_arch = "x86_64")]
+pub use crate::packed::teddy::runtime::Teddy;
+
+#[cfg(target_arch = "x86_64")]
+mod compile;
+#[cfg(target_arch = "x86_64")]
+mod runtime;
+
+#[cfg(not(target_arch = "x86_64"))]
+mod fallback {
+    use crate::packed::pattern::Patterns;
+    use crate::Match;
+
+    #[derive(Clone, Debug, Default)]
+    pub struct Builder(());
+
+    impl Builder {
+        pub fn new() -> Builder {
+            Builder(())
+        }
+
+        pub fn build(&self, _: &Patterns) -> Option<Teddy> {
+            None
+        }
+
+        pub fn fat(&mut self, _: Option<bool>) -> &mut Builder {
+            self
+        }
+
+        pub fn avx(&mut self, _: Option<bool>) -> &mut Builder {
+            self
+        }
+    }
+
+    #[derive(Clone, Debug)]
+    pub struct Teddy(());
+
+    impl Teddy {
+        pub fn find_at(
+            &self,
+            _: &Patterns,
+            _: &[u8],
+            _: usize,
+        ) -> Option<Match> {
+            None
+        }
+
+        pub fn minimum_len(&self) -> usize {
+            0
+        }
+
+        pub fn heap_bytes(&self) -> usize {
+            0
+        }
+    }
+}
diff --git a/src/packed/teddy/runtime.rs b/src/packed/teddy/runtime.rs
new file mode 100644
index 0000000..0d96913
--- /dev/null
+++ b/src/packed/teddy/runtime.rs
@@ -0,0 +1,1204 @@
+// See the README in this directory for an explanation of the Teddy algorithm.
+// It is strongly recommended to peruse the README before trying to grok this
+// code, as its use of SIMD is pretty opaque, although I tried to add comments
+// where appropriate.
+//
+// Moreover, while there is a lot of code in this file, most of it is
+// repeated variants of the same thing. Specifically, there are three Teddy
+// variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets)
+// and Fat 256-bit Teddy (16 buckets). For each variant, there are three
+// implementations, corresponding to mask lengths of 1, 2 and 3, bringing the
+// total to nine variants. Each one is structured roughly the same:
+//
+//     while at <= len(haystack) - CHUNK_SIZE:
+//         let candidate = find_candidate_in_chunk(haystack, at)
+//         if not all zeroes(candidate):
+//             if match = verify(haystack, at, candidate):
+//                 return match
+//
+// For the most part, this remains unchanged. The parts that vary are the
+// verification routine (for slim vs fat Teddy) and the candidate extraction
+// (based on the number of masks).
+//
+// In the code below, a "candidate" corresponds to a single vector with 8-bit
+// lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the
+// jth lane if and only if the byte occurring at position `j` is in the
+// bucket `i` (where the `j`th position is the position in the current window
+// of the haystack, which is always 16 or 32 bytes). Note to be careful here:
+// the ith bit and the jth lane correspond to the least significant bits of the
+// vector.
So when visualizing how the current window of bytes is stored in a
+// vector, you often need to flip it around. For example, the text `abcd` in a
+// 4-byte vector would look like this:
+//
+//     01100100 01100011 01100010 01100001
+//         d        c        b        a
+//
+// When the mask length is 1, then finding the candidate is pretty
+// straightforward: you just apply the shuffle indices (from the haystack
+// window) to the masks, and then AND them together, as described in the
+// README. But for masks of length 2 and 3, you need to keep a little state.
+// Specifically, you need to store the final 1 (for mask length 2) or 2 (for
+// mask length 3) bytes of the candidate for use when searching the next
+// window. This is for handling matches that span two windows.
+//
+// With respect to the repeated code, it would likely be possible to reduce
+// the number of copies of code below using polymorphism, but I find this
+// formulation clearer than needing to reason through generics. However,
+// I admit, there may be a simpler generic construction that I'm missing.
+//
+// All variants are fairly heavily tested in src/packed/tests.rs.
+
+use std::arch::x86_64::*;
+use std::mem;
+
+use crate::packed::pattern::{PatternID, Patterns};
+use crate::packed::teddy::compile;
+use crate::packed::vector::*;
+use crate::Match;
+
+/// The Teddy runtime.
+///
+/// A Teddy runtime can be used to quickly search for occurrences of one or
+/// more patterns. While it does not scale to an arbitrary number of patterns
+/// like Aho-Corasick, it does find occurrences for a small set of patterns
+/// much more quickly than Aho-Corasick.
+///
+/// Teddy cannot run on small haystacks below a certain size, which is
+/// dependent on the type of matcher used. This size can be queried via the
+/// `minimum_len` method. Violating this will result in a panic.
+///
+/// Finally, when callers use a Teddy runtime, they must provide precisely the
+/// patterns used to construct the Teddy matcher. Violating this will result
+/// in either a panic or incorrect results, but will never sacrifice memory
+/// safety.
+#[derive(Clone, Debug)]
+pub struct Teddy {
+    /// The allocation of patterns in buckets. This only contains the IDs of
+    /// patterns. In order to do full verification, callers must provide the
+    /// actual patterns when using Teddy.
+    pub buckets: Vec<Vec<PatternID>>,
+    /// The maximum identifier of a pattern. This is used as a sanity check to
+    /// ensure that the patterns provided by the caller are the same as the
+    /// patterns that were used to compile the matcher. This sanity check
+    /// permits safely eliminating bounds checks regardless of what patterns
+    /// are provided by the caller.
+    ///
+    /// Note that users of the aho-corasick crate cannot get this wrong. Only
+    /// code internal to this crate can get it wrong, since neither the
+    /// `Patterns` type nor the Teddy runtime is a public API item.
+    pub max_pattern_id: PatternID,
+    /// The actual runtime to use.
+    pub exec: Exec,
+}
+
+impl Teddy {
+    /// Return the first occurrence of a match in the given haystack after or
+    /// starting at `at`.
+    ///
+    /// The patterns provided must be precisely the same patterns given to the
+    /// Teddy builder, otherwise this may panic or produce incorrect results.
+    ///
+    /// All matches are consistent with the match semantics (leftmost-first or
+    /// leftmost-longest) set on `pats`.
+    pub fn find_at(
+        &self,
+        pats: &Patterns,
+        haystack: &[u8],
+        at: usize,
+    ) -> Option<Match> {
+        // This assert is a bit subtle, but it's an important guarantee.
+        // Namely, if the maximum pattern ID seen by Teddy is the same as the
+        // one in the patterns given, then we are guaranteed that every pattern
+        // ID in all Teddy buckets is a valid index into `pats`. While this
+        // is nominally true, there is no guarantee that callers provide the
+        // same `pats` to both the Teddy builder and the searcher, which would
+        // otherwise make `find_at` unsafe to call. But this assert lets us
+        // keep this routine safe and eliminate an important bounds check in
+        // verification.
+        assert_eq!(
+            self.max_pattern_id,
+            pats.max_pattern_id(),
+            "teddy must be called with same patterns it was built with",
+        );
+        // SAFETY: The haystack must have at least a minimum number of bytes
+        // for Teddy to be able to work. The minimum number varies depending on
+        // which matcher is used below. If this is violated, then it's possible
+        // for searching to do out-of-bounds reads.
+        assert!(haystack[at..].len() >= self.minimum_len());
+        // SAFETY: The various Teddy matchers are always safe to call because
+        // the Teddy builder guarantees that a particular Exec variant is
+        // built only when it can be run on the current CPU. That is, the
+        // Teddy builder will not produce an Exec::TeddySlim1Mask256 unless
+        // AVX2 is enabled. In other words, our dynamic CPU feature detection
+        // is performed once in the builder, and we rely on the type system to
+        // avoid needing to do it again.
+        unsafe {
+            match self.exec {
+                Exec::TeddySlim1Mask128(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddySlim1Mask256(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddyFat1Mask256(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddySlim2Mask128(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddySlim2Mask256(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddyFat2Mask256(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddySlim3Mask128(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddySlim3Mask256(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+                Exec::TeddyFat3Mask256(ref e) => {
+                    e.find_at(pats, self, haystack, at)
+                }
+            }
+        }
+    }
+
+    /// Returns the minimum length of a haystack that must be provided by
+    /// callers to this Teddy searcher. Providing a haystack shorter than this
+    /// will result in a panic, but will never violate memory safety.
+    pub fn minimum_len(&self) -> usize {
+        // SAFETY: These values must be correct in order to ensure safety.
+        // The Teddy runtime assumes their haystacks have at least these
+        // lengths. Violating this will sacrifice memory safety.
+        match self.exec {
+            Exec::TeddySlim1Mask128(_) => 16,
+            Exec::TeddySlim1Mask256(_) => 32,
+            Exec::TeddyFat1Mask256(_) => 16,
+            Exec::TeddySlim2Mask128(_) => 17,
+            Exec::TeddySlim2Mask256(_) => 33,
+            Exec::TeddyFat2Mask256(_) => 17,
+            Exec::TeddySlim3Mask128(_) => 18,
+            Exec::TeddySlim3Mask256(_) => 34,
+            Exec::TeddyFat3Mask256(_) => 34,
+        }
+    }
+
+    /// Returns the approximate total amount of heap used by this searcher, in
+    /// units of bytes.
+    pub fn heap_bytes(&self) -> usize {
+        let num_patterns = self.max_pattern_id as usize + 1;
+        self.buckets.len() * mem::size_of::<Vec<PatternID>>()
+            + num_patterns * mem::size_of::<PatternID>()
+    }
+
+    /// Runs the verification routine for Slim 128-bit Teddy.
+    ///
+    /// The candidate given should be a collection of 8-bit bitsets (one bitset
+    /// per lane), where the ith bit is set in the jth lane if and only if the
+    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
+    ///
+    /// This is not safe to call unless the SSSE3 target feature is enabled.
+    /// The `target_feature` attribute is not applied since this function is
+    /// always forcefully inlined.
+    #[inline(always)]
+    unsafe fn verify128(
+        &self,
+        pats: &Patterns,
+        haystack: &[u8],
+        at: usize,
+        cand: __m128i,
+    ) -> Option<Match> {
+        debug_assert!(!is_all_zeroes128(cand));
+        debug_assert_eq!(8, self.buckets.len());
+
+        // Convert the candidate into 64-bit chunks, and then verify each of
+        // those chunks.
+        let parts = unpack64x128(cand);
+        for (i, &part) in parts.iter().enumerate() {
+            let pos = at + i * 8;
+            if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
+                return Some(m);
+            }
+        }
+        None
+    }
+
+    /// Runs the verification routine for Slim 256-bit Teddy.
+    ///
+    /// The candidate given should be a collection of 8-bit bitsets (one bitset
+    /// per lane), where the ith bit is set in the jth lane if and only if the
+    /// byte occurring at `at + j` in `haystack` is in the bucket `i`.
+    ///
+    /// This is not safe to call unless the AVX2 target feature is enabled.
+    /// The `target_feature` attribute is not applied since this function is
+    /// always forcefully inlined.
+    #[inline(always)]
+    unsafe fn verify256(
+        &self,
+        pats: &Patterns,
+        haystack: &[u8],
+        at: usize,
+        cand: __m256i,
+    ) -> Option<Match> {
+        debug_assert!(!is_all_zeroes256(cand));
+        debug_assert_eq!(8, self.buckets.len());
+
+        // Convert the candidate into 64-bit chunks, and then verify each of
+        // those chunks.
+        let parts = unpack64x256(cand);
+        for (i, &part) in parts.iter().enumerate() {
+            let pos = at + i * 8;
+            if let Some(m) = self.verify64(pats, 8, haystack, pos, part) {
+                return Some(m);
+            }
+        }
+        None
+    }
+
+    /// Runs the verification routine for Fat 256-bit Teddy.
+    ///
+    /// The candidate given should be a collection of 8-bit bitsets (one bitset
+    /// per lane), where the ith bit is set in the jth lane if and only if the
+    /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the
+    /// bucket `j < 16 ? i : i + 8`.
+    ///
+    /// This is not safe to call unless the AVX2 target feature is enabled.
+    /// The `target_feature` attribute is not applied since this function is
+    /// always forcefully inlined.
+    #[inline(always)]
+    unsafe fn verify_fat256(
+        &self,
+        pats: &Patterns,
+        haystack: &[u8],
+        at: usize,
+        cand: __m256i,
+    ) -> Option<Match> {
+        debug_assert!(!is_all_zeroes256(cand));
+        debug_assert_eq!(16, self.buckets.len());
+
+        // This is a bit tricky, but we basically want to convert our
+        // candidate, which looks like this
+        //
+        //     a31 a30 ... a17 a16 a15 a14 ... a01 a00
+        //
+        // where each a(i) is an 8-bit bitset corresponding to the activated
+        // buckets, to this
+        //
+        //     a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00
+        //
+        // Namely, for Fat Teddy, the high 128-bits of the candidate correspond
+        // to the same bytes in the haystack in the low 128-bits (so we only
+        // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7.
+        //
+        // The verification routine wants to look at all potentially matching
+        // buckets before moving on to the next lane. So for example, a16 and
+        // a00 both correspond to the first byte in our window; a00 contains
+        // buckets 0-7 and a16 contains buckets 8-15. Specifically, a16 should
+        // be checked before a01. So the transformation shown above allows us
+        // to use our normal verification procedure with one small change: we
+        // treat each bitset as 16 bits instead of 8 bits.
+
+        // Swap the 128-bit lanes in the candidate vector.
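+        //
+        // (The immediate 0x4E is 0b01_00_11_10, i.e. it selects the source's
+        // 64-bit lanes in the order 2, 3, 0, 1, which is exactly a swap of
+        // the low and high 128-bit halves.)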
+        let swap = _mm256_permute4x64_epi64(cand, 0x4E);
+        // Interleave the bytes from the low 128-bit lanes, starting with
+        // cand first.
+        let r1 = _mm256_unpacklo_epi8(cand, swap);
+        // Interleave the bytes from the high 128-bit lanes, starting with
+        // cand first.
+        let r2 = _mm256_unpackhi_epi8(cand, swap);
+        // Now just take the 2 low 64-bit integers from both r1 and r2. We
+        // can drop the high 64-bit integers because they are a mirror image
+        // of the low 64-bit integers. All we care about are the low 128-bit
+        // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets
+        // laid out in the desired order, as described above.
+        let parts = unpacklo64x256(r1, r2);
+        for (i, &part) in parts.iter().enumerate() {
+            let pos = at + i * 4;
+            if let Some(m) = self.verify64(pats, 16, haystack, pos, part) {
+                return Some(m);
+            }
+        }
+        None
+    }
+
+    /// Verify whether there are any matches starting at or after `at` in the
+    /// given `haystack`. The candidate given should correspond to either 8-bit
+    /// (for 8 buckets) or 16-bit (16 buckets) bitsets.
+    #[inline(always)]
+    fn verify64(
+        &self,
+        pats: &Patterns,
+        bucket_count: usize,
+        haystack: &[u8],
+        at: usize,
+        mut cand: u64,
+    ) -> Option<Match> {
+        // N.B. While the bucket count is known from self.buckets.len(),
+        // requiring it as a parameter makes it easier for the optimizer to
+        // know its value, and thus produce more efficient codegen.
+        debug_assert!(bucket_count == 8 || bucket_count == 16);
+        while cand != 0 {
+            let bit = cand.trailing_zeros() as usize;
+            cand &= !(1 << bit);
+
+            let at = at + (bit / bucket_count);
+            let bucket = bit % bucket_count;
+            if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) {
+                return Some(m);
+            }
+        }
+        None
+    }
+
+    /// Verify whether there are any matches starting at `at` in the given
+    /// `haystack` corresponding only to patterns in the given bucket.
+    #[inline(always)]
+    fn verify_bucket(
+        &self,
+        pats: &Patterns,
+        haystack: &[u8],
+        bucket: usize,
+        at: usize,
+    ) -> Option<Match> {
+        // Forcing this function to not inline and be "cold" seems to help
+        // the codegen for Teddy overall. Interestingly, this is good for a
+        // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among
+        // others). Overall, this seems like a problem with codegen, since
+        // creating the Match itself is a very small amount of code.
+        #[cold]
+        #[inline(never)]
+        fn match_from_span(
+            pati: PatternID,
+            start: usize,
+            end: usize,
+        ) -> Match {
+            Match::from_span(pati as usize, start, end)
+        }
+
+        // N.B. The bounds check for this bucket lookup *should* be elided
+        // since we assert the number of buckets in each `find_at` routine,
+        // and the compiler can prove that the `% 8` (or `% 16`) in callers
+        // of this routine will always be in bounds.
+        for &pati in &self.buckets[bucket] {
+            // SAFETY: This is safe because we are guaranteed that every
+            // index in a Teddy bucket is a valid index into `pats`. This
+            // guarantee is upheld by the assert checking `max_pattern_id` in
+            // the beginning of `find_at` above.
+            //
+            // This explicit bounds check elision is (amazingly) good for a
+            // 25-50% boost in some benchmarks, particularly ones with a lot
+            // of short literals.
+            let pat = unsafe { pats.get_unchecked(pati) };
+            if pat.is_prefix(&haystack[at..]) {
+                return Some(match_from_span(pati, at, at + pat.len()));
+            }
+        }
+        None
+    }
+}
+
+/// Exec represents the different search strategies supported by the Teddy
+/// runtime.
+///
+/// This enum is an important safety abstraction.
Namely, callers should only
+/// construct a variant in this enum if it is safe to execute its corresponding
+/// target features on the current CPU. The 128-bit searchers require SSSE3,
+/// while the 256-bit searchers require AVX2.
+#[derive(Clone, Debug)]
+pub enum Exec {
+    TeddySlim1Mask128(TeddySlim1Mask128),
+    TeddySlim1Mask256(TeddySlim1Mask256),
+    TeddyFat1Mask256(TeddyFat1Mask256),
+    TeddySlim2Mask128(TeddySlim2Mask128),
+    TeddySlim2Mask256(TeddySlim2Mask256),
+    TeddyFat2Mask256(TeddyFat2Mask256),
+    TeddySlim3Mask128(TeddySlim3Mask128),
+    TeddySlim3Mask256(TeddySlim3Mask256),
+    TeddyFat3Mask256(TeddyFat3Mask256),
+}
+
+// Most of the code below remains undocumented because the routines are
+// effectively repeated versions of one another. The general structure is
+// described in the README and in the comments above.
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim1Mask128 {
+    pub mask1: Mask128,
+}
+
+impl TeddySlim1Mask128 {
+    #[target_feature(enable = "ssse3")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(8, teddy.buckets.len());
+
+        let len = haystack.len();
+        while at <= len - 16 {
+            let c = self.candidate(haystack, at);
+            if !is_all_zeroes128(c) {
+                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
+                    return Some(m);
+                }
+            }
+            at += 16;
+        }
+        if at < len {
+            at = len - 16;
+            let c = self.candidate(haystack, at);
+            if !is_all_zeroes128(c) {
+                if let Some(m) = teddy.verify128(pats, haystack, at, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i {
+        debug_assert!(haystack[at..].len() >= 16);
+
+        let chunk = loadu128(haystack, at);
+        members1m128(chunk, self.mask1)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim1Mask256 {
+    pub mask1: Mask256,
+}
+
+impl TeddySlim1Mask256 {
+    #[target_feature(enable = "avx2")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
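+        //
+        // Note on the loop below: the haystack is scanned in 32-byte windows,
+        // and any leftover tail shorter than a full window is handled by
+        // backing up to the last full window (`len - 32`). That may rescan a
+        // few already-checked bytes, but it never reads out of bounds.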
+        assert_eq!(8, teddy.buckets.len());
+
+        let len = haystack.len();
+        while at <= len - 32 {
+            let c = self.candidate(haystack, at);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
+                    return Some(m);
+                }
+            }
+            at += 32;
+        }
+        if at < len {
+            at = len - 32;
+            let c = self.candidate(haystack, at);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify256(pats, haystack, at, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
+        debug_assert!(haystack[at..].len() >= 32);
+
+        let chunk = loadu256(haystack, at);
+        members1m256(chunk, self.mask1)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat1Mask256 {
+    pub mask1: Mask256,
+}
+
+impl TeddyFat1Mask256 {
+    #[target_feature(enable = "avx2")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(16, teddy.buckets.len());
+
+        let len = haystack.len();
+        while at <= len - 16 {
+            let c = self.candidate(haystack, at);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
+                    return Some(m);
+                }
+            }
+            at += 16;
+        }
+        if at < len {
+            at = len - 16;
+            let c = self.candidate(haystack, at);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i {
+        debug_assert!(haystack[at..].len() >= 16);
+
+        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+        members1m256(chunk, self.mask1)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim2Mask128 {
+    pub mask1: Mask128,
+    pub mask2: Mask128,
+}
+
+impl TeddySlim2Mask128 {
+    #[target_feature(enable = "ssse3")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
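+        //
+        // Since this is a two-mask variant, a candidate reported at position
+        // `p` indicates a pattern whose first byte is at `p - 1` (the second
+        // mask matched at `p`). That is why the starting position is bumped
+        // by one just below and why verification is invoked with `at - 1`.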
+        assert_eq!(8, teddy.buckets.len());
+
+        at += 1;
+        let len = haystack.len();
+        let mut prev0 = ones128();
+        while at <= len - 16 {
+            let c = self.candidate(haystack, at, &mut prev0);
+            if !is_all_zeroes128(c) {
+                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
+                    return Some(m);
+                }
+            }
+            at += 16;
+        }
+        if at < len {
+            at = len - 16;
+            prev0 = ones128();
+
+            let c = self.candidate(haystack, at, &mut prev0);
+            if !is_all_zeroes128(c) {
+                if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        prev0: &mut __m128i,
+    ) -> __m128i {
+        debug_assert!(haystack[at..].len() >= 16);
+
+        let chunk = loadu128(haystack, at);
+        let (res0, res1) = members2m128(chunk, self.mask1, self.mask2);
+        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15);
+        let res = _mm_and_si128(res0prev0, res1);
+        *prev0 = res0;
+        res
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim2Mask256 {
+    pub mask1: Mask256,
+    pub mask2: Mask256,
+}
+
+impl TeddySlim2Mask256 {
+    #[target_feature(enable = "avx2")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(8, teddy.buckets.len());
+
+        at += 1;
+        let len = haystack.len();
+        let mut prev0 = ones256();
+        while at <= len - 32 {
+            let c = self.candidate(haystack, at, &mut prev0);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
+                    return Some(m);
+                }
+            }
+            at += 32;
+        }
+        if at < len {
+            at = len - 32;
+            prev0 = ones256();
+
+            let c = self.candidate(haystack, at, &mut prev0);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        prev0: &mut __m256i,
+    ) -> __m256i {
+        debug_assert!(haystack[at..].len() >= 32);
+
+        let chunk = loadu256(haystack, at);
+        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
+        let res0prev0 = alignr256_15(res0, *prev0);
+        let res = _mm256_and_si256(res0prev0, res1);
+        *prev0 = res0;
+        res
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat2Mask256 {
+    pub mask1: Mask256,
+    pub mask2: Mask256,
+}
+
+impl TeddyFat2Mask256 {
+    #[target_feature(enable = "avx2")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(16, teddy.buckets.len());
+
+        at += 1;
+        let len = haystack.len();
+        let mut prev0 = ones256();
+        while at <= len - 16 {
+            let c = self.candidate(haystack, at, &mut prev0);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
+                {
+                    return Some(m);
+                }
+            }
+            at += 16;
+        }
+        if at < len {
+            at = len - 16;
+            prev0 = ones256();
+
+            let c = self.candidate(haystack, at, &mut prev0);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c)
+                {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        prev0: &mut __m256i,
+    ) -> __m256i {
+        debug_assert!(haystack[at..].len() >= 16);
+
+        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+        let (res0, res1) = members2m256(chunk, self.mask1, self.mask2);
+        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15);
+        let res = _mm256_and_si256(res0prev0, res1);
+        *prev0 = res0;
+        res
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim3Mask128 {
+    pub mask1: Mask128,
+    pub mask2: Mask128,
+    pub mask3: Mask128,
+}
+
+impl TeddySlim3Mask128 {
+    #[target_feature(enable = "ssse3")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(8, teddy.buckets.len());
+
+        at += 2;
+        let len = haystack.len();
+        let (mut prev0, mut prev1) = (ones128(), ones128());
+        while at <= len - 16 {
+            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+            if !is_all_zeroes128(c) {
+                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
+                    return Some(m);
+                }
+            }
+            at += 16;
+        }
+        if at < len {
+            at = len - 16;
+            prev0 = ones128();
+            prev1 = ones128();
+
+            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+            if !is_all_zeroes128(c) {
+                if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        prev0: &mut __m128i,
+        prev1: &mut __m128i,
+    ) -> __m128i {
+        debug_assert!(haystack[at..].len() >= 16);
+
+        let chunk = loadu128(haystack, at);
+        let (res0, res1, res2) =
+            members3m128(chunk, self.mask1, self.mask2, self.mask3);
+        let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14);
+        let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15);
+        let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2);
+        *prev0 = res0;
+        *prev1 = res1;
+        res
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddySlim3Mask256 {
+    pub mask1: Mask256,
+    pub mask2: Mask256,
+    pub mask3: Mask256,
+}
+
+impl TeddySlim3Mask256 {
+    #[target_feature(enable = "avx2")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(8, teddy.buckets.len());
+
+        at += 2;
+        let len = haystack.len();
+        let (mut prev0, mut prev1) = (ones256(), ones256());
+        while at <= len - 32 {
+            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
+                    return Some(m);
+                }
+            }
+            at += 32;
+        }
+        if at < len {
+            at = len - 32;
+            prev0 = ones256();
+            prev1 = ones256();
+
+            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        prev0: &mut __m256i,
+        prev1: &mut __m256i,
+    ) -> __m256i {
+        debug_assert!(haystack[at..].len() >= 32);
+
+        let chunk = loadu256(haystack, at);
+        let (res0, res1, res2) =
+            members3m256(chunk, self.mask1, self.mask2, self.mask3);
+        let res0prev0 = alignr256_14(res0, *prev0);
+        let res1prev1 = alignr256_15(res1, *prev1);
+        let res =
+            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
+        *prev0 = res0;
+        *prev1 = res1;
+        res
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct TeddyFat3Mask256 {
+    pub mask1: Mask256,
+    pub mask2: Mask256,
+    pub mask3: Mask256,
+}
+
+impl TeddyFat3Mask256 {
+    #[target_feature(enable = "avx2")]
+    unsafe fn find_at(
+        &self,
+        pats: &Patterns,
+        teddy: &Teddy,
+        haystack: &[u8],
+        mut at: usize,
+    ) -> Option<Match> {
+        debug_assert!(haystack[at..].len() >= teddy.minimum_len());
+        // This assert helps eliminate bounds checks for bucket lookups in
+        // Teddy::verify_bucket, which has a small (3-4%) performance boost.
+        assert_eq!(16, teddy.buckets.len());
+
+        at += 2;
+        let len = haystack.len();
+        let (mut prev0, mut prev1) = (ones256(), ones256());
+        while at <= len - 16 {
+            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
+                {
+                    return Some(m);
+                }
+            }
+            at += 16;
+        }
+        if at < len {
+            at = len - 16;
+            prev0 = ones256();
+            prev1 = ones256();
+
+            let c = self.candidate(haystack, at, &mut prev0, &mut prev1);
+            if !is_all_zeroes256(c) {
+                if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c)
+                {
+                    return Some(m);
+                }
+            }
+        }
+        None
+    }
+
+    #[inline(always)]
+    unsafe fn candidate(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        prev0: &mut __m256i,
+        prev1: &mut __m256i,
+    ) -> __m256i {
+        debug_assert!(haystack[at..].len() >= 16);
+
+        let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at));
+        let (res0, res1, res2) =
+            members3m256(chunk, self.mask1, self.mask2, self.mask3);
+        let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14);
+        let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15);
+        let res =
+            _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2);
+        *prev0 = res0;
+        *prev1 = res1;
+        res
+    }
+}
+
+/// A 128-bit mask for the low and high nybbles in a set of patterns. Each
+/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if
+/// the nybble `j` is in the bucket `i` at a particular position.
+#[derive(Clone, Copy, Debug)]
+pub struct Mask128 {
+    lo: __m128i,
+    hi: __m128i,
+}
+
+impl Mask128 {
+    /// Create a new SIMD mask from the mask produced by the Teddy builder.
+    pub fn new(mask: compile::Mask) -> Mask128 {
+        // SAFETY: This is safe since [u8; 16] has the same representation
+        // as __m128i.
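+        //
+        // (Both types are 16 bytes wide and any bit pattern is valid for
+        // both, so the transmute is just a byte-for-byte copy of the compiled
+        // mask into a SIMD-register-sized value.)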
+ unsafe { + Mask128 { + lo: mem::transmute(mask.lo128()), + hi: mem::transmute(mask.hi128()), + } + } + } +} + +/// A 256-bit mask for the low and high nybbles in a set of patterns. Each +/// lane `j` corresponds to a bitset where the `i`th bit is set if and only if +/// the nybble `j` is in the bucket `i` at a particular position. +/// +/// This is slightly tweaked dependending on whether Slim or Fat Teddy is being +/// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as +/// the bitsets in the higher 128-bits, so that we can search 32 bytes at a +/// time. (Remember, the nybbles in the haystack are used as indices into these +/// masks, and 256-bit shuffles only operate on 128-bit lanes.) +/// +/// For Fat Teddy, the bitsets are not repeated, but instead, the high 128 +/// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets +/// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set +/// if it's in the higher 128 bits. +#[derive(Clone, Copy, Debug)] +pub struct Mask256 { + lo: __m256i, + hi: __m256i, +} + +impl Mask256 { + /// Create a new SIMD mask from the mask produced by the Teddy builder. + pub fn new(mask: compile::Mask) -> Mask256 { + // SAFETY: This is safe since [u8; 32] has the same representation + // as __m256i. + unsafe { + Mask256 { + lo: mem::transmute(mask.lo256()), + hi: mem::transmute(mask.hi256()), + } + } + } +} + +// The "members" routines below are responsible for taking a chunk of bytes, +// a number of nybble masks and returning the result of using the masks to +// lookup bytes in the chunk. The results of the high and low nybble masks are +// AND'ed together, such that each candidate returned is a vector, with byte +// sized lanes, and where each lane is an 8-bit bitset corresponding to the +// buckets that contain the corresponding byte. +// +// In the case of masks of length greater than 1, callers will need to keep +// the results from the previous haystack's window, and then shift the vectors +// so that they all line up. Then they can be AND'ed together. + +/// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a +/// 16-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and `mask1` corresponds to a +/// low/high mask for the first byte of all patterns that are being searched. +#[target_feature(enable = "ssse3")] +unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { + let lomask = _mm_set1_epi8(0xF); + let hlo = _mm_and_si128(chunk, lomask); + let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); + _mm_and_si128( + _mm_shuffle_epi8(mask1.lo, hlo), + _mm_shuffle_epi8(mask1.hi, hhi), + ) +} + +/// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a +/// 32-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and `mask1` corresponds to a +/// low/high mask for the first byte of all patterns that are being searched. +/// +/// Note that this can also be used for Fat Teddy, where the high 128 bits in +/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte +/// window in the haystack. 
+#[target_feature(enable = "avx2")] +unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { + let lomask = _mm256_set1_epi8(0xF); + let hlo = _mm256_and_si256(chunk, lomask); + let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); + _mm256_and_si256( + _mm256_shuffle_epi8(mask1.lo, hlo), + _mm256_shuffle_epi8(mask1.hi, hhi), + ) +} + +/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds +/// to a 16-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first and second bytes of all patterns that are being +/// searched. The vectors returned correspond to candidates for the first and +/// second bytes in the patterns represented by the masks. +#[target_feature(enable = "ssse3")] +unsafe fn members2m128( + chunk: __m128i, + mask1: Mask128, + mask2: Mask128, +) -> (__m128i, __m128i) { + let lomask = _mm_set1_epi8(0xF); + let hlo = _mm_and_si128(chunk, lomask); + let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); + let res0 = _mm_and_si128( + _mm_shuffle_epi8(mask1.lo, hlo), + _mm_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm_and_si128( + _mm_shuffle_epi8(mask2.lo, hlo), + _mm_shuffle_epi8(mask2.hi, hhi), + ); + (res0, res1) +} + +/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds +/// to a 32-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first and second bytes of all patterns that are being +/// searched. The vectors returned correspond to candidates for the first and +/// second bytes in the patterns represented by the masks. +/// +/// Note that this can also be used for Fat Teddy, where the high 128 bits in +/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte +/// window in the haystack. +#[target_feature(enable = "avx2")] +unsafe fn members2m256( + chunk: __m256i, + mask1: Mask256, + mask2: Mask256, +) -> (__m256i, __m256i) { + let lomask = _mm256_set1_epi8(0xF); + let hlo = _mm256_and_si256(chunk, lomask); + let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); + let res0 = _mm256_and_si256( + _mm256_shuffle_epi8(mask1.lo, hlo), + _mm256_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm256_and_si256( + _mm256_shuffle_epi8(mask2.lo, hlo), + _mm256_shuffle_epi8(mask2.hi, hhi), + ); + (res0, res1) +} + +/// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds +/// to a 16-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first, second and third bytes of all patterns that +/// are being searched. The vectors returned correspond to candidates for the +/// first, second and third bytes in the patterns represented by the masks. 
+#[target_feature(enable = "ssse3")] +unsafe fn members3m128( + chunk: __m128i, + mask1: Mask128, + mask2: Mask128, + mask3: Mask128, +) -> (__m128i, __m128i, __m128i) { + let lomask = _mm_set1_epi8(0xF); + let hlo = _mm_and_si128(chunk, lomask); + let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); + let res0 = _mm_and_si128( + _mm_shuffle_epi8(mask1.lo, hlo), + _mm_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm_and_si128( + _mm_shuffle_epi8(mask2.lo, hlo), + _mm_shuffle_epi8(mask2.hi, hhi), + ); + let res2 = _mm_and_si128( + _mm_shuffle_epi8(mask3.lo, hlo), + _mm_shuffle_epi8(mask3.hi, hhi), + ); + (res0, res1, res2) +} + +/// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds +/// to a 32-byte window of the haystack (where the least significant byte +/// corresponds to the start of the window), and the masks correspond to a +/// low/high mask for the first, second and third bytes of all patterns that +/// are being searched. The vectors returned correspond to candidates for the +/// first, second and third bytes in the patterns represented by the masks. +/// +/// Note that this can also be used for Fat Teddy, where the high 128 bits in +/// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte +/// window in the haystack. +#[target_feature(enable = "avx2")] +unsafe fn members3m256( + chunk: __m256i, + mask1: Mask256, + mask2: Mask256, + mask3: Mask256, +) -> (__m256i, __m256i, __m256i) { + let lomask = _mm256_set1_epi8(0xF); + let hlo = _mm256_and_si256(chunk, lomask); + let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); + let res0 = _mm256_and_si256( + _mm256_shuffle_epi8(mask1.lo, hlo), + _mm256_shuffle_epi8(mask1.hi, hhi), + ); + let res1 = _mm256_and_si256( + _mm256_shuffle_epi8(mask2.lo, hlo), + _mm256_shuffle_epi8(mask2.hi, hhi), + ); + let res2 = _mm256_and_si256( + _mm256_shuffle_epi8(mask3.lo, hlo), + _mm256_shuffle_epi8(mask3.hi, hhi), + ); + (res0, res1, res2) +} diff --git a/src/packed/tests.rs b/src/packed/tests.rs new file mode 100644 index 0000000..91410cb --- /dev/null +++ b/src/packed/tests.rs @@ -0,0 +1,568 @@ +use std::collections::HashMap; +use std::usize; + +use crate::packed::{Config, MatchKind}; +use crate::Match; + +/// A description of a single test against a multi-pattern searcher. +/// +/// A single test may not necessarily pass on every configuration of a +/// searcher. The tests are categorized and grouped appropriately below. +#[derive(Clone, Debug, Eq, PartialEq)] +struct SearchTest { + /// The name of this test, for debugging. + name: &'static str, + /// The patterns to search for. + patterns: &'static [&'static str], + /// The text to search. + haystack: &'static str, + /// Each match is a triple of (pattern_index, start, end), where + /// pattern_index is an index into `patterns` and `start`/`end` are indices + /// into `haystack`. 
matches: &'static [(usize, usize, usize)],
+}
+
+struct SearchTestOwned {
+    offset: usize,
+    name: String,
+    patterns: Vec<String>,
+    haystack: String,
+    matches: Vec<(usize, usize, usize)>,
+}
+
+impl SearchTest {
+    fn variations(&self) -> Vec<SearchTestOwned> {
+        let mut tests = vec![];
+        for i in 0..=260 {
+            tests.push(self.offset_prefix(i));
+            tests.push(self.offset_suffix(i));
+            tests.push(self.offset_both(i));
+        }
+        tests
+    }
+
+    fn offset_both(&self, off: usize) -> SearchTestOwned {
+        SearchTestOwned {
+            offset: off,
+            name: self.name.to_string(),
+            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+            haystack: format!(
+                "{}{}{}",
+                "Z".repeat(off),
+                self.haystack,
+                "Z".repeat(off)
+            ),
+            matches: self
+                .matches
+                .iter()
+                .map(|&(id, s, e)| (id, s + off, e + off))
+                .collect(),
+        }
+    }
+
+    fn offset_prefix(&self, off: usize) -> SearchTestOwned {
+        SearchTestOwned {
+            offset: off,
+            name: self.name.to_string(),
+            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+            haystack: format!("{}{}", "Z".repeat(off), self.haystack),
+            matches: self
+                .matches
+                .iter()
+                .map(|&(id, s, e)| (id, s + off, e + off))
+                .collect(),
+        }
+    }
+
+    fn offset_suffix(&self, off: usize) -> SearchTestOwned {
+        SearchTestOwned {
+            offset: off,
+            name: self.name.to_string(),
+            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+            haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
+            matches: self.matches.to_vec(),
+        }
+    }
+
+    // fn to_owned(&self) -> SearchTestOwned {
+    //     SearchTestOwned {
+    //         name: self.name.to_string(),
+    //         patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
+    //         haystack: self.haystack.to_string(),
+    //         matches: self.matches.iter().cloned().collect(),
+    //     }
+    // }
+}
+
+/// Short-hand constructor for SearchTest. We use it a lot below.
+macro_rules! t {
+    ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
+        SearchTest {
+            name: stringify!($name),
+            patterns: $patterns,
+            haystack: $haystack,
+            matches: $matches,
+        }
+    };
+}
+
+/// A collection of test groups.
+type TestCollection = &'static [&'static [SearchTest]];
+
+// Define several collections corresponding to the different types of match
+// semantics supported. These collections have some overlap, but each
+// collection should have some tests that no other collection has.
+
+/// Tests for leftmost-first match semantics.
+const PACKED_LEFTMOST_FIRST: TestCollection =
+    &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];
+
+/// Tests for leftmost-longest match semantics.
+const PACKED_LEFTMOST_LONGEST: TestCollection =
+    &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];
+
+// Now define the individual tests that make up the collections above.
+
+/// A collection of tests that should always be true regardless of match
+/// semantics. That is, all combinations of leftmost-{first, longest} should
+/// produce the same answer.
+const BASICS: &'static [SearchTest] = &[ + t!(basic001, &["a"], "", &[]), + t!(basic010, &["a"], "a", &[(0, 0, 1)]), + t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), + t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), + t!(basic050, &["a"], "bba", &[(0, 2, 3)]), + t!(basic060, &["a"], "bbb", &[]), + t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), + t!(basic100, &["aa"], "", &[]), + t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), + t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), + t!(basic130, &["aa"], "abbab", &[]), + t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), + t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]), + t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), + t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), + t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), + t!(basic300, &["a", "b"], "", &[]), + t!(basic310, &["a", "b"], "z", &[]), + t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), + t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), + t!( + basic340, + &["a", "b"], + "abba", + &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] + ), + t!( + basic350, + &["b", "a"], + "abba", + &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] + ), + t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), + t!(basic400, &["foo", "bar"], "", &[]), + t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), + t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), + t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), + t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), + t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), + t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), + t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), + t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), + t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), + t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), + t!( + basic720, + &["yabcdef", "bcdeyabc", "abcdezghi"], + "yabcdezghi", + &[(2, 1, 10),] + ), + t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), + t!( + basic840, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] + ), + t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), +]; + +/// Tests for leftmost match semantics. These should pass for both +/// leftmost-first and leftmost-longest match kinds. Stated differently, among +/// ambiguous matches, the longest match and the match that appeared first when +/// constructing the automaton should always be the same. 
+const LEFTMOST: &'static [SearchTest] = &[ + t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), + t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), + t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), + t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + leftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8), (0, 8, 9),] + ), +]; + +/// Tests for non-overlapping leftmost-first match semantics. These tests +/// should generally be specific to leftmost-first, which means they should +/// generally fail under leftmost-longest semantics. +const LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), + t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), + t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), + t!( + leftfirst310, + &["abcd", "b", "bce", "ce"], + "abce", + &[(1, 1, 2), (3, 2, 4),] + ), + t!( + leftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1), (2, 7, 9),] + ), + t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), + t!( + leftfirst340, + &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"], + "abcdef", + &[(0, 0, 6)] + ), +]; + +/// Tests for non-overlapping leftmost-longest match semantics. These tests +/// should generally be specific to leftmost-longest, which means they should +/// generally fail under leftmost-first semantics. 
+const LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), + t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), + t!( + leftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), + t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), +]; + +/// Regression tests that are applied to all combinations. +/// +/// If regression tests are needed for specific match semantics, then add them +/// to the appropriate group above. +const REGRESSION: &'static [SearchTest] = &[ + t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), + t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), + t!( + regression030, + &["libcore/", "libstd/"], + "libcore/char/methods.rs", + &[(0, 0, 8),] + ), + t!( + regression040, + &["libstd/", "libcore/"], + "libcore/char/methods.rs", + &[(1, 0, 8),] + ), + t!( + regression050, + &["\x00\x00\x01", "\x00\x00\x00"], + "\x00\x00\x00", + &[(1, 0, 3),] + ), + t!( + regression060, + &["\x00\x00\x00", "\x00\x00\x01"], + "\x00\x00\x00", + &[(0, 0, 3),] + ), +]; + +const TEDDY: &'static [SearchTest] = &[ + t!( + teddy010, + &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + "abcdefghijk", + &[ + (0, 0, 1), + (1, 1, 2), + (2, 2, 3), + (3, 3, 4), + (4, 4, 5), + (5, 5, 6), + (6, 6, 7), + (7, 7, 8), + (8, 8, 9), + (9, 9, 10), + (10, 10, 11) + ] + ), + t!( + teddy020, + &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"], + "abcdefghijk", + &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),] + ), + t!( + teddy030, + &["abc"], + "abcdefghijklmnopqrstuvwxyzabcdefghijk", + &[(0, 0, 3), (0, 26, 29)] + ), +]; + +// Now define a test for each combination of things above that we want to run. +// Since there are a few different combinations for each collection of tests, +// we define a couple of macros to avoid repetition drudgery. The testconfig +// macro constructs the automaton from a given match kind, and runs the search +// tests one-by-one over the given collection. The `with` parameter allows one +// to configure the config with additional parameters. The testcombo macro +// invokes testconfig in precisely this way: it sets up several tests where +// each one turns a different knob on Config. + +macro_rules! 
testconfig { + ($name:ident, $collection:expr, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut config = Config::new(); + $with(&mut config); + config + .builder() + .extend(test.patterns.iter().map(|p| p.as_bytes())) + .build() + .unwrap() + .find_iter(&test.haystack) + .collect() + }); + } + }; +} + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_default_leftmost_first, + PACKED_LEFTMOST_FIRST, + |_: &mut Config| {} +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_default_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.match_kind(MatchKind::LeftmostLongest); + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_ssse3_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + if is_x86_feature_detected!("ssse3") { + c.force_avx(Some(false)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_ssse3_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + if is_x86_feature_detected!("ssse3") { + c.force_avx(Some(false)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_avx2_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + if is_x86_feature_detected!("avx2") { + c.force_avx(Some(true)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_avx2_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + if is_x86_feature_detected!("avx2") { + c.force_avx(Some(true)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_fat_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_teddy(true); + if is_x86_feature_detected!("avx2") { + c.force_teddy_fat(Some(true)); + } + } +); + +#[cfg(target_arch = "x86_64")] +testconfig!( + search_teddy_fat_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); + if is_x86_feature_detected!("avx2") { + c.force_teddy_fat(Some(true)); + } + } +); + +testconfig!( + search_rabinkarp_leftmost_first, + PACKED_LEFTMOST_FIRST, + |c: &mut Config| { + c.force_rabin_karp(true); + } +); + +testconfig!( + search_rabinkarp_leftmost_longest, + PACKED_LEFTMOST_LONGEST, + |c: &mut Config| { + c.force_rabin_karp(true).match_kind(MatchKind::LeftmostLongest); + } +); + +#[test] +fn search_tests_have_unique_names() { + let assert = |constname, tests: &[SearchTest]| { + let mut seen = HashMap::new(); // map from test name to position + for (i, test) in tests.iter().enumerate() { + if !seen.contains_key(test.name) { + seen.insert(test.name, i); + } else { + let last = seen[test.name]; + panic!( + "{} tests have duplicate names at positions {} and {}", + constname, last, i + ); + } + } + }; + assert("BASICS", BASICS); + assert("LEFTMOST", LEFTMOST); + assert("LEFTMOST_FIRST", LEFTMOST_FIRST); + assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); + assert("REGRESSION", REGRESSION); + assert("TEDDY", TEDDY); +} + +fn run_search_tests Vec>( + which: 
TestCollection, + mut f: F, +) { + let get_match_triples = + |matches: Vec| -> Vec<(usize, usize, usize)> { + matches + .into_iter() + .map(|m| (m.pattern(), m.start(), m.end())) + .collect() + }; + for &tests in which { + for spec in tests { + for test in spec.variations() { + assert_eq!( + test.matches, + get_match_triples(f(&test)).as_slice(), + "test: {}, patterns: {:?}, haystack: {:?}, offset: {:?}", + test.name, + test.patterns, + test.haystack, + test.offset, + ); + } + } + } +} diff --git a/src/packed/vector.rs b/src/packed/vector.rs new file mode 100644 index 0000000..ca6c2b0 --- /dev/null +++ b/src/packed/vector.rs @@ -0,0 +1,181 @@ +// This file contains a set of fairly generic utility functions when working +// with SIMD vectors. +// +// SAFETY: All of the routines below are unsafe to call because they assume +// the necessary CPU target features in order to use particular vendor +// intrinsics. Calling these routines when the underlying CPU does not support +// the appropriate target features is NOT safe. Callers must ensure this +// themselves. +// +// Note that it may not look like this safety invariant is being upheld when +// these routines are called. Namely, the CPU feature check is typically pretty +// far away from when these routines are used. Instead, we rely on the fact +// that certain types serve as a guaranteed receipt that pertinent target +// features are enabled. For example, the only way TeddySlim3Mask256 can be +// constructed is if the AVX2 CPU feature is available. Thus, any code running +// inside of TeddySlim3Mask256 can use any of the functions below without any +// additional checks: its very existence *is* the check. + +use std::arch::x86_64::*; + +/// Shift `a` to the left by two bytes (removing its two most significant +/// bytes), and concatenate it with the the two most significant bytes of `b`. +#[target_feature(enable = "avx2")] +pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i { + // Credit goes to jneem for figuring this out: + // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 + // + // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR + // instructions, which is not what we want, so we need to do some extra + // shuffling. + + // This permute gives us the low 16 bytes of a concatenated with the high + // 16 bytes of b, in order of most significant to least significant. So + // `v = a[15:0] b[31:16]`. + let v = _mm256_permute2x128_si256(b, a, 0x21); + // This effectively does this (where we deal in terms of byte-indexing + // and byte-shifting, and use inclusive ranges): + // + // ret[15:0] := ((a[15:0] << 16) | v[15:0]) >> 14 + // = ((a[15:0] << 16) | b[31:16]) >> 14 + // ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14 + // = ((a[31:16] << 16) | a[15:0]) >> 14 + // + // Which therefore results in: + // + // ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30] + // + // The end result is that we've effectively done this: + // + // (a << 2) | (b >> 30) + // + // When `A` and `B` are strings---where the beginning of the string is in + // the least significant bits---we effectively result in the following + // semantic operation: + // + // (A >> 2) | (B << 30) + // + // The reversal being attributed to the fact that we are in little-endian. + _mm256_alignr_epi8(a, v, 14) +} + +/// Shift `a` to the left by one byte (removing its most significant byte), and +/// concatenate it with the the most significant byte of `b`. 
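+// Byte-level sketch (illustration only, with index 0 in the least significant
+// lane): if
+//
+//     a = a0 a1 a2 ... a31    (current 32-byte chunk)
+//     b = b0 b1 b2 ... b31    (previous 32-byte chunk)
+//
+// then this routine produces
+//
+//     b31 a0 a1 ... a30
+//
+// i.e. the last byte of `b` followed by the first 31 bytes of `a`, which is
+// how a chunk gets stitched together with the tail of the chunk before it.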
+#[target_feature(enable = "avx2")] +pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i { + // For explanation, see alignr256_14. + let v = _mm256_permute2x128_si256(b, a, 0x21); + _mm256_alignr_epi8(a, v, 15) +} + +/// Unpack the given 128-bit vector into its 64-bit components. The first +/// element of the array returned corresponds to the least significant 64-bit +/// lane in `a`. +#[target_feature(enable = "ssse3")] +pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] { + [ + _mm_cvtsi128_si64(a) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64, + ] +} + +/// Unpack the given 256-bit vector into its 64-bit components. The first +/// element of the array returned corresponds to the least significant 64-bit +/// lane in `a`. +#[target_feature(enable = "avx2")] +pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] { + // Using transmute here is precisely equivalent, but actually slower. It's + // not quite clear why. + let lo = _mm256_extracti128_si256(a, 0); + let hi = _mm256_extracti128_si256(a, 1); + [ + _mm_cvtsi128_si64(lo) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, + _mm_cvtsi128_si64(hi) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, + ] +} + +/// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit +/// integers. +/// +/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element +/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits, +/// then the return value is `b2 b1 a2 a1`. +#[target_feature(enable = "avx2")] +pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] { + let lo = _mm256_castsi256_si128(a); + let hi = _mm256_castsi256_si128(b); + [ + _mm_cvtsi128_si64(lo) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, + _mm_cvtsi128_si64(hi) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, + ] +} + +/// Returns true if and only if all bits in the given 128-bit vector are 0. +#[target_feature(enable = "ssse3")] +pub unsafe fn is_all_zeroes128(a: __m128i) -> bool { + let cmp = _mm_cmpeq_epi8(a, zeroes128()); + _mm_movemask_epi8(cmp) as u32 == 0xFFFF +} + +/// Returns true if and only if all bits in the given 256-bit vector are 0. +#[target_feature(enable = "avx2")] +pub unsafe fn is_all_zeroes256(a: __m256i) -> bool { + let cmp = _mm256_cmpeq_epi8(a, zeroes256()); + _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF +} + +/// Load a 128-bit vector from slice at the given position. The slice does +/// not need to be unaligned. +/// +/// Since this code assumes little-endian (there is no big-endian x86), the +/// bytes starting in `slice[at..]` will be at the least significant bits of +/// the returned vector. This is important for the surrounding code, since for +/// example, shifting the resulting vector right is equivalent to logically +/// shifting the bytes in `slice` left. +#[target_feature(enable = "sse2")] +pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i { + let ptr = slice.get_unchecked(at..).as_ptr(); + _mm_loadu_si128(ptr as *const u8 as *const __m128i) +} + +/// Load a 256-bit vector from slice at the given position. The slice does +/// not need to be unaligned. +/// +/// Since this code assumes little-endian (there is no big-endian x86), the +/// bytes starting in `slice[at..]` will be at the least significant bits of +/// the returned vector. This is important for the surrounding code, since for +/// example, shifting the resulting vector right is equivalent to logically +/// shifting the bytes in `slice` left. 
+#[target_feature(enable = "avx2")] +pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i { + let ptr = slice.get_unchecked(at..).as_ptr(); + _mm256_loadu_si256(ptr as *const u8 as *const __m256i) +} + +/// Returns a 128-bit vector with all bits set to 0. +#[target_feature(enable = "sse2")] +pub unsafe fn zeroes128() -> __m128i { + _mm_set1_epi8(0) +} + +/// Returns a 256-bit vector with all bits set to 0. +#[target_feature(enable = "avx2")] +pub unsafe fn zeroes256() -> __m256i { + _mm256_set1_epi8(0) +} + +/// Returns a 128-bit vector with all bits set to 1. +#[target_feature(enable = "sse2")] +pub unsafe fn ones128() -> __m128i { + _mm_set1_epi8(0xFF as u8 as i8) +} + +/// Returns a 256-bit vector with all bits set to 1. +#[target_feature(enable = "avx2")] +pub unsafe fn ones256() -> __m256i { + _mm256_set1_epi8(0xFF as u8 as i8) +} diff --git a/src/prefilter.rs b/src/prefilter.rs new file mode 100644 index 0000000..ef81411 --- /dev/null +++ b/src/prefilter.rs @@ -0,0 +1,1057 @@ +use std::cmp; +use std::fmt; +use std::panic::{RefUnwindSafe, UnwindSafe}; +use std::u8; + +use memchr::{memchr, memchr2, memchr3}; + +use crate::ahocorasick::MatchKind; +use crate::packed; +use crate::Match; + +/// A candidate is the result of running a prefilter on a haystack at a +/// particular position. The result is either no match, a confirmed match or +/// a possible match. +/// +/// When no match is returned, the prefilter is guaranteeing that no possible +/// match can be found in the haystack, and the caller may trust this. That is, +/// all correct prefilters must never report false negatives. +/// +/// In some cases, a prefilter can confirm a match very quickly, in which case, +/// the caller may use this to stop what it's doing and report the match. In +/// this case, prefilter implementations must never report a false positive. +/// In other cases, the prefilter can only report a potential match, in which +/// case the callers must attempt to confirm the match. In this case, prefilter +/// implementations are permitted to return false positives. +#[derive(Clone, Debug)] +pub enum Candidate { + None, + Match(Match), + PossibleStartOfMatch(usize), +} + +impl Candidate { + /// Convert this candidate into an option. This is useful when callers + /// do not distinguish between true positives and false positives (i.e., + /// the caller must always confirm the match in order to update some other + /// state). + pub fn into_option(self) -> Option { + match self { + Candidate::None => None, + Candidate::Match(ref m) => Some(m.start()), + Candidate::PossibleStartOfMatch(start) => Some(start), + } + } +} + +/// A prefilter describes the behavior of fast literal scanners for quickly +/// skipping past bytes in the haystack that we know cannot possibly +/// participate in a match. +pub trait Prefilter: + Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug +{ + /// Returns the next possible match candidate. This may yield false + /// positives, so callers must confirm a match starting at the position + /// returned. This, however, must never produce false negatives. That is, + /// this must, at minimum, return the starting position of the next match + /// in the given haystack after or at the given position. + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate; + + /// A method for cloning a prefilter, to work-around the fact that Clone + /// is not object-safe. 
+    fn clone_prefilter(&self) -> Box<dyn Prefilter>;
+
+    /// Returns the approximate total amount of heap used by this prefilter, in
+    /// units of bytes.
+    fn heap_bytes(&self) -> usize;
+
+    /// Returns true if and only if this prefilter may return false
+    /// positives. This is useful for completely avoiding the automaton
+    /// when the prefilter can quickly confirm its own matches.
+    ///
+    /// By default, this returns true, which is conservative; it is always
+    /// correct to return `true`. Returning `false` here and reporting a false
+    /// positive will result in incorrect searches.
+    fn reports_false_positives(&self) -> bool {
+        true
+    }
+
+    /// Returns true if and only if this prefilter may look for a non-starting
+    /// position of a match.
+    ///
+    /// This is useful in a streaming context where prefilters that don't look
+    /// for a starting position of a match can be quite difficult to deal with.
+    ///
+    /// This returns false by default.
+    fn looks_for_non_start_of_match(&self) -> bool {
+        false
+    }
+}
+
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+    #[inline]
+    fn next_candidate(
+        &self,
+        state: &mut PrefilterState,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate {
+        (**self).next_candidate(state, haystack, at)
+    }
+
+    fn clone_prefilter(&self) -> Box<dyn Prefilter> {
+        (**self).clone_prefilter()
+    }
+
+    fn heap_bytes(&self) -> usize {
+        (**self).heap_bytes()
+    }
+
+    fn reports_false_positives(&self) -> bool {
+        (**self).reports_false_positives()
+    }
+}
+
+/// A convenience object for representing any type that implements Prefilter
+/// and is cloneable.
+#[derive(Debug)]
+pub struct PrefilterObj(Box<dyn Prefilter>);
+
+impl Clone for PrefilterObj {
+    fn clone(&self) -> Self {
+        PrefilterObj(self.0.clone_prefilter())
+    }
+}
+
+impl PrefilterObj {
+    /// Create a new prefilter object.
+    pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
+        PrefilterObj(Box::new(t))
+    }
+
+    /// Return the underlying prefilter trait object.
+    pub fn as_ref(&self) -> &dyn Prefilter {
+        &*self.0
+    }
+}
+
+/// PrefilterState tracks state associated with the effectiveness of a
+/// prefilter. It is used to track how many bytes, on average, are skipped by
+/// the prefilter. If this average dips below a certain threshold over time,
+/// then the state renders the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct PrefilterState {
+    /// The number of skips that have been executed.
+    skips: usize,
+    /// The total number of bytes that have been skipped.
+    skipped: usize,
+    /// The maximum length of a match. This is used to help determine how many
+    /// bytes on average should be skipped in order for a prefilter to be
+    /// effective.
+    max_match_len: usize,
+    /// Once this heuristic has been deemed permanently ineffective, it will be
+    /// inert throughout the rest of its lifetime. This serves as a cheap way
+    /// to check inertness.
+    inert: bool,
+    /// The last (absolute) position at which a prefilter scanned to.
+    /// Prefilters can use this position to determine whether to re-scan or
+    /// not.
+    ///
+    /// Unlike other things that impact effectiveness, this is a fleeting
+    /// condition. That is, a prefilter can be considered ineffective if it is
+    /// at a position before `last_scan_at`, but can become effective again
+    /// once the search moves past `last_scan_at`.
+ /// + /// The utility of this is to both avoid additional overhead from calling + /// the prefilter and to avoid quadratic behavior. This ensures that a + /// prefilter will scan any particular byte at most once. (Note that some + /// prefilters, like the start-byte prefilter, do not need to use this + /// field at all, since it only looks for starting bytes.) + last_scan_at: usize, +} + +impl PrefilterState { + /// The minimum number of skip attempts to try before considering whether + /// a prefilter is effective or not. + const MIN_SKIPS: usize = 40; + + /// The minimum amount of bytes that skipping must average, expressed as a + /// factor of the multiple of the length of a possible match. + /// + /// That is, after MIN_SKIPS have occurred, if the average number of bytes + /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the + /// prefilter outed to be rendered inert. + const MIN_AVG_FACTOR: usize = 2; + + /// Create a fresh prefilter state. + pub fn new(max_match_len: usize) -> PrefilterState { + PrefilterState { + skips: 0, + skipped: 0, + max_match_len, + inert: false, + last_scan_at: 0, + } + } + + /// Create a prefilter state that always disables the prefilter. + pub fn disabled() -> PrefilterState { + PrefilterState { + skips: 0, + skipped: 0, + max_match_len: 0, + inert: true, + last_scan_at: 0, + } + } + + /// Update this state with the number of bytes skipped on the last + /// invocation of the prefilter. + #[inline] + fn update_skipped_bytes(&mut self, skipped: usize) { + self.skips += 1; + self.skipped += skipped; + } + + /// Updates the position at which the last scan stopped. This may be + /// greater than the position of the last candidate reported. For example, + /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz` + /// will report a candidate at position `0`, but the end of its last scan + /// will be at position `3`. + /// + /// This position factors into the effectiveness of this prefilter. If the + /// current position is less than the last position at which a scan ended, + /// then the prefilter should not be re-run until the search moves past + /// that position. + #[inline] + fn update_at(&mut self, at: usize) { + if at > self.last_scan_at { + self.last_scan_at = at; + } + } + + /// Return true if and only if this state indicates that a prefilter is + /// still effective. + /// + /// The given pos should correspond to the current starting position of the + /// search. + #[inline] + pub fn is_effective(&mut self, at: usize) -> bool { + if self.inert { + return false; + } + if at < self.last_scan_at { + return false; + } + if self.skips < PrefilterState::MIN_SKIPS { + return true; + } + + let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len; + if self.skipped >= min_avg * self.skips { + return true; + } + + // We're inert. + self.inert = true; + false + } +} + +/// A builder for constructing the best possible prefilter. When constructed, +/// this builder will heuristically select the best prefilter it can build, +/// if any, and discard the rest. +#[derive(Debug)] +pub struct Builder { + count: usize, + ascii_case_insensitive: bool, + start_bytes: StartBytesBuilder, + rare_bytes: RareBytesBuilder, + packed: Option, +} + +impl Builder { + /// Create a new builder for constructing the best possible prefilter. 
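+    // Illustrative usage (a sketch along the lines of the `scratch` test at
+    // the bottom of this file, not additional API):
+    //
+    //     let mut b = Builder::new(MatchKind::LeftmostFirst);
+    //     b.add(b"Sherlock");
+    //     b.add(b"Holmes");
+    //     // `build` returns `None` if no heuristic considered the added
+    //     // patterns worth prefiltering.
+    //     if let Some(prefilter) = b.build() {
+    //         // Hand `prefilter.as_ref()` to a search loop together with a
+    //         // fresh `PrefilterState::new(max_match_len)`.
+    //     }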
+ pub fn new(kind: MatchKind) -> Builder { + let pbuilder = kind + .as_packed() + .map(|kind| packed::Config::new().match_kind(kind).builder()); + Builder { + count: 0, + ascii_case_insensitive: false, + start_bytes: StartBytesBuilder::new(), + rare_bytes: RareBytesBuilder::new(), + packed: pbuilder, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder { + self.ascii_case_insensitive = yes; + self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); + self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); + self + } + + /// Return a prefilter suitable for quickly finding potential matches. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + pub fn build(&self) -> Option { + // match (self.start_bytes.build(), self.rare_bytes.build()) { + match (self.start_bytes.build(), self.rare_bytes.build()) { + // If we could build both start and rare prefilters, then there are + // a few cases in which we'd want to use the start-byte prefilter + // over the rare-byte prefilter, since the former has lower + // overhead. + (prestart @ Some(_), prerare @ Some(_)) => { + // If the start-byte prefilter can scan for a smaller number + // of bytes than the rare-byte prefilter, then it's probably + // faster. + let has_fewer_bytes = + self.start_bytes.count < self.rare_bytes.count; + // Otherwise, if the combined frequency rank of the detected + // bytes in the start-byte prefilter is "close" to the combined + // frequency rank of the rare-byte prefilter, then we pick + // the start-byte prefilter even if the rare-byte prefilter + // heuristically searches for rare bytes. This is because the + // rare-byte prefilter has higher constant costs, so we tend to + // prefer the start-byte prefilter when we can. + let has_rarer_bytes = + self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; + if has_fewer_bytes || has_rarer_bytes { + prestart + } else { + prerare + } + } + (prestart @ Some(_), None) => prestart, + (None, prerare @ Some(_)) => prerare, + (None, None) if self.ascii_case_insensitive => None, + (None, None) => self + .packed + .as_ref() + .and_then(|b| b.build()) + .map(|s| PrefilterObj::new(Packed(s))), + } + } + + /// Add a literal string to this prefilter builder. + pub fn add(&mut self, bytes: &[u8]) { + self.count += 1; + self.start_bytes.add(bytes); + self.rare_bytes.add(bytes); + if let Some(ref mut pbuilder) = self.packed { + pbuilder.add(bytes); + } + } +} + +/// A type that wraps a packed searcher and implements the `Prefilter` +/// interface. +#[derive(Clone, Debug)] +struct Packed(packed::Searcher); + +impl Prefilter for Packed { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + self.0.heap_bytes() + } + + fn reports_false_positives(&self) -> bool { + false + } +} + +/// A builder for constructing a rare byte prefilter. +/// +/// A rare byte prefilter attempts to pick out a small set of rare bytes that +/// occurr in the patterns, and then quickly scan to matches of those rare +/// bytes. 
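+// Worked example (illustration, following the heuristic described in
+// `RareBytesBuilder::add` below): for the patterns "Sherlock" and "lockjaw",
+// the byte `k` ends up chosen for both patterns. Its offsets from the start
+// of each pattern are 7 and 3, so the recorded maximum offset for `k` is 7.
+// When the prefilter later finds `k` at haystack position `p`, it reports
+// `p - 7` (clamped to the current search start) as a possible start of a
+// match, and the automaton confirms or rejects it from there.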
+#[derive(Clone, Debug)] +struct RareBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// A set of rare bytes, indexed by byte value. + rare_set: ByteSet, + /// A set of byte offsets associated with bytes in a pattern. An entry + /// corresponds to a particular bytes (its index) and is only non-zero if + /// the byte occurred at an offset greater than 0 in at least one pattern. + /// + /// If a byte's offset is not representable in 8 bits, then the rare bytes + /// prefilter becomes inert. + byte_offsets: RareByteOffsets, + /// Whether this is available as a prefilter or not. This can be set to + /// false during construction if a condition is seen that invalidates the + /// use of the rare-byte prefilter. + available: bool, + /// The number of bytes set to an active value in `byte_offsets`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +/// A set of bytes. +#[derive(Clone, Copy)] +struct ByteSet([bool; 256]); + +impl ByteSet { + fn empty() -> ByteSet { + ByteSet([false; 256]) + } + + fn insert(&mut self, b: u8) -> bool { + let new = !self.contains(b); + self.0[b as usize] = true; + new + } + + fn contains(&self, b: u8) -> bool { + self.0[b as usize] + } +} + +impl fmt::Debug for ByteSet { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut bytes = vec![]; + for b in 0..=255 { + if self.contains(b) { + bytes.push(b); + } + } + f.debug_struct("ByteSet").field("set", &bytes).finish() + } +} + +/// A set of byte offsets, keyed by byte. +#[derive(Clone, Copy)] +struct RareByteOffsets { + /// Each entry corresponds to the maximum offset of the corresponding + /// byte across all patterns seen. + set: [RareByteOffset; 256], +} + +impl RareByteOffsets { + /// Create a new empty set of rare byte offsets. + pub fn empty() -> RareByteOffsets { + RareByteOffsets { set: [RareByteOffset::default(); 256] } + } + + /// Add the given offset for the given byte to this set. If the offset is + /// greater than the existing offset, then it overwrites the previous + /// value and returns false. If there is no previous value set, then this + /// sets it and returns true. + pub fn set(&mut self, byte: u8, off: RareByteOffset) { + self.set[byte as usize].max = + cmp::max(self.set[byte as usize].max, off.max); + } +} + +impl fmt::Debug for RareByteOffsets { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut offsets = vec![]; + for off in self.set.iter() { + if off.max > 0 { + offsets.push(off); + } + } + f.debug_struct("RareByteOffsets").field("set", &offsets).finish() + } +} + +/// Offsets associated with an occurrence of a "rare" byte in any of the +/// patterns used to construct a single Aho-Corasick automaton. +#[derive(Clone, Copy, Debug)] +struct RareByteOffset { + /// The maximum offset at which a particular byte occurs from the start + /// of any pattern. This is used as a shift amount. That is, when an + /// occurrence of this byte is found, the candidate position reported by + /// the prefilter is `position_of_byte - max`, such that the automaton + /// will begin its search at a position that is guaranteed to observe a + /// match. + /// + /// To avoid accidentally quadratic behavior, a prefilter is considered + /// ineffective when it is asked to start scanning from a position that it + /// has already scanned past. 
+ /// + /// Using a `u8` here means that if we ever see a pattern that's longer + /// than 255 bytes, then the entire rare byte prefilter is disabled. + max: u8, +} + +impl Default for RareByteOffset { + fn default() -> RareByteOffset { + RareByteOffset { max: 0 } + } +} + +impl RareByteOffset { + /// Create a new rare byte offset. If the given offset is too big, then + /// None is returned. In that case, callers should render the rare bytes + /// prefilter inert. + fn new(max: usize) -> Option { + if max > u8::MAX as usize { + None + } else { + Some(RareByteOffset { max: max as u8 }) + } + } +} + +impl RareBytesBuilder { + /// Create a new builder for constructing a rare byte prefilter. + fn new() -> RareBytesBuilder { + RareBytesBuilder { + ascii_case_insensitive: false, + rare_set: ByteSet::empty(), + byte_offsets: RareByteOffsets::empty(), + available: true, + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the rare bytes prefilter. + /// + /// If there are more than 3 distinct starting bytes, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. + fn build(&self) -> Option { + if !self.available || self.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..=255 { + if self.rare_set.contains(b) { + bytes[len] = b as u8; + len += 1; + } + } + match len { + 0 => None, + 1 => Some(PrefilterObj::new(RareBytesOne { + byte1: bytes[0], + offset: self.byte_offsets.set[bytes[0] as usize], + })), + 2 => Some(PrefilterObj::new(RareBytesTwo { + offsets: self.byte_offsets, + byte1: bytes[0], + byte2: bytes[1], + })), + 3 => Some(PrefilterObj::new(RareBytesThree { + offsets: self.byte_offsets, + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + })), + _ => unreachable!(), + } + } + + /// Add a byte string to this builder. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + // If we've already given up, then do nothing. + if !self.available { + return; + } + // If we've already blown our budget, then don't waste time looking + // for more rare bytes. + if self.count > 3 { + self.available = false; + return; + } + // If the pattern is too long, then our offset table is bunk, so + // give up. + if bytes.len() >= 256 { + self.available = false; + return; + } + let mut rarest = match bytes.get(0) { + None => return, + Some(&b) => (b, freq_rank(b)), + }; + // The idea here is to look for the rarest byte in each pattern, and + // add that to our set. As a special exception, if we see a byte that + // we've already added, then we immediately stop and choose that byte, + // even if there's another rare byte in the pattern. This helps us + // apply the rare byte optimization in more cases by attempting to pick + // bytes that are in common between patterns. So for example, if we + // were searching for `Sherlock` and `lockjaw`, then this would pick + // `k` for both patterns, resulting in the use of `memchr` instead of + // `memchr2` for `k` and `j`. 
+ let mut found = false; + for (pos, &b) in bytes.iter().enumerate() { + self.set_offset(pos, b); + if found { + continue; + } + if self.rare_set.contains(b) { + found = true; + continue; + } + let rank = freq_rank(b); + if rank < rarest.1 { + rarest = (b, rank); + } + } + if !found { + self.add_rare_byte(rarest.0); + } + } + + fn set_offset(&mut self, pos: usize, byte: u8) { + // This unwrap is OK because pos is never bigger than our max. + let offset = RareByteOffset::new(pos).unwrap(); + self.byte_offsets.set(byte, offset); + if self.ascii_case_insensitive { + self.byte_offsets.set(opposite_ascii_case(byte), offset); + } + } + + fn add_rare_byte(&mut self, byte: u8) { + self.add_one_rare_byte(byte); + if self.ascii_case_insensitive { + self.add_one_rare_byte(opposite_ascii_case(byte)); + } + } + + fn add_one_rare_byte(&mut self, byte: u8) { + if self.rare_set.insert(byte) { + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single "rare" byte. +#[derive(Clone, Debug)] +struct RareBytesOne { + byte1: u8, + offset: RareByteOffset, +} + +impl Prefilter for RareBytesOne { + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr(self.byte1, &haystack[at..]) + .map(|i| { + let pos = at + i; + state.last_scan_at = pos; + cmp::max(at, pos.saturating_sub(self.offset.max as usize)) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: It should be possible to use a rare byte prefilter in a + // streaming context. The main problem is that we usually assume that + // if a prefilter has scanned some text and not found anything, then no + // match *starts* in that text. This doesn't matter in non-streaming + // contexts, but in a streaming context, if we're looking for a byte + // that doesn't start at the beginning of a match and don't find it, + // then it's still possible for a match to start at the end of the + // current buffer content. In order to fix this, the streaming searcher + // would need to become aware of prefilters that do this and use the + // appropriate offset in various places. It is quite a delicate change + // and probably shouldn't be attempted until streaming search has a + // better testing strategy. In particular, we'd really like to be able + // to vary the buffer size to force strange cases that occur at the + // edge of the buffer. If we make the buffer size minimal, then these + // cases occur more frequently and easier. + // + // This is also a bummer because this means that if the prefilter + // builder chose a rare byte prefilter, then a streaming search won't + // use any prefilter at all because the builder doesn't know how it's + // going to be used. Assuming we don't make streaming search aware of + // these special types of prefilters as described above, we could fix + // this by building a "backup" prefilter that could be used when the + // rare byte prefilter could not. But that's a bandaide. Sigh. + true + } +} + +/// A prefilter for scanning for two "rare" bytes. 
+#[derive(Clone, Debug)] +struct RareBytesTwo { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, +} + +impl Prefilter for RareBytesTwo { + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr2(self.byte1, self.byte2, &haystack[at..]) + .map(|i| { + let pos = at + i; + state.update_at(pos); + let offset = self.offsets.set[haystack[pos] as usize].max; + cmp::max(at, pos.saturating_sub(offset as usize)) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } +} + +/// A prefilter for scanning for three "rare" bytes. +#[derive(Clone, Debug)] +struct RareBytesThree { + offsets: RareByteOffsets, + byte1: u8, + byte2: u8, + byte3: u8, +} + +impl Prefilter for RareBytesThree { + fn next_candidate( + &self, + state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) + .map(|i| { + let pos = at + i; + state.update_at(pos); + let offset = self.offsets.set[haystack[pos] as usize].max; + cmp::max(at, pos.saturating_sub(offset as usize)) + }) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } + + fn looks_for_non_start_of_match(&self) -> bool { + // TODO: See Prefilter impl for RareBytesOne. + true + } +} + +/// A builder for constructing a starting byte prefilter. +/// +/// A starting byte prefilter is a simplistic prefilter that looks for possible +/// matches by reporting all positions corresponding to a particular byte. This +/// generally only takes affect when there are at most 3 distinct possible +/// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two +/// distinct starting bytes (`f` and `b`), and this prefilter returns all +/// occurrences of either `f` or `b`. +/// +/// In some cases, a heuristic frequency analysis may determine that it would +/// be better not to use this prefilter even when there are 3 or fewer distinct +/// starting bytes. +#[derive(Clone, Debug)] +struct StartBytesBuilder { + /// Whether this prefilter should account for ASCII case insensitivity or + /// not. + ascii_case_insensitive: bool, + /// The set of starting bytes observed. + byteset: Vec, + /// The number of bytes set to true in `byteset`. + count: usize, + /// The sum of frequency ranks for the rare bytes detected. This is + /// intended to give a heuristic notion of how rare the bytes are. + rank_sum: u16, +} + +impl StartBytesBuilder { + /// Create a new builder for constructing a start byte prefilter. + fn new() -> StartBytesBuilder { + StartBytesBuilder { + ascii_case_insensitive: false, + byteset: vec![false; 256], + count: 0, + rank_sum: 0, + } + } + + /// Enable ASCII case insensitivity. When set, byte strings added to this + /// builder will be interpreted without respect to ASCII case. + fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { + self.ascii_case_insensitive = yes; + self + } + + /// Build the starting bytes prefilter. + /// + /// If there are more than 3 distinct starting bytes, or if heuristics + /// otherwise determine that this prefilter should not be used, then `None` + /// is returned. 
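+    // For example (using the `foo`/`bar`/`baz` illustration above): the
+    // distinct starting bytes are `f` and `b`, so the prefilter built here
+    // reduces to a `memchr2(b'f', b'b', haystack)` scan, and every position
+    // it reports is handed to the automaton as a possible start of a match.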
+ fn build(&self) -> Option { + if self.count > 3 { + return None; + } + let (mut bytes, mut len) = ([0; 3], 0); + for b in 0..256 { + if !self.byteset[b] { + continue; + } + // We don't handle non-ASCII bytes for now. Getting non-ASCII + // bytes right is trickier, since we generally don't want to put + // a leading UTF-8 code unit into a prefilter that isn't ASCII, + // since they can frequently. Instead, it would be better to use a + // continuation byte, but this requires more sophisticated analysis + // of the automaton and a richer prefilter API. + if b > 0x7F { + return None; + } + bytes[len] = b as u8; + len += 1; + } + match len { + 0 => None, + 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })), + 2 => Some(PrefilterObj::new(StartBytesTwo { + byte1: bytes[0], + byte2: bytes[1], + })), + 3 => Some(PrefilterObj::new(StartBytesThree { + byte1: bytes[0], + byte2: bytes[1], + byte3: bytes[2], + })), + _ => unreachable!(), + } + } + + /// Add a byte string to this builder. + /// + /// All patterns added to an Aho-Corasick automaton should be added to this + /// builder before attempting to construct the prefilter. + fn add(&mut self, bytes: &[u8]) { + if self.count > 3 { + return; + } + if let Some(&byte) = bytes.get(0) { + self.add_one_byte(byte); + if self.ascii_case_insensitive { + self.add_one_byte(opposite_ascii_case(byte)); + } + } + } + + fn add_one_byte(&mut self, byte: u8) { + if !self.byteset[byte as usize] { + self.byteset[byte as usize] = true; + self.count += 1; + self.rank_sum += freq_rank(byte) as u16; + } + } +} + +/// A prefilter for scanning for a single starting byte. +#[derive(Clone, Debug)] +struct StartBytesOne { + byte1: u8, +} + +impl Prefilter for StartBytesOne { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr(self.byte1, &haystack[at..]) + .map(|i| at + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } +} + +/// A prefilter for scanning for two starting bytes. +#[derive(Clone, Debug)] +struct StartBytesTwo { + byte1: u8, + byte2: u8, +} + +impl Prefilter for StartBytesTwo { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr2(self.byte1, self.byte2, &haystack[at..]) + .map(|i| at + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } +} + +/// A prefilter for scanning for three starting bytes. +#[derive(Clone, Debug)] +struct StartBytesThree { + byte1: u8, + byte2: u8, + byte3: u8, +} + +impl Prefilter for StartBytesThree { + fn next_candidate( + &self, + _state: &mut PrefilterState, + haystack: &[u8], + at: usize, + ) -> Candidate { + memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) + .map(|i| at + i) + .map_or(Candidate::None, Candidate::PossibleStartOfMatch) + } + + fn clone_prefilter(&self) -> Box { + Box::new(self.clone()) + } + + fn heap_bytes(&self) -> usize { + 0 + } +} + +/// Return the next candidate reported by the given prefilter while +/// simultaneously updating the given prestate. +/// +/// The caller is responsible for checking the prestate before deciding whether +/// to initiate a search. 
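+// A plausible caller-side sketch (illustration only; `pre`, `haystack` and
+// `at` are assumed to be a prefilter, the input and the current search
+// position):
+//
+//     if prestate.is_effective(at) {
+//         match next(&mut prestate, &pre, haystack, at) {
+//             Candidate::None => return None,               // no match at or after `at`
+//             Candidate::Match(m) => return Some(m),        // already confirmed
+//             Candidate::PossibleStartOfMatch(i) => at = i, // confirm with the automaton
+//         }
+//     }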
+#[inline]
+pub fn next<P: Prefilter>(
+    prestate: &mut PrefilterState,
+    prefilter: P,
+    haystack: &[u8],
+    at: usize,
+) -> Candidate {
+    let cand = prefilter.next_candidate(prestate, haystack, at);
+    match cand {
+        Candidate::None => {
+            prestate.update_skipped_bytes(haystack.len() - at);
+        }
+        Candidate::Match(ref m) => {
+            prestate.update_skipped_bytes(m.start() - at);
+        }
+        Candidate::PossibleStartOfMatch(i) => {
+            prestate.update_skipped_bytes(i - at);
+        }
+    }
+    cand
+}
+
+/// If the given byte is an ASCII letter, then return it in the opposite case.
+/// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
+/// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
+pub fn opposite_ascii_case(b: u8) -> u8 {
+    if b'A' <= b && b <= b'Z' {
+        b.to_ascii_lowercase()
+    } else if b'a' <= b && b <= b'z' {
+        b.to_ascii_uppercase()
+    } else {
+        b
+    }
+}
+
+/// Return the frequency rank of the given byte. The higher the rank, the more
+/// common the byte (heuristically speaking).
+fn freq_rank(b: u8) -> u8 {
+    use crate::byte_frequencies::BYTE_FREQUENCIES;
+    BYTE_FREQUENCIES[b as usize]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn scratch() {
+        let mut b = Builder::new(MatchKind::LeftmostFirst);
+        b.add(b"Sherlock");
+        b.add(b"locjaw");
+        // b.add(b"Sherlock");
+        // b.add(b"Holmes");
+        // b.add(b"Watson");
+        // b.add("Шерлок Холмс".as_bytes());
+        // b.add("Джон Уотсон".as_bytes());
+
+        let s = b.build().unwrap();
+        println!("{:?}", s);
+    }
+}
diff --git a/src/state_id.rs b/src/state_id.rs
new file mode 100644
index 0000000..8973806
--- /dev/null
+++ b/src/state_id.rs
@@ -0,0 +1,192 @@
+use std::fmt::Debug;
+use std::hash::Hash;
+
+use crate::error::{Error, Result};
+
+// NOTE: Most of this code was copied from regex-automata, but without the
+// (de)serialization specific stuff.
+
+/// Check that the premultiplication of the given state identifier can
+/// fit into the representation indicated by `S`. If it cannot, or if it
+/// overflows `usize` itself, then an error is returned.
+pub fn premultiply_overflow_error<S: StateID>(
+    last_state: S,
+    alphabet_len: usize,
+) -> Result<()> {
+    let requested = match last_state.to_usize().checked_mul(alphabet_len) {
+        Some(requested) => requested,
+        None => return Err(Error::premultiply_overflow(0, 0)),
+    };
+    if requested > S::max_id() {
+        return Err(Error::premultiply_overflow(S::max_id(), requested));
+    }
+    Ok(())
+}
+
+/// Convert the given `usize` to the chosen state identifier
+/// representation. If the given value cannot fit in the chosen
+/// representation, then an error is returned.
+pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
+    if value > S::max_id() {
+        Err(Error::state_id_overflow(S::max_id()))
+    } else {
+        Ok(S::from_usize(value))
+    }
+}
+
+/// Return the unique identifier for an automaton's fail state in the chosen
+/// representation indicated by `S`.
+pub fn fail_id<S: StateID>() -> S {
+    S::from_usize(0)
+}
+
+/// Return the unique identifier for an automaton's dead state in the chosen
+/// representation indicated by `S`.
+pub fn dead_id<S: StateID>() -> S {
+    S::from_usize(1)
+}
+
+mod private {
+    /// Sealed stops crates other than aho-corasick from implementing any
+    /// traits that use it.
+    pub trait Sealed {}
+    impl Sealed for u8 {}
+    impl Sealed for u16 {}
+    impl Sealed for u32 {}
+    impl Sealed for u64 {}
+    impl Sealed for usize {}
+}
+
+/// A trait describing the representation of an automaton's state identifier.
+/// +/// The purpose of this trait is to safely express both the possible state +/// identifier representations that can be used in an automaton and to convert +/// between state identifier representations and types that can be used to +/// efficiently index memory (such as `usize`). +/// +/// In general, one should not need to implement this trait explicitly. Indeed, +/// for now, this trait is sealed such that it cannot be implemented by any +/// other type. In particular, this crate provides implementations for `u8`, +/// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for +/// targets that can represent all corresponding values in a `usize`.) +pub trait StateID: + private::Sealed + + Clone + + Copy + + Debug + + Eq + + Hash + + PartialEq + + PartialOrd + + Ord +{ + /// Convert from a `usize` to this implementation's representation. + /// + /// Implementors may assume that `n <= Self::max_id`. That is, implementors + /// do not need to check whether `n` can fit inside this implementation's + /// representation. + fn from_usize(n: usize) -> Self; + + /// Convert this implementation's representation to a `usize`. + /// + /// Implementors must not return a `usize` value greater than + /// `Self::max_id` and must not permit overflow when converting between the + /// implementor's representation and `usize`. In general, the preferred + /// way for implementors to achieve this is to simply not provide + /// implementations of `StateID` that cannot fit into the target platform's + /// `usize`. + fn to_usize(self) -> usize; + + /// Return the maximum state identifier supported by this representation. + /// + /// Implementors must return a correct bound. Doing otherwise may result + /// in unspecified behavior (but will not violate memory safety). + fn max_id() -> usize; +} + +impl StateID for usize { + #[inline] + fn from_usize(n: usize) -> usize { + n + } + + #[inline] + fn to_usize(self) -> usize { + self + } + + #[inline] + fn max_id() -> usize { + ::std::usize::MAX + } +} + +impl StateID for u8 { + #[inline] + fn from_usize(n: usize) -> u8 { + n as u8 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u8::MAX as usize + } +} + +impl StateID for u16 { + #[inline] + fn from_usize(n: usize) -> u16 { + n as u16 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u16::MAX as usize + } +} + +#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] +impl StateID for u32 { + #[inline] + fn from_usize(n: usize) -> u32 { + n as u32 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u32::MAX as usize + } +} + +#[cfg(target_pointer_width = "64")] +impl StateID for u64 { + #[inline] + fn from_usize(n: usize) -> u64 { + n as u64 + } + + #[inline] + fn to_usize(self) -> usize { + self as usize + } + + #[inline] + fn max_id() -> usize { + ::std::u64::MAX as usize + } +} diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000..20cd3d1 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,1254 @@ +use std::collections::HashMap; +use std::io; +use std::usize; + +use crate::{AhoCorasickBuilder, Match, MatchKind}; + +/// A description of a single test against an Aho-Corasick automaton. +/// +/// A single test may not necessarily pass on every configuration of an +/// Aho-Corasick automaton. The tests are categorized and grouped appropriately +/// below. 
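+// For example, the entry
+//
+//     t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]),
+//
+// from the BASICS collection below says: searching "zazabzabcz" for the single
+// pattern "abc" must report exactly one match, namely pattern index 0 spanning
+// haystack[6..9].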
+#[derive(Clone, Debug, Eq, PartialEq)] +struct SearchTest { + /// The name of this test, for debugging. + name: &'static str, + /// The patterns to search for. + patterns: &'static [&'static str], + /// The text to search. + haystack: &'static str, + /// Each match is a triple of (pattern_index, start, end), where + /// pattern_index is an index into `patterns` and `start`/`end` are indices + /// into `haystack`. + matches: &'static [(usize, usize, usize)], +} + +/// Short-hand constructor for SearchTest. We use it a lot below. +macro_rules! t { + ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { + SearchTest { + name: stringify!($name), + patterns: $patterns, + haystack: $haystack, + matches: $matches, + } + }; +} + +/// A collection of test groups. +type TestCollection = &'static [&'static [SearchTest]]; + +// Define several collections corresponding to the different type of match +// semantics supported by Aho-Corasick. These collections have some overlap, +// but each collection should have some tests that no other collection has. + +/// Tests for Aho-Corasick's standard non-overlapping match semantics. +const AC_STANDARD_NON_OVERLAPPING: TestCollection = + &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION]; + +/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics. +const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection = + &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED]; + +/// Tests for Aho-Corasick's standard overlapping match semantics. +const AC_STANDARD_OVERLAPPING: TestCollection = + &[BASICS, OVERLAPPING, REGRESSION]; + +/// Tests for Aho-Corasick's anchored standard overlapping match semantics. +const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection = + &[ANCHORED_BASICS, ANCHORED_OVERLAPPING]; + +/// Tests for Aho-Corasick's leftmost-first match semantics. +const AC_LEFTMOST_FIRST: TestCollection = + &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION]; + +/// Tests for Aho-Corasick's anchored leftmost-first match semantics. +const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[ + ANCHORED_BASICS, + ANCHORED_NON_OVERLAPPING, + ANCHORED_LEFTMOST, + ANCHORED_LEFTMOST_FIRST, +]; + +/// Tests for Aho-Corasick's leftmost-longest match semantics. +const AC_LEFTMOST_LONGEST: TestCollection = + &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION]; + +/// Tests for Aho-Corasick's anchored leftmost-longest match semantics. +const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[ + ANCHORED_BASICS, + ANCHORED_NON_OVERLAPPING, + ANCHORED_LEFTMOST, + ANCHORED_LEFTMOST_LONGEST, +]; + +// Now define the individual tests that make up the collections above. + +/// A collection of tests for the Aho-Corasick algorithm that should always be +/// true regardless of match semantics. That is, all combinations of +/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} +/// should produce the same answer. 
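+// A minimal sketch (not the harness actually used in this file) of how one of
+// these cases can be checked by hand with the API imported above:
+//
+//     let ac = AhoCorasickBuilder::new()
+//         .match_kind(MatchKind::LeftmostFirst)
+//         .build(test.patterns);
+//     let got: Vec<(usize, usize, usize)> = ac
+//         .find_iter(test.haystack)
+//         .map(|m| (m.pattern(), m.start(), m.end()))
+//         .collect();
+//     assert_eq!(test.matches, &*got);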
+const BASICS: &'static [SearchTest] = &[ + t!(basic000, &[], "", &[]), + t!(basic001, &["a"], "", &[]), + t!(basic010, &["a"], "a", &[(0, 0, 1)]), + t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), + t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), + t!(basic050, &["a"], "bba", &[(0, 2, 3)]), + t!(basic060, &["a"], "bbb", &[]), + t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), + t!(basic100, &["aa"], "", &[]), + t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), + t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), + t!(basic130, &["aa"], "abbab", &[]), + t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), + t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), + t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), + t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), + t!(basic300, &["a", "b"], "", &[]), + t!(basic310, &["a", "b"], "z", &[]), + t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), + t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), + t!( + basic340, + &["a", "b"], + "abba", + &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] + ), + t!( + basic350, + &["b", "a"], + "abba", + &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] + ), + t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), + t!(basic400, &["foo", "bar"], "", &[]), + t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), + t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), + t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), + t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), + t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), + t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), + t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), + t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), + t!(basic600, &[""], "", &[(0, 0, 0)]), + t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), + t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), + t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), + t!( + basic720, + &["yabcdef", "bcdeyabc", "abcdezghi"], + "yabcdezghi", + &[(2, 1, 10),] + ), +]; + +/// A collection of *anchored* tests for the Aho-Corasick algorithm that should +/// always be true regardless of match semantics. That is, all combinations of +/// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should +/// produce the same answer. +const ANCHORED_BASICS: &'static [SearchTest] = &[ + t!(abasic000, &[], "", &[]), + t!(abasic010, &[""], "", &[(0, 0, 0)]), + t!(abasic020, &[""], "a", &[(0, 0, 0)]), + t!(abasic030, &[""], "abc", &[(0, 0, 0)]), + t!(abasic100, &["a"], "a", &[(0, 0, 1)]), + t!(abasic110, &["a"], "aa", &[(0, 0, 1)]), + t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1)]), + t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1)]), + t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]), + t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]), +]; + +/// Tests for non-overlapping standard match semantics. +/// +/// These tests generally shouldn't pass for leftmost-{first,longest}, although +/// some do in order to write clearer tests. For example, standard000 will +/// pass with leftmost-first semantics, but standard010 will not. We write +/// both to emphasize how the match semantics work. 
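+// The intuition: standard semantics mimic a textbook Aho-Corasick automaton
+// and report a match as soon as any pattern reaches its end state. So in
+// `standard010` below, with patterns `["abcd", "ab"]` and haystack `"abcd"`,
+// the automaton sees the end of "ab" after two bytes and reports (1, 0, 2)
+// right away, whereas leftmost-first semantics (see `leftfirst020`) keep
+// going and report (0, 0, 4) for "abcd".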
+const STANDARD: &'static [SearchTest] = &[ + t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), + t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), + t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), + t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), + t!( + standard400, + &["abcd", "bcd", "cd", "b"], + "abcd", + &[(3, 1, 2), (2, 2, 4),] + ), + t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]), + t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]), + t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), +]; + +/// Like STANDARD, but for anchored searches. +const STANDARD_ANCHORED: &'static [SearchTest] = &[ + t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), + t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), + t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), + t!(astandard040, &["a", ""], "a", &[(1, 0, 0)]), + t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(astandard410, &["", "a"], "a", &[(0, 0, 0)]), + t!(astandard420, &["", "a"], "aa", &[(0, 0, 0)]), + t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0)]), + t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0)]), + t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0)]), +]; + +/// Tests for non-overlapping leftmost match semantics. These should pass for +/// both leftmost-first and leftmost-longest match kinds. Stated differently, +/// among ambiguous matches, the longest match and the match that appeared +/// first when constructing the automaton should always be the same. +const LEFTMOST: &'static [SearchTest] = &[ + t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftmost010, &["a", ""], "a", &[(0, 0, 1), (1, 1, 1)]), + t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), + t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), + t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), + t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), + t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), + t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), + t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + leftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + leftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + leftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8), (0, 8, 9),] + ), +]; + +/// Like LEFTMOST, but for anchored searches. 
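+// Here "anchored" means a match must begin at the position where the search
+// starts, which for these tests is the beginning of the haystack. That is why
+// `aleftmost032` below expects no matches at all: neither "ab" nor "a" occurs
+// at the very start of "xayabbbz".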
+const ANCHORED_LEFTMOST: &'static [SearchTest] = &[ + t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]), + t!(aleftmost020, &["", ""], "a", &[(0, 0, 0)]), + t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1)]), + t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1)]), + t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]), + t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]), + t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]), + t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]), + t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]), + t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), + t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), + t!( + aleftmost360, + &["abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + aleftmost370, + &["abcdefghi", "cde", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost380, + &["abcdefghi", "hz", "abcdefgh", "a"], + "abcdefghz", + &[(2, 0, 8),] + ), + t!( + aleftmost390, + &["b", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost400, + &["h", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!( + aleftmost410, + &["z", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8)] + ), +]; + +/// Tests for non-overlapping leftmost-first match semantics. These tests +/// should generally be specific to leftmost-first, which means they should +/// generally fail under leftmost-longest semantics. +const LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), + t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]), + t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), + t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), + t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), + t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), + t!( + leftfirst310, + &["abcd", "b", "bce", "ce"], + "abce", + &[(1, 1, 2), (3, 2, 4),] + ), + t!( + leftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1), (2, 7, 9),] + ), + t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), + t!(leftfirst400, &["amwix", "samwise", "sam"], "Zsamwix", &[(2, 1, 4)]), +]; + +/// Like LEFTMOST_FIRST, but for anchored searches. 
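+// A minimal sketch of leftmost-first semantics (values taken from the
+// leftfirst100 case above): among the patterns that match at the leftmost
+// starting position, the one listed first when building the automaton wins.
+//
+//     let ac = AhoCorasickBuilder::new()
+//         .match_kind(MatchKind::LeftmostFirst)
+//         .build(&["abcdefg", "bcde", "bcdef"]);
+//     let m = ac.find("abcdef").unwrap();
+//     assert_eq!((1, 1, 5), (m.pattern(), m.start(), m.end()));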
+const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[ + t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), + t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0)]), + t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0)]), + t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]), + t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0)]), + t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), + t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), + t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]), + t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), + t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), + t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]), + t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]), + t!( + aleftfirst320, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(0, 0, 1)] + ), + t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]), + t!(aleftfirst400, &["wise", "samwise", "sam"], "samwix", &[(2, 0, 3)]), +]; + +/// Tests for non-overlapping leftmost-longest match semantics. These tests +/// should generally be specific to leftmost-longest, which means they should +/// generally fail under leftmost-first semantics. +const LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(leftlong020, &["", "a"], "a", &[(1, 0, 1), (0, 1, 1),]), + t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1), (0, 1, 1),]), + t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]), + t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1), (0, 1, 1),]), + t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2), (0, 2, 2),]), + t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), + t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), + t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), + t!( + leftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), + t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), +]; + +/// Like LEFTMOST_LONGEST, but for anchored searches. 
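+// A minimal sketch of leftmost-longest semantics (values taken from the
+// leftlong100 case above): among the patterns that match at the leftmost
+// starting position, the longest one wins, regardless of build order.
+//
+//     let ac = AhoCorasickBuilder::new()
+//         .match_kind(MatchKind::LeftmostLongest)
+//         .build(&["abcdefg", "bcde", "bcdef"]);
+//     let m = ac.find("abcdef").unwrap();
+//     assert_eq!((2, 1, 6), (m.pattern(), m.start(), m.end()));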
+const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[ + t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), + t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), + t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]), + t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]), + t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]), + t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]), + t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1)]), + t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), + t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), + t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), + t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), + t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), + t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), + t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]), + t!( + aleftlong310, + &["a", "abcdefghi", "hz", "abcdefgh"], + "abcdefghz", + &[(3, 0, 8),] + ), + t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), + t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]), + t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]), +]; + +/// Tests for non-overlapping match semantics. +/// +/// Generally these tests shouldn't pass when using overlapping semantics. +/// These should pass for both standard and leftmost match semantics. +const NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), + t!( + nover100, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] + ), + t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), + t!(nover300, &["", ""], "", &[(0, 0, 0),]), + t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]), +]; + +/// Like NON_OVERLAPPING, but for anchored searches. +const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), + t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), + t!(anover030, &["abc", "bc"], "zazabcz", &[]), + t!(anover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), + t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]), + t!(anover300, &["", ""], "", &[(0, 0, 0),]), + t!(anover310, &["", ""], "a", &[(0, 0, 0)]), +]; + +/// Tests for overlapping match semantics. +/// +/// This only supports standard match semantics, since leftmost-{first,longest} +/// do not support overlapping matches. 
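+// A minimal sketch of an overlapping search (values taken from the over050
+// case below): every pattern occurrence is reported, even when occurrences
+// share bytes.
+//
+//     let ac = AhoCorasickBuilder::new().build(&["abc", "bc"]);
+//     let matches: Vec<(usize, usize, usize)> = ac
+//         .find_overlapping_iter("zazabcz")
+//         .map(|m| (m.pattern(), m.start(), m.end()))
+//         .collect();
+//     assert_eq!(vec![(0, 3, 6), (1, 4, 6)], matches);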
+const OVERLAPPING: &'static [SearchTest] = &[ + t!( + over000, + &["abcd", "bcd", "cd", "b"], + "abcd", + &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),] + ), + t!( + over010, + &["bcd", "cd", "b", "abcd"], + "abcd", + &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),] + ), + t!( + over020, + &["abcd", "bcd", "cd"], + "abcd", + &[(0, 0, 4), (1, 1, 4), (2, 2, 4),] + ), + t!( + over030, + &["bcd", "abcd", "cd"], + "abcd", + &[(1, 0, 4), (0, 1, 4), (2, 2, 4),] + ), + t!( + over040, + &["bcd", "cd", "abcd"], + "abcd", + &[(2, 0, 4), (0, 1, 4), (1, 2, 4),] + ), + t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]), + t!( + over100, + &["ab", "ba"], + "abababa", + &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),] + ), + t!( + over200, + &["foo", "foo"], + "foobarfoo", + &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),] + ), + t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), + t!( + over310, + &["", ""], + "a", + &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),] + ), + t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]), + t!( + over330, + &["", "a", ""], + "a", + &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),] + ), + t!( + over340, + &["a", "", ""], + "a", + &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),] + ), + t!( + over350, + &["", "", "a"], + "a", + &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),] + ), + t!( + over360, + &["foo", "foofoo"], + "foofoo", + &[(0, 0, 3), (1, 0, 6), (0, 3, 6)] + ), +]; + +/// Like OVERLAPPING, but for anchored searches. +const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[ + t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), + t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]), + t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]), + t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]), + t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]), + t!(aover050, &["abc", "bc"], "zazabcz", &[]), + t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), + t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]), + t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), + t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]), + t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]), + t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]), + t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]), + t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]), + t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]), +]; + +/// Tests for ASCII case insensitivity. +/// +/// These tests should all have the same behavior regardless of match semantics +/// or whether the search is overlapping. +const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[ + t!(acasei000, &["a"], "A", &[(0, 0, 1)]), + t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]), + t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]), + t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests. +const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]), + t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]), + t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]), +]; + +/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests. 
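+// A minimal sketch combining ASCII case insensitivity with an overlapping
+// search (values taken from the acasei000 case below; the bindings are
+// illustrative only):
+//
+//     let ac = AhoCorasickBuilder::new()
+//         .ascii_case_insensitive(true)
+//         .build(&["foo", "FOO"]);
+//     let pats: Vec<usize> =
+//         ac.find_overlapping_iter("fOo").map(|m| m.pattern()).collect();
+//     assert_eq!(vec![0, 1], pats);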
+const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[ + t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]), + // This is a regression test from: + // https://github.com/BurntSushi/aho-corasick/issues/68 + // Previously, it was reporting a duplicate (1, 3, 6) match. + t!( + acasei010, + &["abc", "def", "abcdef"], + "abcdef", + &[(0, 0, 3), (2, 0, 6), (1, 3, 6)] + ), +]; + +/// Regression tests that are applied to all Aho-Corasick combinations. +/// +/// If regression tests are needed for specific match semantics, then add them +/// to the appropriate group above. +const REGRESSION: &'static [SearchTest] = &[ + t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), + t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), + t!( + regression030, + &["libcore/", "libstd/"], + "libcore/char/methods.rs", + &[(0, 0, 8),] + ), + t!( + regression040, + &["libstd/", "libcore/"], + "libcore/char/methods.rs", + &[(1, 0, 8),] + ), + t!( + regression050, + &["\x00\x00\x01", "\x00\x00\x00"], + "\x00\x00\x00", + &[(1, 0, 3),] + ), + t!( + regression060, + &["\x00\x00\x00", "\x00\x00\x01"], + "\x00\x00\x00", + &[(0, 0, 3),] + ), +]; + +// Now define a test for each combination of things above that we want to run. +// Since there are a few different combinations for each collection of tests, +// we define a couple of macros to avoid repetition drudgery. The testconfig +// macro constructs the automaton from a given match kind, and runs the search +// tests one-by-one over the given collection. The `with` parameter allows one +// to configure the builder with additional parameters. The testcombo macro +// invokes testconfig in precisely this way: it sets up several tests where +// each one turns a different knob on AhoCorasickBuilder. + +macro_rules! testconfig { + (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasickBuilder::new(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .find_overlapping_iter(test.haystack) + .collect() + }); + } + }; + (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let buf = + io::BufReader::with_capacity(1, test.haystack.as_bytes()); + let mut builder = AhoCorasickBuilder::new(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .stream_find_iter(buf) + .map(|result| result.unwrap()) + .collect() + }); + } + }; + ($name:ident, $collection:expr, $kind:ident, $with:expr) => { + #[test] + fn $name() { + run_search_tests($collection, |test| { + let mut builder = AhoCorasickBuilder::new(); + $with(&mut builder); + builder + .match_kind(MatchKind::$kind) + .build(test.patterns) + .find_iter(test.haystack) + .collect() + }); + } + }; +} + +macro_rules! 
testcombo { + ($name:ident, $collection:expr, $kind:ident) => { + mod $name { + use super::*; + + testconfig!(nfa_default, $collection, $kind, |_| ()); + testconfig!( + nfa_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.prefilter(false); + } + ); + testconfig!( + nfa_all_sparse, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(0); + } + ); + testconfig!( + nfa_all_dense, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(usize::MAX); + } + ); + testconfig!( + dfa_default, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true); + } + ); + testconfig!( + dfa_no_prefilter, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).prefilter(false); + } + ); + testconfig!( + dfa_all_sparse, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(0); + } + ); + testconfig!( + dfa_all_dense, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(usize::MAX); + } + ); + testconfig!( + dfa_no_byte_class, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false); + } + ); + testconfig!( + dfa_no_premultiply, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).premultiply(false); + } + ); + testconfig!( + dfa_no_byte_class_no_premultiply, + $collection, + $kind, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when options are removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false).premultiply(false); + } + ); + } + }; +} + +// Write out the combinations. +testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest); +testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst); +testcombo!( + search_standard_nonoverlapping, + AC_STANDARD_NON_OVERLAPPING, + Standard +); + +// Write out the overlapping combo by hand since there is only one of them. +testconfig!( + overlapping, + search_standard_overlapping_nfa_default, + AC_STANDARD_OVERLAPPING, + Standard, + |_| () +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_all_sparse, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(0); + } +); +testconfig!( + overlapping, + search_standard_overlapping_nfa_all_dense, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dense_depth(usize::MAX); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_default, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_all_sparse, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(0); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_all_dense, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true).dense_depth(usize::MAX); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_byte_class, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_premultiply, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when option is removed. 
+ #[allow(deprecated)] + b.dfa(true).premultiply(false); + } +); +testconfig!( + overlapping, + search_standard_overlapping_dfa_no_byte_class_no_premultiply, + AC_STANDARD_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + // TODO: remove tests when options are removed. + #[allow(deprecated)] + b.dfa(true).byte_classes(false).premultiply(false); + } +); + +// Also write out tests manually for streams, since we only test the standard +// match semantics. We also don't bother testing different automaton +// configurations, since those are well covered by tests above. +testconfig!( + stream, + search_standard_stream_nfa_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |_| () +); +testconfig!( + stream, + search_standard_stream_dfa_default, + AC_STANDARD_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.dfa(true); + } +); + +// Same thing for anchored searches. Write them out manually. +testconfig!( + search_standard_anchored_nfa_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + search_standard_anchored_dfa_default, + AC_STANDARD_ANCHORED_NON_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); +testconfig!( + overlapping, + search_standard_anchored_overlapping_nfa_default, + AC_STANDARD_ANCHORED_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + overlapping, + search_standard_anchored_overlapping_dfa_default, + AC_STANDARD_ANCHORED_OVERLAPPING, + Standard, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); +testconfig!( + search_leftmost_first_anchored_nfa_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + search_leftmost_first_anchored_dfa_default, + AC_LEFTMOST_FIRST_ANCHORED, + LeftmostFirst, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); +testconfig!( + search_leftmost_longest_anchored_nfa_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.anchored(true); + } +); +testconfig!( + search_leftmost_longest_anchored_dfa_default, + AC_LEFTMOST_LONGEST_ANCHORED, + LeftmostLongest, + |b: &mut AhoCorasickBuilder| { + b.anchored(true).dfa(true); + } +); + +// And also write out the test combinations for ASCII case insensitivity. 
+testconfig!(
+    acasei_standard_nfa_default,
+    &[ASCII_CASE_INSENSITIVE],
+    Standard,
+    |b: &mut AhoCorasickBuilder| {
+        b.prefilter(false).ascii_case_insensitive(true);
+    }
+);
+testconfig!(
+    acasei_standard_dfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+    Standard,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true).dfa(true);
+    }
+);
+testconfig!(
+    overlapping,
+    acasei_standard_overlapping_nfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+    Standard,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true);
+    }
+);
+testconfig!(
+    overlapping,
+    acasei_standard_overlapping_dfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING],
+    Standard,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true).dfa(true);
+    }
+);
+testconfig!(
+    acasei_leftmost_first_nfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+    LeftmostFirst,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true);
+    }
+);
+testconfig!(
+    acasei_leftmost_first_dfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+    LeftmostFirst,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true).dfa(true);
+    }
+);
+testconfig!(
+    acasei_leftmost_longest_nfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+    LeftmostLongest,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true);
+    }
+);
+testconfig!(
+    acasei_leftmost_longest_dfa_default,
+    &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING],
+    LeftmostLongest,
+    |b: &mut AhoCorasickBuilder| {
+        b.ascii_case_insensitive(true).dfa(true);
+    }
+);
+
+fn run_search_tests<F: FnMut(&SearchTest) -> Vec<Match>>(
+    which: TestCollection,
+    mut f: F,
+) {
+    let get_match_triples =
+        |matches: Vec<Match>| -> Vec<(usize, usize, usize)> {
+            matches
+                .into_iter()
+                .map(|m| (m.pattern(), m.start(), m.end()))
+                .collect()
+        };
+    for &tests in which {
+        for test in tests {
+            assert_eq!(
+                test.matches,
+                get_match_triples(f(&test)).as_slice(),
+                "test: {}, patterns: {:?}, haystack: {:?}",
+                test.name,
+                test.patterns,
+                test.haystack
+            );
+        }
+    }
+}
+
+#[test]
+fn search_tests_have_unique_names() {
+    let assert = |constname, tests: &[SearchTest]| {
+        let mut seen = HashMap::new(); // map from test name to position
+        for (i, test) in tests.iter().enumerate() {
+            if !seen.contains_key(test.name) {
+                seen.insert(test.name, i);
+            } else {
+                let last = seen[test.name];
+                panic!(
+                    "{} tests have duplicate names at positions {} and {}",
+                    constname, last, i
+                );
+            }
+        }
+    };
+    assert("BASICS", BASICS);
+    assert("STANDARD", STANDARD);
+    assert("LEFTMOST", LEFTMOST);
+    assert("LEFTMOST_FIRST", LEFTMOST_FIRST);
+    assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST);
+    assert("NON_OVERLAPPING", NON_OVERLAPPING);
+    assert("OVERLAPPING", OVERLAPPING);
+    assert("REGRESSION", REGRESSION);
+}
+
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_first() {
+    let fsm = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostFirst)
+        .build(None::<String>);
+    assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn stream_not_allowed_leftmost_longest() {
+    let fsm = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostLongest)
+        .build(None::<String>);
+    assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_first() {
+    let fsm = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostFirst)
+        .build(None::<String>);
+    assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
+#[test]
+#[should_panic]
+fn overlapping_not_allowed_leftmost_longest() {
+    let fsm = AhoCorasickBuilder::new()
+        .match_kind(MatchKind::LeftmostLongest)
+        .build(None::<String>);
+    assert_eq!(fsm.find_overlapping_iter("").count(), 0);
+}
+
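+// The panics above are the flip side of the supported configuration: with the
+// default MatchKind::Standard, overlapping and stream searches are both
+// available. A minimal sketch (not one of the vendored tests):
+//
+//     let ac = AhoCorasickBuilder::new().build(&["ab", "b"]);
+//     assert_eq!(2, ac.find_overlapping_iter("ab").count());
+//     assert_eq!(1, ac.stream_find_iter(&b"ab"[..]).count());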
+
+#[test]
+fn state_id_too_small() {
+    let mut patterns = vec![];
+    for c1 in (b'a'..b'z').map(|b| b as char) {
+        for c2 in (b'a'..b'z').map(|b| b as char) {
+            for c3 in (b'a'..b'z').map(|b| b as char) {
+                patterns.push(format!("{}{}{}", c1, c2, c3));
+            }
+        }
+    }
+    let result =
+        AhoCorasickBuilder::new().build_with_size::<u8, _, _>(&patterns);
+    assert!(result.is_err());
+}
+
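+// For contrast, a minimal sketch (not one of the vendored tests) of building
+// with a wider state identifier, which has room for far more states than u8:
+//
+//     let result = AhoCorasickBuilder::new()
+//         .build_with_size::<u32, _, _>(&["foo", "bar"]);
+//     assert!(result.is_ok());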
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/44
+//
+// In short, this test ensures that enabling ASCII case insensitivity does not
+// visit an exponential number of states when filling in failure transitions.
+#[test]
+fn regression_ascii_case_insensitive_no_exponential() {
+    let ac = AhoCorasickBuilder::new()
+        .ascii_case_insensitive(true)
+        .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"]);
+    assert!(ac.find("").is_none());
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/53
+//
+// This test ensures that the rare byte prefilter works in a particular corner
+// case. In particular, the shift offset detected for '/' in the patterns below
+// was incorrect, leading to a false negative.
+#[test]
+fn regression_rare_byte_prefilter() {
+    use crate::AhoCorasick;
+
+    let ac = AhoCorasick::new_auto_configured(&["ab/j/", "x/"]);
+    assert!(ac.is_match("ab/j/"));
+}
+
+#[test]
+fn regression_case_insensitive_prefilter() {
+    use crate::AhoCorasickBuilder;
+
+    for c in b'a'..b'z' {
+        for c2 in b'a'..b'z' {
+            let c = c as char;
+            let c2 = c2 as char;
+            let needle = format!("{}{}", c, c2).to_lowercase();
+            let haystack = needle.to_uppercase();
+            let ac = AhoCorasickBuilder::new()
+                .ascii_case_insensitive(true)
+                .prefilter(true)
+                .build(&[&needle]);
+            assert_eq!(
+                1,
+                ac.find_iter(&haystack).count(),
+                "failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
+                needle,
+                haystack,
+                ac,
+            );
+        }
+    }
+}
+
+// See: https://github.com/BurntSushi/aho-corasick/issues/64
+//
+// This occurs when the rare byte prefilter is active.
+#[test]
+fn regression_stream_rare_byte_prefilter() {
+    use std::io::Read;
+
+    // NOTE: The test only fails if this ends with j.
+    const MAGIC: [u8; 5] = *b"1234j";
+
+    // NOTE: The test fails for values in 8188..=8191. These values put the
+    // string to search across two calls to read, because the buffer size is
+    // 8192 by default.
+    const BEGIN: usize = 8191;
+
+    /// This is just a structure that implements Reader. The reader
+    /// implementation will simulate a file filled with 0, except for the MAGIC
+    /// string at offset BEGIN.
+    #[derive(Default)]
+    struct R {
+        read: usize,
+    }
+
+    impl Read for R {
+        fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
+            //dbg!(buf.len());
+            if self.read > 100000 {
+                return Ok(0);
+            }
+            let mut from = 0;
+            if self.read < BEGIN {
+                from = buf.len().min(BEGIN - self.read);
+                for x in 0..from {
+                    buf[x] = 0;
+                }
+                self.read += from;
+            }
+            if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
+                let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
+                if to > from {
+                    buf[from..to].copy_from_slice(
+                        &MAGIC
+                            [self.read - BEGIN..self.read - BEGIN + to - from],
+                    );
+                    self.read += to - from;
+                    from = to;
+                }
+            }
+            for x in from..buf.len() {
+                buf[x] = 0;
+                self.read += 1;
+            }
+            Ok(buf.len())
+        }
+    }
+
+    fn run() -> ::std::io::Result<()> {
+        let aut = AhoCorasickBuilder::new().build(&[&MAGIC]);
+
+        // While reading from a vector, it works:
+        let mut buf = vec![];
+        R::default().read_to_end(&mut buf)?;
+        let from_whole = aut.find_iter(&buf).next().unwrap().start();
+
+        // But using stream_find_iter fails!
+        let mut file = R::default();
+        let begin = aut
+            .stream_find_iter(&mut file)
+            .next()
+            .expect("NOT FOUND!!!!")? // Panic here
+            .start();
+        assert_eq!(from_whole, begin);
+        Ok(())
+    }
+
+    run().unwrap()
+}
-- 
2.7.4