Import regex-automata 0.2.0

author Woohyun Jung <wh0705.jung@samsung.com>

Tue, 14 Mar 2023 03:57:19 +0000 (12:57 +0900)

committer Woohyun Jung <wh0705.jung@samsung.com>

Tue, 14 Mar 2023 03:57:19 +0000 (12:57 +0900)
author Woohyun Jung <wh0705.jung@samsung.com>
Tue, 14 Mar 2023 03:57:19 +0000 (12:57 +0900)
committer Woohyun Jung <wh0705.jung@samsung.com>
Tue, 14 Mar 2023 03:57:19 +0000 (12:57 +0900)
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json

new file mode 100644 (file)

index 0000000..56d58a3
--- /dev/null
+++ b/.cargo_vcs_info.json
@@ -0,0 +1,6 @@
+{
+  "git": {
+    "sha1": "0ba880134d649866fa15809dec9c6eae89cd7591"
+  },
+  "path_in_vcs": ""
+}
+\ No newline at end of file
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..0fcea19
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+/tmp
+/target
+/examples/target
+/regex-automata-debug/target
+/regex-cli/target
+/regex-test/target
+tags
+/Cargo.lock
+/examples/Cargo.lock
+BREADCRUMBS
diff --git a/COPYING b/COPYING

new file mode 100644 (file)

index 0000000..bb9c20a
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,3 @@
+This project is dual-licensed under the Unlicense and MIT licenses.
+
+You may use this code under the terms of either license.
diff --git a/Cargo.toml b/Cargo.toml

new file mode 100644 (file)

index 0000000..153f11f
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,88 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2018"
+name = "regex-automata"
+version = "0.2.0"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+exclude = [
+    "/.github",
+    "/scripts/*",
+    "/regex-cli",
+    "/regex-test",
+]
+autoexamples = false
+autotests = false
+description = "Automata construction and matching using regular expressions."
+homepage = "https://github.com/BurntSushi/regex-automata"
+documentation = "https://docs.rs/regex-automata"
+readme = "README.md"
+keywords = [
+    "regex",
+    "dfa",
+    "automata",
+    "automaton",
+    "nfa",
+]
+categories = ["text-processing"]
+license = "Unlicense/MIT"
+repository = "https://github.com/BurntSushi/regex-automata"
+resolver = "2"
+
+[profile.bench]
+debug = true
+
+[profile.dev]
+opt-level = 3
+debug = true
+
+[profile.release]
+debug = true
+
+[profile.test]
+opt-level = 3
+debug = true
+
+[lib]
+bench = false
+
+[[test]]
+name = "integration"
+path = "tests/tests.rs"
+
+[dependencies.fst]
+version = "0.4.5"
+optional = true
+
+[dependencies.log]
+version = "0.4.14"
+optional = true
+
+[dependencies.memchr]
+version = "2.4.0"
+default-features = false
+
+[dependencies.regex-syntax]
+version = "0.6.24"
+optional = true
+
+[features]
+alloc = ["syntax"]
+default = [
+    "std",
+    "alloc",
+    "syntax",
+]
+logging = ["log"]
+std = []
+syntax = ["regex-syntax"]
+transducer = ["fst"]
diff --git a/Cargo.toml.orig b/Cargo.toml.orig

new file mode 100644 (file)

index 0000000..530105d
--- /dev/null
+++ b/Cargo.toml.orig
@@ -0,0 +1,123 @@
+[package]
+name = "regex-automata"
+version = "0.2.0"  #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "Automata construction and matching using regular expressions."
+documentation = "https://docs.rs/regex-automata"
+homepage = "https://github.com/BurntSushi/regex-automata"
+repository = "https://github.com/BurntSushi/regex-automata"
+readme = "README.md"
+keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
+license = "Unlicense/MIT"
+categories = ["text-processing"]
+exclude = [
+  "/.github", "/scripts/*", "/regex-cli", "/regex-test",
+]
+autotests = false
+autoexamples = false
+edition = "2018"
+resolver = "2"
+
+[workspace]
+members = ["bench", "examples", "regex-cli", "regex-test"]
+
+[lib]
+bench = false
+
+[features]
+# WARNING: The features below were assembled quickly without much thought.
+# They might not work as you expect. The safest configuration is the default
+# configuration.
+default = ["std", "alloc", "syntax"]
+std = []
+alloc = ["syntax"]
+transducer = ["fst"]
+logging = ["log"]
+syntax = ["regex-syntax"]
+
+# WARNING: The features below are in a very rough draft form, which is why
+# they are all commented out. I'm still working through the crate feature
+# design, planned for the regex-automata 0.3 release.
+
+# TODO: These features need to be fleshed out more, actually implemented and
+# then tested. Also, add 'alloc' and 'std' features to regex-syntax before
+# doing so.
+#default = ["std", "dfa", "syntax", "unicode", "regex-syntax/default"]
+#std = ["alloc", "memchr/std"]
+# TODO: Should this also imply regex-syntax/alloc? Will that turn into a no-op
+# if regex-syntax isn't enabled as a dependency? Do we need a separate
+# 'alloc_nosyntax' feature to enable alloc features without bringing in
+# regex-syntax? Sigh.
+#alloc = []
+#logging = ["log"]
+#transducer = ["fst"]
+
+# When enabled, the 'dfa' sub-module will be available. Note that if 'dfa' is
+# enabled but 'alloc' is not, then only DFA deserialization and search will be
+# available. DFA construction requires the 'alloc' and 'syntax' features to be
+# enabled.
+#dfa = []
+#syntax = ["regex-syntax"]
+
+## Enables all Unicode features. This expands if new Unicode features are added.
+#unicode = [
+#  "unicode-age",
+#  "unicode-bool",
+#  "unicode-case",
+#  "unicode-gencat",
+#  "unicode-perl",
+#  "unicode-script",
+#  "unicode-segment",
+#  "regex-syntax/unicode",
+#]
+## Enables use of the `Age` property, e.g., `\p{Age:3.0}`.
+#unicode-age = ["regex-syntax/unicode-age"]
+## Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`.
+#unicode-bool = ["regex-syntax/unicode-bool"]
+## Enables Unicode-aware case insensitive matching, e.g., `(?i)β`.
+#unicode-case = ["regex-syntax/unicode-case"]
+## Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`.
+#unicode-gencat = ["regex-syntax/unicode-gencat"]
+## Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`.
+#unicode-perl = ["regex-syntax/unicode-perl"]
+## Enables Unicode scripts and script extensions, e.g., `\p{Greek}`.
+#unicode-script = ["regex-syntax/unicode-script"]
+## Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`.
+#unicode-segment = ["regex-syntax/unicode-segment"]
+
+[dependencies]
+fst = { version = "0.4.5", optional = true }
+log = { version = "0.4.14", optional = true }
+memchr = { version = "2.4.0", default-features = false }
+regex-syntax = { version = "0.6.24", optional = true }
+
+# [dev-dependencies]
+# bstr = { version = "0.2.16", default-features = false, features = ["std"] }
+# quickcheck = { version = "1.0.3", default-features = false }
+# regex-syntax = "0.6.16"
+# regex-test = { version = "*", path = "regex-test" }
+
+[[test]]
+path = "tests/tests.rs"
+name = "integration"
+
+[profile.dev]
+# Running tests takes too long in debug mode, so we forcefully always build
+# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
+#
+# It's counter-intuitive that this needs to be set on dev *and* test, but
+# it's because the tests that take a long time to run are run as integration
+# tests in a separate crate. The test.opt-level setting won't apply there, so
+# we need to set the opt-level across the entire build.
+opt-level = 3
+debug = true
+
+[profile.test]
+opt-level = 3
+debug = true
+
+[profile.release]
+debug = true
+
+[profile.bench]
+debug = true
diff --git a/LICENSE-MIT b/LICENSE-MIT

new file mode 100644 (file)

index 0000000..3b0a5dc
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/PLANS.md b/PLANS.md

new file mode 100644 (file)

index 0000000..2fa9392
--- /dev/null
+++ b/PLANS.md
@@ -0,0 +1,165 @@
+pattern_limit should not be defined inside nfa::thompson, but rather at the
+top-level.
+
+-----
+
+Main problem right now is exemplified by the set60 and set70 failing tests. In
+particular, when finding the starting position while matching multiple regexes
+simultaneously, the reverse search is messed up. The reverse search doesn't
+depend on which regex matched in the forward direction, which means it won't
+always find the correct starting location. Unfortunately, the only way to
+fix this, as far as I can tell, is to add a group of start states for every
+regex in the DFA. Then once we do the reverse search, we need to choose the
+correct start state based on which regex matched in the forward direction.
+
+This is a nasty change.
+
+So it looks like this only applies when doing an overlapping search in reverse
+to find the start of a match. That means we should make this configurable
+but enable it by default for the reverse automata. It should be configurable
+so that folks can construct a regex that doesn't have the ability to do
+overlapping searches correctly. If an overlapping search is attempted with
+a reverse automaton that lacks starting states for each pattern, then the
+implementation should panic.
+
+BUT! It is also convenient to provide this option in general for folks that
+want a DFA that can match any pattern while also being able to match a specific
+pattern.
+
+Straw man:
+
+* Update dense::Config to have a `starts_for_each_pattern` option. It should
+  be disabled by default.
+* In `RegexBuilder::build_many_with_size` tweak the reverse DFA configuration
+  to have the aforementioned option enabled.
+* It would be interesting to add new APIs to `Regex` that support matching
+  specific patterns, but I think this is a complication. If we did want to do
+  this, then we should just add it to the `_at` variants and leave the rest of
+  the API untouched.
+* Add a `pattern_id: Option<PatternID>` parameter to each of the five
+  `*_at` methods on the `dfa::Automaton` trait. A value of `None` retains the
+  existing behavior. A `Some` value means that the starting state for that
+  specific pattern must be chosen, which in turn implies an anchored search.
+  (This means `starts_for_each_pattern` has utility for single-pattern DFAs
+  since it makes it possible to build a DFA that can do both unanchored and
+  anchored searches.)
+* Thread this new parameter down into the various functions in `dfa::search`
+  all the way down into `init_fwd` and `init_rev`. These functions will then
+  pass it to `dfa.start_state_{forward,reverse}`.
+* This is where things get gruesome since we now need to completely re-work how
+  start states are represented in dense and sparse DFAs _and_ it needs to be
+  configurable. It looks like the `Start` type from `dfa::automaton` can
+  basically remain unchanged, since it still represents one of the four
+  possible starting states that will need to be applied for every pattern.
+* For `dfa::dense`, change `StartList` to `StartTable`. Currently, its only
+  header is the state ID count, which is always 4. We'll want to change this
+  to the stride and add a new header value that encodes the number of patterns.
+  When the number of patterns is zero, then existing behavior is preserved and
+  represents the case where `starts_for_each_pattern` is disabled (or in the
+  case of an empty DFA). When non-zero, a table of starting state IDs is
+  encoded with each row corresponding to the 4 starting states for each
+  pattern. Before this table (even if it's empty), the 4 starting states for
+  the entire DFA are encoded.
+* For `dfa::sparse`, do the same as above. They are essentially the same right
+  now anyway, with the only difference being that sparse DFAs use `&[u8]`
+  instead of `&[S]` (because sparse DFAs don't have any alignment
+  requirements).
+* Modify `DFA::empty` to accept a `starts_for_each_pattern` bool that, when
+  true, creates a start table with the header, the start states for the entire
+  DFA and a row of start states for each pattern. When false, no rows are
+  added.
+* Expose whether there are starting states for each pattern via a predicate
+  on the DFA.
+* Modify the determinizer's `add_starts` method to basically do what it does,
+  but also do it for each pattern when the DFA is configured for it. It should
+  continue to reuse states as appropriate or not generate new states if they
+  aren't needed. This will want to use the `NFA::start_pattern` method, which
+  provides the starting NFA state ID for the given pattern.
+* Fix the dense->sparse conversion. At this point, this piece should be fairly
+  straight-forward since the sparse representation of starting states is
+  basically identical to the dense representation.
+
+At this point, I think the bug should resolve itself.
+
+^^^^ DONE! IT WORKS!
+
+-----
+
+
+Add top-level SyntaxConfig (or some such) that has all of the regex-syntax
+options forwarded, but with automata oriented docs. Then use this for all of
+the engines instead of having to repeat every option for every builder.
+
+-----
+
+These produce different results. PCRE2 looks correct. Basically, we should be
+using the context around the `at` position correctly, which we aren't doing
+right now. Seems tricky to get right, particularly when confirming the match
+with a reverse DFA.
+
+Maybe our 'at' functions need to take a full range... Sigh. This is indeed what
+RE2 does. GAH.
+
+fn main() {
+    let re = regex::Regex::new(r"(?-u)\b\sbar").unwrap();
+    let s = "foo bar baz";
+    println!("{:?}", re.find_at(s, 3).map(|m| m.as_str()));
+
+    let re = pcre2::bytes::Regex::new(r"\b\sbar").unwrap();
+    let s = "foo bar baz";
+    println!("{:?}", re.find_at(s.as_bytes(), 3).unwrap());
+}
+
+^^^^ This is fixed now, but we still need to find a way to add test coverage
+for "context" searches. It'd be nice to do this automatically, but we'll
+probably just add a new 'context = [start, end]' option.
+
+-----
+
+
+* Create regex-test crate, based on glob-test. Try to anticipate the needs for
+  the full regex test suite.
+  * See if we can clean up tests.
+    * Provide a way to mark a test as expensive.
+    * Provide a way to test is_match_at and find_at.
+    * Test shortest_match_at too? Huge pain. Add tests for it.
+    * Port ALL tests from the regex crate. Will probably need a way to mark a
+      test as skipped.
+    * Document tests better.
+* Find a way to remove byteorder dependency.
+* Reorganize crate API:
+  * Have errors contain `Box<Error+Send+Sync>` instead of `String`.
+  * Make errors non-exhaustive.
+  * Audit `StateID` trait for safety.
+  * Brainstorm hard about `DFA` trait and the fact that DenseDFA and SparseDFA
+    have inefficient implementations of some methods. Maybe use multiple
+    traits? Answer: get rid of premultiply/classes knobs and just enable
+    them by default. Should remove a huge amount of code.
+  * Check whether `unsafe` is really needed to eliminate bounds checks. Use
+    micro-benchmarks and bigger CLI workloads using `regex-automata-debug`.
+  * Re-write module docs for `dfa` as they are no longer top-level. Keep most.
+  * Retain any pertinent top-level crate docs, but don't rewrite yet.
+  * Clean up builders if we can. e.g., Determinizer, minimizer, it's all a mess
+    right now.
+  * Clean up and add 'always_match' and 'never_match' constructors for every
+    regex engine.
+  * See about supporting ^, $, \A, \z, \b and \B in DFAs. Do the non-Unicode
+    version of \b unfortunately. Carefully scrutinize how the regex crate's
+    lazy DFA does it and try to make it comprehensible. Done! Except for the
+    part about making it comprehensible.
+* Rethink prefilters?
+* Add `regex-automata-generate` CLI tool. This should just be a copy of
+  the `ucd-generate dfa` and `ucd-generate regex` commands.
+
+Then build new public `nfa` sub-module.
+  * For Unicode \b, generate \w DFA (forwards and reverse) and embed it into
+    source for fast checking. That way, we don't need to ever do explicit UTF-8
+    decoding anywhere. Yay.
+
+Then `lazy` sub-module.
+
+Then `onepass`.
+
+Then `jit`.
+
+... and beyond? CRAZY. But it can be done! Build STRONG base layers.
diff --git a/README.md b/README.md

new file mode 100644 (file)

index 0000000..23e0bff
--- /dev/null
+++ b/README.md
@@ -0,0 +1,222 @@
+regex-automata
+==============
+A low level regular expression library that uses deterministic finite automata.
+It supports a rich syntax with Unicode support, has extensive options for
+configuring the best space vs time trade off for your use case and provides
+support for cheap deserialization of automata for use in `no_std` environments.
+
+[![Build status](https://github.com/BurntSushi/regex-automata/workflows/ci/badge.svg)](https://github.com/BurntSushi/regex-automata/actions)
+[![Crates.io](https://img.shields.io/crates/v/regex-automata.svg)](https://crates.io/crates/regex-automata)
+![Minimum Supported Rust Version 1.41](https://img.shields.io/badge/rustc-1.41-green)
+
+Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
+
+
+### Documentation
+
+https://docs.rs/regex-automata
+
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+regex-automata = "0.1"
+```
+
+**WARNING**: The `master` branch currently contains code for the `0.2` release,
+but this README still targets the `0.1` release. Namely, it is recommended to
+stick with the `0.1` release. The `0.2` release was made prematurely in order
+to unblock some folks.
+
+
+### Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```rust
+use regex_automata::Regex;
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
+assert_eq!(matches, vec![(0, 10), (11, 21)]);
+```
+
+For more examples and information about the various knobs that can be turned,
+please see the [docs](https://docs.rs/regex-automata/0.1).
+
+
+### Support for `no_std`
+
+This crate comes with a `std` feature that is enabled by default. When the
+`std` feature is enabled, the API of this crate will include the facilities
+necessary for compiling, serializing, deserializing and searching with regular
+expressions. When the `std` feature is disabled, the API of this crate will
+shrink such that it only includes the facilities necessary for deserializing
+and searching with regular expressions.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `std` feature that compiles and serializes a
+  regular expression. Serialization should only happen after first converting
+  the DFAs to use a fixed size state identifier instead of the default `usize`.
+  You may also need to serialize both little and big endian versions of each
+  DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+  your previously serialized DFAs into regexes. You can then search with them
+  as you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
+Note that the
+[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
+tool will do the first step for you with its `dfa` or `regex` sub-commands.
+
+
+### Cargo features
+
+* `std` - **Enabled** by default. This enables the ability to compile finite
+  automata. This requires the `regex-syntax` dependency. Without this feature
+  enabled, finite automata can only be used for searching (using the approach
+  described above).
+* `transducer` - **Disabled** by default. This provides implementations of the
+  `Automaton` trait found in the `fst` crate. This permits using finite
+  automata generated by this crate to search finite state transducers. This
+  requires the `fst` dependency.
+
+
+### Differences with the regex crate
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this crate provides a lower level
+regular expression interface that is a bit less convenient while providing more
+explicit control over memory usage and search times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+  of the regex pattern. While most patterns do not exhibit worst case
+  exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
+  build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
+  not be compiled with this library. (In the future, the API may expose an
+  option to return an error if the DFA gets too big.)
+* This crate does not support sub-match extraction, which can be achieved with
+  the regex crate's "captures" API. This may be added in the future, but is
+  unlikely.
+* While the regex crate doesn't necessarily sport fast compilation times, the
+  regexes in this crate are almost universally slow to compile, especially when
+  they contain large Unicode character classes. For example, on my system,
+  compiling `\w{3}` with byte classes enabled takes just over 1 second and
+  almost 5MB of memory! (Compiling a sparse regex takes about the same time
+  but only uses about 500KB of memory.) Conversely, compiling the same regex
+  without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
+  less than 5KB of memory. For this reason, you should only use Unicode
+  character classes if you absolutely need them!
+* This crate does not support regex sets.
+* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
+  `\B`.
+* As a lower level crate, this library does not do literal optimizations. In
+  exchange, you get predictable performance regardless of input. The
+  philosophy here is that literal optimizations should be applied at a higher
+  level, although there is no easy support for this in the ecosystem yet.
+* There is no `&str` API like in the regex crate. In this crate, all APIs
+  operate on `&[u8]`. By default, match indices are guaranteed to fall on
+  UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+  deserialized. Deserialization always takes constant time since searching can
+  be performed directly on the raw serialized bytes of a DFA.
+* This crate was specifically designed so that the searching phase of a DFA has
+  minimal runtime requirements, and can therefore be used in `no_std`
+  environments. While `no_std` environments cannot compile regexes, they can
+  deserialize pre-compiled regexes.
+* Since this crate builds DFAs ahead of time, it will generally out-perform
+  the `regex` crate on equivalent tasks. The performance difference is likely
+  not large. However, because of a complex set of optimizations in the regex
+  crate (like literal optimizations), an accurate performance comparison may be
+  difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+  performance a small amount, but uses much less storage space. Potentially
+  even less than what the regex crate uses.
+* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`,
+  which enables one to do less work in some cases. For example, if you only
+  need the end of a match and not the start of a match, then you can use a DFA
+  directly without building a `Regex`, which always requires a second DFA to
+  find the start of a match.
+* Aside from choosing between dense and sparse DFAs, there are several options
+  for configuring the space usage vs search time trade off. These include
+  things like choosing a smaller state identifier representation, to
+  premultiplying state identifiers and splitting a DFA's alphabet into
+  equivalence classes. Finally, DFA minimization is also provided, but can
+  increase compilation times dramatically.
+
+
+### Future work
+
+* Look into being smarter about generating NFA states for large Unicode
+  character classes. These can create a lot of additional work for both the
+  determinizer and the minimizer, and I suspect this is the key thing we'll
+  want to improve if we want to make DFA compile times faster. I *believe*
+  it's possible to potentially build minimal or nearly minimal NFAs for the
+  special case of Unicode character classes by leveraging Daciuk's algorithms
+  for building minimal automata in linear time for sets of strings. See
+  https://blog.burntsushi.net/transducers/#construction for more details. The
+  key adaptation I think we need to make is to modify the algorithm to operate
+  on byte ranges instead of enumerating every codepoint in the set. Otherwise,
+  it might not be worth doing.
+* Add support for regex sets. It should be possible to do this by "simply"
+  introducing more match states. I think we can also report the positions at
+  each match, similar to how Aho-Corasick works. I think the long pole in the
+  tent here is probably the API design work and arranging it so that we don't
+  introduce extra overhead into the non-regex-set case without duplicating a
+  lot of code. It seems doable.
+* Stretch goal: support capturing groups by implementing "tagged" DFA
+  (transducers). Laurikari's paper is the usual reference here, but Trofimovich
+  has a much more thorough treatment here:
+  https://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf
+  I've only read the paper once. I suspect it will require at least a few more
+  read throughs before I understand it.
+  See also: https://re2c.org
+* Possibly less ambitious goal: can we select a portion of Trofimovich's work
+  to make small fixed length look-around work? It would be really nice to
+  support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $.
+* Experiment with code generating Rust code. There is an early experiment in
+  src/codegen.rs that is thoroughly bit-rotted. At the time, I was
+  experimenting with whether or not codegen would significantly decrease the size
+  of a DFA, since if you squint hard enough, it's kind of like a sparse
+  representation. However, it didn't shrink as much as I thought it would, so
+  I gave up. The other problem is that Rust doesn't support gotos, so I don't
+  even know whether the "match on each state" in a loop thing will be fast
+  enough. Either way, it's probably a good option to have. For one thing, it
+  would be endian independent whereas the serialization format of the DFAs in
+  this crate is endian dependent (so you need two versions of every DFA, but
+  you only need to compile one of them for any given arch).
+* Experiment with unrolling the match loops and fill out the benchmarks.
+* Add some kind of streaming API. I believe users of the library can already
+  implement something for this outside of the crate, but it would be good to
+  provide an official API. The key thing here is figuring out the API. I
+  suspect we might want to support several variants.
+* Make a decision on whether or not there is room for literal optimizations
+  in this crate. My original intent was to not let this crate sink down into
+  that very very very deep rabbit hole. But instead, we might want to provide
+  some way for literal optimizations to hook into the match routines. The right
+  path forward here is to probably build something outside of the crate and
+  then see about integrating it. After all, users can implement their own
+  match routines just as efficiently as what the crate provides.
+* A key downside of DFAs is that they can take up a lot of memory and can be
+  quite costly to build. Their worst case compilation time is O(2^n), where
+  n is the number of NFA states. A paper by Yang and Prasanna (2011) actually
+  seems to provide a way to characterize state blow up such that it is detectable.
+  If we could know whether a regex will exhibit state explosion or not, then
+  we could make an intelligent decision about whether to ahead-of-time compile
+  a DFA.
+  See: https://www.researchgate.net/profile/Xu-Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000/Characterization-of-a-global-germplasm-collection-and-its-potential-utilization-for-analysis-of-complex-quantitative-traits-in-maize.pdf
diff --git a/TODO b/TODO

new file mode 100644 (file)

index 0000000..68f0187
--- /dev/null
+++ b/TODO
@@ -0,0 +1,13 @@
+* Consider refactoring the NFA representation such that it can be instantly
+  loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this
+  could negatively impact using the NFA with deserialization costs. Before
+  doing this, we should write PikeVM and backtracking implementations so that
+  they can be benchmarked.
+* Add captures to NFA.
+* Once we're happy, re-organize the public API such that NFAs are exported
+  and usable on their own.
+
+* Investigate why NFA shrinking seems to produce bigger DFAs after
+  determinization, even though it makes determinization substantially
+  faster. This might be because of its use of sparse NFA states, which have
+  a lower constant overhead associated with them.
diff --git a/UNLICENSE b/UNLICENSE

new file mode 100644 (file)

index 0000000..68a49da
--- /dev/null
+++ b/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/rustfmt.toml b/rustfmt.toml

new file mode 100644 (file)

index 0000000..aa37a21
--- /dev/null
+++ b/rustfmt.toml
@@ -0,0 +1,2 @@
+max_width = 79
+use_small_heuristics = "max"
diff --git a/src/dfa/accel.rs b/src/dfa/accel.rs

new file mode 100644 (file)

index 0000000..dbfeb79
--- /dev/null
+++ b/src/dfa/accel.rs
@@ -0,0 +1,507 @@
+// This module defines some core types for dealing with accelerated DFA states.
+// Briefly, a DFA state can be "accelerated" if all of its transitions except
+// for a few loop back to itself. This directly implies that the only way out
+// of such a state is if a byte corresponding to one of those non-loopback
+// transitions is found. Such states are often found in simple repetitions in
+// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
+// DFA with regex-cli:
+//
+//     $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
+//     dense::DFA(
+//     D 000000:
+//     Q 000001:
+//      *000002:
+//     A 000003: \x00-` => 3, a => 5, b-\xFF => 3
+//      >000004: \x00-` => 3, a => 4, b-\xFF => 3
+//       000005: \x00-\xFF => 2, EOI => 2
+//     )
+//
+// In particular, state 3 is accelerated (shown via the 'A' indicator) since
+// the only way to leave that state once entered is to see an 'a' byte. If
+// there is a long run of non-'a' bytes, then using something like 'memchr'
+// to find the next 'a' byte can be significantly faster than just using the
+// standard byte-at-a-time state machine.
+//
+// Unfortunately, this optimization rarely applies when Unicode is enabled.
+// For example, patterns like '[^a]' don't actually match any byte that isn't
+// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
+// 'a'. This makes the state machine much more complex---far beyond a single
+// state---and removes the ability to easily accelerate it. (Because if the
+// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
+//
+// In practice, we only consider accelerating states that have 3 or fewer
+// non-loop transitions, both because you get diminishing returns beyond that
+// point and because that's what the memchr crate supports. The structures
+// below hard-code this assumption and provide (de)serialization APIs for use
+// inside a DFA.
+//
+// And finally, note that there is some trickery involved in making it very
+// fast to not only check whether a state is accelerated at search time, but
+// also to access the bytes to search for to implement the acceleration itself.
+// dfa/special.rs provides more detail, but the short story is that all
+// accelerated states appear contiguously in a DFA. This means we can represent
+// the ID space of all accelerated DFA states with a single range. So given
+// a state ID, we can determine whether it's accelerated via
+//
+//     min_accel_id <= id <= max_accel_id
+//
+// And find its corresponding accelerator with:
+//
+//     accels.get((id - min_accel_id) / dfa_stride)
+
+use core::convert::{TryFrom, TryInto};
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::bytes::{self, DeserializeError, Endian, SerializeError};
+
+/// The base type used to represent a collection of accelerators.
+///
+/// While an `Accel` is represented as a fixed size array of bytes, a
+/// *collection* of `Accel`s (called `Accels`) is represented internally as a
+/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
+/// fairly low-risk not-safe code, it lets us remove the need for a second type
+/// parameter in the definition of dense::DFA. (Which really wants everything
+/// to be a slice of u32.)
+type AccelTy = u32;
+
+/// The size of the unit of representation for accelerators.
+///
+/// ACCEL_CAP *must* be a multiple of this size.
+const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
+
+/// The maximum length in bytes that a single Accel can be. This is distinct
+/// from the capacity of an accelerator in that the length represents only the
+/// bytes that should be read.
+const ACCEL_LEN: usize = 4;
+
+/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
+/// multiple of 4 (our ID size) and because it gives us a little wiggle room
+/// if we want to support more accel bytes in the future without a breaking
+/// change.
+///
+/// This MUST be a multiple of ACCEL_TY_SIZE.
+const ACCEL_CAP: usize = 8;
+
+/// Search for between 1 and 3 needle bytes in the given haystack, starting the
+/// search at the given position. If `needles` has a length other than 1-3,
+/// then this panics.
+#[inline(always)]
+pub(crate) fn find_fwd(
+    needles: &[u8],
+    haystack: &[u8],
+    at: usize,
+) -> Option<usize> {
+    let bs = needles;
+    let i = match needles.len() {
+        1 => memchr::memchr(bs[0], &haystack[at..])?,
+        2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
+        3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
+        0 => panic!("cannot find with empty needles"),
+        n => panic!("invalid needles length: {}", n),
+    };
+    Some(at + i)
+}
+
+/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
+/// starting the search at the given position. If `needles` has a length other
+/// than 1-3, then this panics.
+#[inline(always)]
+pub(crate) fn find_rev(
+    needles: &[u8],
+    haystack: &[u8],
+    at: usize,
+) -> Option<usize> {
+    let bs = needles;
+    match needles.len() {
+        1 => memchr::memrchr(bs[0], &haystack[..at]),
+        2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
+        3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
+        0 => panic!("cannot find with empty needles"),
+        n => panic!("invalid needles length: {}", n),
+    }
+}
+
+/// Represents the accelerators for all accelerated states in a dense DFA.
+///
+/// The `A` type parameter represents the type of the underlying bytes.
+/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
+#[derive(Clone)]
+pub(crate) struct Accels<A> {
+    /// A length prefixed slice of contiguous accelerators. See the top comment
+    /// in this module for more details on how we can jump from a DFA's state
+    /// ID to an accelerator in this list.
+    ///
+    /// The first 4 bytes always correspond to the number of accelerators
+    /// that follow.
+    accels: A,
+}
+
+#[cfg(feature = "alloc")]
+impl Accels<Vec<AccelTy>> {
+    /// Create an empty sequence of accelerators for a DFA.
+    pub fn empty() -> Accels<Vec<AccelTy>> {
+        Accels { accels: vec![0] }
+    }
+
+    /// Add an accelerator to this sequence.
+    ///
+    /// This adds the accelerator to the end of the sequence and therefore
+    /// should be done in correspondence with its state in the DFA.
+    ///
+    /// This panics if this results in more accelerators than AccelTy::MAX.
+    pub fn add(&mut self, accel: Accel) {
+        self.accels.extend_from_slice(&accel.as_accel_tys());
+        let len = self.len();
+        self.set_len(len + 1);
+    }
+
+    /// Set the number of accelerators in this sequence, which is encoded in
+    /// the first 4 bytes of the underlying bytes.
+    fn set_len(&mut self, new_len: usize) {
+        // The only way an accelerator gets added is if a state exists for
+        // it, and if a state exists, then its index is guaranteed to be
+        // representable by an AccelTy by virtue of the guarantees provided by
+        // StateID.
+        let new_len = AccelTy::try_from(new_len).unwrap();
+        self.accels[0] = new_len;
+    }
+}
+
+impl<'a> Accels<&'a [AccelTy]> {
+    /// Deserialize a sequence of accelerators from the given bytes. If there
+    /// was a problem deserializing, then an error is returned.
+    ///
+    /// This is guaranteed to run in constant time. This does not guarantee
+    /// that every accelerator in the returned collection is valid. Thus,
+    /// accessing one may panic, or not-safe code that relies on accelerators
+    /// being correct may result in UB.
+    ///
+    /// Callers may check the validity of every accelerator with the `validate`
+    /// method.
+    pub unsafe fn from_bytes_unchecked(
+        mut slice: &'a [u8],
+    ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
+        let slice_start = slice.as_ptr() as usize;
+
+        let (count, _) =
+            bytes::try_read_u32_as_usize(slice, "accelerators count")?;
+        // The accelerator count is part of the accel_tys slice that
+        // we deserialize. This is perhaps a bit idiosyncratic. It would
+        // probably be better to split out the count into a real field.
+
+        let accel_tys_count = bytes::add(
+            bytes::mul(count, 2, "total number of accelerator accel_tys")?,
+            1,
+            "total number of accel_tys",
+        )?;
+        let accel_tys_len = bytes::mul(
+            ACCEL_TY_SIZE,
+            accel_tys_count,
+            "total number of bytes in accelerators",
+        )?;
+        bytes::check_slice_len(slice, accel_tys_len, "accelerators")?;
+        bytes::check_alignment::<AccelTy>(slice)?;
+        let accel_tys = &slice[..accel_tys_len];
+        slice = &slice[accel_tys_len..];
+        // SAFETY: We've checked the length and alignment above, and since
+        // slice is just bytes, we can safely cast to a slice of &[AccelTy].
+        #[allow(unused_unsafe)]
+        let accels = unsafe {
+            core::slice::from_raw_parts(
+                accel_tys.as_ptr() as *const AccelTy,
+                accel_tys_count,
+            )
+        };
+        Ok((Accels { accels }, slice.as_ptr() as usize - slice_start))
+    }
+}
+
+impl<A: AsRef<[AccelTy]>> Accels<A> {
+    /// Return an owned version of the accelerators.
+    #[cfg(feature = "alloc")]
+    pub fn to_owned(&self) -> Accels<Vec<AccelTy>> {
+        Accels { accels: self.accels.as_ref().to_vec() }
+    }
+
+    /// Return a borrowed version of the accelerators.
+    pub fn as_ref(&self) -> Accels<&[AccelTy]> {
+        Accels { accels: self.accels.as_ref() }
+    }
+
+    /// Return the bytes representing the serialization of the accelerators.
+    pub fn as_bytes(&self) -> &[u8] {
+        let accels = self.accels.as_ref();
+        // SAFETY: This is safe because accels is just a slice of AccelTy,
+        // and u8 always has a smaller alignment.
+        unsafe {
+            core::slice::from_raw_parts(
+                accels.as_ptr() as *const u8,
+                accels.len() * ACCEL_TY_SIZE,
+            )
+        }
+    }
+
+    /// Returns the memory usage, in bytes, of these accelerators.
+    ///
+    /// The memory usage is computed based on the number of bytes used to
+    /// represent all of the accelerators.
+    ///
+    /// This does **not** include the stack size used by this value.
+    pub fn memory_usage(&self) -> usize {
+        self.as_bytes().len()
+    }
+
+    /// Return the bytes to search for corresponding to the accelerator in this
+    /// sequence at index `i`. If no such accelerator exists, then this panics.
+    ///
+    /// The significance of the index is that it should be in correspondence
+    /// with the index of the corresponding DFA. That is, accelerated DFA
+    /// states are stored contiguously in the DFA and have an ordering implied
+    /// by their respective state IDs. The state's index in that sequence
+    /// corresponds to the index of its corresponding accelerator.
+    #[inline(always)]
+    pub fn needles(&self, i: usize) -> &[u8] {
+        if i >= self.len() {
+            panic!("invalid accelerator index {}", i);
+        }
+        let bytes = self.as_bytes();
+        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
+        let len = bytes[offset] as usize;
+        &bytes[offset + 1..offset + 1 + len]
+    }
+
+    /// Return the total number of accelerators in this sequence.
+    pub fn len(&self) -> usize {
+        // This should never panic since deserialization checks that the
+        // length can fit into a usize.
+        usize::try_from(self.accels.as_ref()[0]).unwrap()
+    }
+
+    /// Return the accelerator in this sequence at index `i`. If no such
+    /// accelerator exists, then this returns None.
+    ///
+    /// See the docs for `needles` on the significance of the index.
+    fn get(&self, i: usize) -> Option<Accel> {
+        if i >= self.len() {
+            return None;
+        }
+        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
+        let accel = Accel::from_slice(&self.as_bytes()[offset..])
+            .expect("Accels must contain valid accelerators");
+        Some(accel)
+    }
+
+    /// Returns an iterator of accelerators in this sequence.
+    fn iter(&self) -> IterAccels<'_, A> {
+        IterAccels { accels: self, i: 0 }
+    }
+
+    /// Writes these accelerators to the given byte buffer using the indicated
+    /// endianness. If the given buffer is too small, then an error is
+    /// returned. Upon success, the total number of bytes written is returned.
+    /// The number of bytes written is guaranteed to be a multiple of 4.
+    pub fn write_to<E: Endian>(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        assert_eq!(
+            nwrite % ACCEL_TY_SIZE,
+            0,
+            "expected accelerator bytes written to be a multiple of {}",
+            ACCEL_TY_SIZE,
+        );
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("accelerators"));
+        }
+
+        // The number of accelerators can never exceed AccelTy::MAX.
+        E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
+        // The actual accelerators are just raw bytes and thus their endianness
+        // is irrelevant. So we can copy them as bytes.
+        dst[ACCEL_TY_SIZE..nwrite]
+            .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
+        Ok(nwrite)
+    }
+
+    /// Validates that every accelerator in this collection can be successfully
+    /// deserialized as a valid accelerator.
+    pub fn validate(&self) -> Result<(), DeserializeError> {
+        for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
+            let _ = Accel::from_slice(chunk)?;
+        }
+        Ok(())
+    }
+
+    /// Returns the total number of bytes written by `write_to`.
+    pub fn write_to_len(&self) -> usize {
+        self.as_bytes().len()
+    }
+}
+
+impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "Accels(")?;
+        let mut list = f.debug_list();
+        for a in self.iter() {
+            list.entry(&a);
+        }
+        list.finish()?;
+        write!(f, ")")
+    }
+}
+
+#[derive(Debug)]
+struct IterAccels<'a, A: AsRef<[AccelTy]>> {
+    accels: &'a Accels<A>,
+    i: usize,
+}
+
+impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
+    type Item = Accel;
+
+    fn next(&mut self) -> Option<Accel> {
+        let accel = self.accels.get(self.i)?;
+        self.i += 1;
+        Some(accel)
+    }
+}
+
+/// Accel represents a structure for determining how to "accelerate" a DFA
+/// state.
+///
+/// Namely, it contains zero or more bytes that must be seen in order for the
+/// DFA to leave the state it is associated with. In practice, the actual range
+/// is 1 to 3 bytes.
+///
+/// The purpose of acceleration is to identify states whose vast majority
+/// of transitions are just loops back to the same state. For example,
+/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
+/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
+/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
+/// looking for the next occurrence of either `a` or `b` instead of explicitly
+/// following transitions. (In this case, `b` transitions to the next state
+/// whereas `a` would transition to the dead state.)
+#[derive(Clone)]
+pub(crate) struct Accel {
+    /// The first byte is the length. Subsequent bytes are the accelerated
+    /// bytes.
+    ///
+    /// Note that we make every accelerator 8 bytes as a slightly wasteful
+    /// way of making sure alignment is always correct for state ID sizes of
+    /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
+    /// particularly common, especially when Unicode is enabled.
+    bytes: [u8; ACCEL_CAP],
+}
+
+impl Accel {
+    /// Returns an empty accel, where no bytes are accelerated.
+    #[cfg(feature = "alloc")]
+    pub fn new() -> Accel {
+        Accel { bytes: [0; ACCEL_CAP] }
+    }
+
+    /// Returns a verified accelerator derived from the beginning of the given
+    /// slice.
+    ///
+    /// If the slice is not long enough or contains invalid bytes for an
+    /// accelerator, then this returns an error.
+    pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
+        slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
+        let bytes = slice
+            .try_into()
+            .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
+        Accel::from_bytes(bytes)
+    }
+
+    /// Returns a verified accelerator derived from raw bytes.
+    ///
+    /// If the given bytes are invalid, then this returns an error.
+    fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
+        if bytes[0] as usize >= ACCEL_LEN {
+            return Err(DeserializeError::generic(
+                "accelerator bytes cannot have length more than 3",
+            ));
+        }
+        Ok(Accel::from_bytes_unchecked(bytes))
+    }
+
+    /// Returns an accelerator derived from raw bytes.
+    ///
+    /// This does not check whether the given bytes are valid. Invalid bytes
+    /// cannot sacrifice memory safety, but may result in panics or silent
+    /// logic bugs.
+    fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
+        Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
+    }
+
+    /// Attempts to add the given byte to this accelerator. If the accelerator
+    /// is already full then this returns false. Otherwise, returns true.
+    ///
+    /// If the given byte is already in this accelerator, then it panics.
+    #[cfg(feature = "alloc")]
+    pub fn add(&mut self, byte: u8) -> bool {
+        if self.len() >= 3 {
+            return false;
+        }
+        assert!(
+            !self.contains(byte),
+            "accelerator already contains {:?}",
+            crate::util::DebugByte(byte)
+        );
+        self.bytes[self.len() + 1] = byte;
+        self.bytes[0] += 1;
+        true
+    }
+
+    /// Return the number of bytes in this accelerator.
+    pub fn len(&self) -> usize {
+        self.bytes[0] as usize
+    }
+
+    /// Returns true if and only if there are no bytes in this accelerator.
+    #[cfg(feature = "alloc")]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the slice of bytes to accelerate.
+    ///
+    /// If this accelerator is empty, then this returns an empty slice.
+    fn needles(&self) -> &[u8] {
+        &self.bytes[1..1 + self.len()]
+    }
+
+    /// Returns true if and only if this accelerator will accelerate the given
+    /// byte.
+    #[cfg(feature = "alloc")]
+    fn contains(&self, byte: u8) -> bool {
+        self.needles().iter().position(|&b| b == byte).is_some()
+    }
+
+    /// Returns the accelerator bytes as an array of AccelTys.
+    #[cfg(feature = "alloc")]
+    fn as_accel_tys(&self) -> [AccelTy; 2] {
+        assert_eq!(ACCEL_CAP, 8);
+        // These unwraps are OK since ACCEL_CAP is set to 8.
+        let first =
+            AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
+        let second =
+            AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
+        [first, second]
+    }
+}
+
+impl core::fmt::Debug for Accel {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "Accel(")?;
+        let mut set = f.debug_set();
+        for &b in self.needles() {
+            set.entry(&crate::util::DebugByte(b));
+        }
+        set.finish()?;
+        write!(f, ")")
+    }
+}
diff --git a/src/dfa/automaton.rs b/src/dfa/automaton.rs

new file mode 100644 (file)

index 0000000..08bd672
--- /dev/null
+++ b/src/dfa/automaton.rs
@@ -0,0 +1,1903 @@
+use crate::{
+    dfa::search,
+    util::{
+        id::{PatternID, StateID},
+        matchtypes::{HalfMatch, MatchError},
+        prefilter,
+    },
+};
+
+/// A trait describing the interface of a deterministic finite automaton (DFA).
+///
+/// The complexity of this trait probably means that it's unlikely for others
+/// to implement it. The primary purpose of the trait is to provide for a way
+/// of abstracting over different types of DFAs. In this crate, that means
+/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, whereas
+/// sparse DFAs are slower but come with a smaller memory footprint. But
+/// they otherwise provide exactly equivalent expressive power.) For example, a
+/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait.
+///
+/// Normally, a DFA's execution model is very simple. You might have a single
+/// start state, zero or more final or "match" states and a function that
+/// transitions from one state to the next given the next byte of input.
+/// Unfortunately, the interface described by this trait is significantly
+/// more complicated than this. The complexity has a number of different
+/// reasons, mostly motivated by performance, functionality or space savings:
+///
+/// * A DFA can search for multiple patterns simultaneously. This
+/// means extra information is returned when a match occurs. Namely,
+/// a match is not just an offset, but an offset plus a pattern ID.
+/// [`Automaton::pattern_count`] returns the number of patterns compiled into
+/// the DFA, [`Automaton::match_count`] returns the total number of patterns
+/// that match in a particular state and [`Automaton::match_pattern`] permits
+/// iterating over the patterns that match in a particular state.
+/// * A DFA can have multiple start states, and the choice of which start
+/// state to use depends on the content of the string being searched and
+/// position of the search, as well as whether the search is an anchored
+/// search for a specific pattern in the DFA. Moreover, computing the start
+/// state also depends on whether you're doing a forward or a reverse search.
+/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`]
+/// are used to compute the start state for forward and reverse searches,
+/// respectively.
+/// * All matches are delayed by one byte to support things like `$` and `\b`
+/// at the end of a pattern. Therefore, every use of a DFA is required to use
+/// [`Automaton::next_eoi_state`]
+/// at the end of the search to compute the final transition.
+/// * For optimization reasons, some states are treated specially. Every
+/// state is either special or not, which can be determined via the
+/// [`Automaton::is_special_state`] method. If it's special, then the state
+/// must be at least one of a few possible types of states. (Note that some
+/// types can overlap, for example, a match state can also be an accel state.
+/// But some types can't. If a state is a dead state, then it can never be any
+/// other type of state.) Those types are:
+///     * A dead state. A dead state means the DFA will never enter a match
+///     state. This can be queried via the [`Automaton::is_dead_state`] method.
+///     * A quit state. A quit state occurs if the DFA had to stop the search
+///     prematurely for some reason. This can be queried via the
+///     [`Automaton::is_quit_state`] method.
+///     * A match state. A match state occurs when a match is found. When a DFA
+///     enters a match state, the search may stop immediately (when looking
+///     for the earliest match), or it may continue to find the leftmost-first
+///     match. This can be queried via the [`Automaton::is_match_state`]
+///     method.
+///     * A start state. A start state is where a search begins. For every
+///     search, there is exactly one start state that is used, however, a
+///     DFA may contain many start states. When the search is in a start
+///     state, it may use a prefilter to quickly skip to candidate matches
+///     without executing the DFA on every byte. This can be queried via the
+///     [`Automaton::is_start_state`] method.
+///     * An accel state. An accel state is a state that is accelerated.
+///     That is, it is a state where _most_ of its transitions loop back to
+///     itself and only a small number of transitions lead to other states.
+///     This kind of state is said to be accelerated because a search routine
+///     can quickly look for the bytes leading out of the state instead of
+///     continuing to execute the DFA on each byte. This can be queried via the
+///     [`Automaton::is_accel_state`] method. And the bytes that lead out of
+///     the state can be queried via the [`Automaton::accelerator`] method.
+///
+/// There are a number of provided methods on this trait that implement
+/// efficient searching (for forwards and backwards) with a DFA using all of
+/// the above features of this trait. In particular, given the complexity of
+/// all these features, implementing a search routine in this trait is not
+/// straightforward. If you need to do this for specialized reasons, then
+/// it's recommended to look at the source of this crate. It is intentionally
+/// well commented to help with this. With that said, it is possible to
+/// somewhat simplify the search routine. For example, handling accelerated
+/// states is strictly optional, since it is always correct to assume that
+/// `Automaton::is_accel_state` returns false. However, one complex part of
+/// writing a search routine using this trait is handling the 1-byte delay of a
+/// match. That is not optional.
+///
+/// # Safety
+///
+/// This trait is unsafe to implement because DFA searching may rely on the
+/// correctness of the implementation for memory safety. For example, DFA
+/// searching may use explicit bounds check elision, which will in turn rely
+/// on the correctness of every function that returns a state ID.
+///
+/// When implementing this trait, one must uphold the documented correctness
+/// guarantees. Otherwise, undefined behavior may occur.
+pub unsafe trait Automaton {
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input.
+    ///
+    /// Implementations must guarantee that the returned ID is always a valid
+    /// ID when `current` refers to a valid ID. Moreover, the transition
+    /// function must be defined for all possible values of `input`.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid ID.
+    /// However, if the caller provides an invalid ID then this must never
+    /// sacrifice memory safety.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a DFA for a given haystack
+    /// by using the `next_state` method.
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense};
+    ///
+    /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut state = dfa.start_state_forward(
+    ///     None, haystack, 0, haystack.len(),
+    /// );
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     state = dfa.next_state(state, b);
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+    /// // special "EOI" transition at the end of the search.
+    /// state = dfa.next_eoi_state(state);
+    /// assert!(dfa.is_match_state(state));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn next_state(&self, current: StateID, input: u8) -> StateID;
+
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input.
+    ///
+    /// Unlike [`Automaton::next_state`], implementations may implement this
+    /// more efficiently by assuming that the `current` state ID is valid.
+    /// Typically, this manifests by eliding bounds checks.
+    ///
+    /// # Safety
+    ///
+    /// Callers of this method must guarantee that `current` refers to a valid
+    /// state ID. If `current` is not a valid state ID for this automaton, then
+    /// calling this routine may result in undefined behavior.
+    ///
+    /// If `current` is valid, then implementations must guarantee that the ID
+    /// returned is valid for all possible values of `input`.
+    unsafe fn next_state_unchecked(
+        &self,
+        current: StateID,
+        input: u8,
+    ) -> StateID;
+
+    /// Transitions from the current state to the next state for the special
+    /// EOI symbol.
+    ///
+    /// Implementations must guarantee that the returned ID is always a valid
+    /// ID when `current` refers to a valid ID.
+    ///
+    /// This routine must be called at the end of every search in a correct
+    /// implementation of search. Namely, DFAs in this crate delay matches
+    /// by one byte in order to support look-around operators. Thus, after
+    /// reaching the end of a haystack, a search implementation must follow one
+    /// last EOI transition.
+    ///
+    /// It is best to think of EOI as an additional symbol in the alphabet of
+    /// a DFA that is distinct from every other symbol. That is, the alphabet
+    /// of DFAs in this crate has a logical size of 257 instead of 256, where
+    /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+    /// physical alphabet size may be smaller because of alphabet compression
+    /// via equivalence classes, but EOI is always represented somehow in the
+    /// alphabet.)
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid ID.
+    /// However, if the caller provides an invalid ID then this must never
+    /// sacrifice memory safety.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a DFA for a given haystack,
+    /// and then finishing the search with the final EOI transition.
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense};
+    ///
+    /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut state = dfa.start_state_forward(
+    ///     None, haystack, 0, haystack.len(),
+    /// );
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     state = dfa.next_state(state, b);
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk
+    /// // the special "EOI" transition at the end of the search. Without this
+    /// // final transition, the assert below will fail since the DFA will not
+    /// // have entered a match state yet!
+    /// state = dfa.next_eoi_state(state);
+    /// assert!(dfa.is_match_state(state));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn next_eoi_state(&self, current: StateID) -> StateID;
+
+    /// Return the ID of the start state for this DFA when executing a forward
+    /// search.
+    ///
+    /// Unlike typical DFA implementations, the start state for DFAs in this
+    /// crate is dependent on a few different factors:
+    ///
+    /// * The pattern ID, if present. When the underlying DFA has been compiled
+    /// with multiple patterns _and_ the DFA has been configured to compile
+    /// an anchored start state for each pattern, then a pattern ID may be
+    /// specified to execute an anchored search for that specific pattern.
+    /// If `pattern_id` is invalid or if the DFA doesn't have start states
+    /// compiled for each pattern, then implementations must panic. DFAs in
+    /// this crate can be configured to compile start states for each pattern
+    /// via
+    /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern).
+    /// * When `start > 0`, the byte at index `start - 1` may influence the
+    /// start state if the regex uses `^` or `\b`.
+    /// * Similarly, when `start == 0`, the search begins at the start of
+    /// `bytes`, which may influence the start state when the regex uses `^`
+    /// or `\A`.
+    /// * Currently, `end` is unused.
+    /// * Whether the search is a forward or reverse search. This routine can
+    /// only be used for forward searches.
+    ///
+    /// # Panics
+    ///
+    /// Implementations must panic if `start..end` is not a valid sub-slice of
+    /// `bytes`. Implementations must also panic if `pattern_id` is non-None
+    /// and does not refer to a valid pattern, or if the DFA was not compiled
+    /// with anchored start states for each pattern.
+    fn start_state_forward(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID;
+
+    /// Return the ID of the start state for this DFA when executing a reverse
+    /// search.
+    ///
+    /// Unlike typical DFA implementations, the start state for DFAs in this
+    /// crate is dependent on a few different factors:
+    ///
+    /// * The pattern ID, if present. When the underlying DFA has been compiled
+    /// with multiple patterns _and_ the DFA has been configured to compile an
+    /// anchored start state for each pattern, then a pattern ID may be
+    /// specified to execute an anchored search for that specific pattern. If
+    /// `pattern_id` is invalid or if the DFA doesn't have start states compiled
+    /// for each pattern, then implementations must panic. DFAs in this crate
+    /// can be configured to compile start states for each pattern via
+    /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern).
+    /// * When `end < bytes.len()`, the byte at index `end` may influence the
+    /// start state if the regex uses `$` or `\b`.
+    /// * Similarly, when `end == bytes.len()`, the search begins at the end
+    /// of `bytes`, which may influence the start state when the regex uses
+    /// `$` or `\z`.
+    /// * Currently, `start` is unused.
+    /// * Whether the search is a forward or reverse search. This routine can
+    /// only be used for reverse searches.
+    ///
+    /// # Panics
+    ///
+    /// Implementations must panic if `start..end` is not a valid sub-slice of
+    /// `bytes`. Implementations must also panic if `pattern_id` is non-None
+    /// and does not refer to a valid pattern, or if the DFA was not compiled
+    /// with anchored start states for each pattern.
+    fn start_state_reverse(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID;
+
+    /// Returns true if and only if the given identifier corresponds to a
+    /// "special" state. A special state is one or more of the following:
+    /// a dead state, a quit state, a match state, a start state or an
+    /// accelerated state.
+    ///
+    /// A correct implementation _may_ always return false for states that
+    /// are either start states or accelerated states, since that information
+    /// is only intended to be used for optimization purposes. Correct
+    /// implementations must return true if the state is a dead, quit or match
+    /// state. This is because search routines using this trait must be able
+    /// to rely on `is_special_state` as an indicator that a state may need
+    /// special treatment. (For example, when a search routine sees a dead
+    /// state, it must terminate.)
+    ///
+    /// This routine permits search implementations to use a single branch to
+    /// check whether a state needs special attention before executing the next
+    /// transition. The example below shows how to do this.
+    ///
+    /// # Example
+    ///
+    /// This example shows how `is_special_state` can be used to implement a
+    /// correct search routine with minimal branching. In particular, this
+    /// search routine implements "leftmost" matching, which means that it
+    /// doesn't immediately stop once a match is found. Instead, it continues
+    /// until it reaches a dead state.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, MatchError,
+    /// };
+    ///
+    /// fn find_leftmost_first<A: Automaton>(
+    ///     dfa: &A,
+    ///     haystack: &[u8],
+    /// ) -> Result<Option<HalfMatch>, MatchError> {
+    ///     // The start state is determined by inspecting the position and the
+    ///     // initial bytes of the haystack. Note that start states can never
+    ///     // be match states (since DFAs in this crate delay matches by 1
+    ///     // byte), so we don't need to check if the start state is a match.
+    ///     let mut state = dfa.start_state_forward(
+    ///         None, haystack, 0, haystack.len(),
+    ///     );
+    ///     let mut last_match = None;
+    ///     // Walk all the bytes in the haystack. We can quit early if we see
+    ///     // a dead or a quit state. The former means the automaton will
+    ///     // never transition to any other state. The latter means that the
+    ///     // automaton entered a condition in which its search failed.
+    ///     for (i, &b) in haystack.iter().enumerate() {
+    ///         state = dfa.next_state(state, b);
+    ///         if dfa.is_special_state(state) {
+    ///             if dfa.is_match_state(state) {
+    ///                 last_match = Some(HalfMatch::new(
+    ///                     dfa.match_pattern(state, 0),
+    ///                     i,
+    ///                 ));
+    ///             } else if dfa.is_dead_state(state) {
+    ///                 return Ok(last_match);
+    ///             } else if dfa.is_quit_state(state) {
+    ///                 // It is possible to enter into a quit state after
+    ///                 // observing a match has occurred. In that case, we
+    ///                 // should return the match instead of an error.
+    ///                 if last_match.is_some() {
+    ///                     return Ok(last_match);
+    ///                 }
+    ///                 return Err(MatchError::Quit { byte: b, offset: i });
+    ///             }
+    ///             // Implementors may also want to check for start or accel
+    ///             // states and handle them differently for performance
+    ///             // reasons. But it is not necessary for correctness.
+    ///         }
+    ///     }
+    ///     // Matches are always delayed by 1 byte, so we must explicitly walk
+    ///     // the special "EOI" transition at the end of the search.
+    ///     state = dfa.next_eoi_state(state);
+    ///     if dfa.is_match_state(state) {
+    ///         last_match = Some(HalfMatch::new(
+    ///             dfa.match_pattern(state, 0),
+    ///             haystack.len(),
+    ///         ));
+    ///     }
+    ///     Ok(last_match)
+    /// }
+    ///
+    /// // We use a greedy '+' operator to show how the search doesn't just
+    /// // stop once a match is detected. It continues extending the match.
+    /// // Using '[a-z]+?' would also work as expected and stop the search
+    /// // early. Greediness is built into the automaton.
+    /// let dfa = dense::DFA::new(r"[a-z]+")?;
+    /// let haystack = "123 foobar 4567".as_bytes();
+    /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 10);
+    ///
+    /// // Here's another example that tests our handling of the special EOI
+    /// // transition. This will fail to find a match if we don't call
+    /// // 'next_eoi_state' at the end of the search since the match isn't
+    /// // found until the final byte in the haystack.
+    /// let dfa = dense::DFA::new(r"[0-9]{4}")?;
+    /// let haystack = "123 foobar 4567".as_bytes();
+    /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 15);
+    ///
+    /// // And note that our search implementation above automatically works
+    /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
+    /// // the appropriate pattern ID for us.
+    /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+    /// let haystack = "123 foobar 4567".as_bytes();
+    /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 1);
+    /// assert_eq!(mat.offset(), 3);
+    /// let mat = find_leftmost_first(&dfa, &haystack[3..])?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 7);
+    /// let mat = find_leftmost_first(&dfa, &haystack[10..])?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 1);
+    /// assert_eq!(mat.offset(), 5);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn is_special_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a dead
+    /// state. When a DFA enters a dead state, it is impossible to leave. That
+    /// is, every transition on a dead state by definition leads back to the
+    /// same dead state.
+    ///
+    /// Entering a dead state does not mean that the search failed (contrast
+    /// this with [`Automaton::is_quit_state`]). It only means that the search
+    /// can stop and report the most recent match seen, if any.
+    ///
+    /// In practice, the dead state always corresponds to the identifier `0`.
+    /// Moreover, in practice, there is only one dead state.
+    ///
+    /// The existence of a dead state is not strictly required in the classical
+    /// model of finite state machines, where one generally only cares about
+    /// the question of whether an input sequence matches or not. Dead states
+    /// are not needed to answer that question, since one can immediately quit
+    /// as soon as one enters a final or "match" state. However, we don't just
+    /// care about matches but also care about the location of matches, and
+    /// more specifically, care about semantics like "greedy" matching.
+    ///
+    /// For example, given the pattern `a+` and the input `aaaz`, the dead
+    /// state won't be entered until the state machine reaches `z` in the
+    /// input, at which point, the search routine can quit. But without the
+    /// dead state, the search routine wouldn't know when to quit. In a
+    /// classical representation, the search routine would stop after seeing
+    /// the first `a` (which is when the search would enter a match state). But
+    /// this wouldn't implement "greedy" matching where `a+` matches as many
+    /// `a`'s as possible.
+    ///
+    /// # Example
+    ///
+    /// See the example for [`Automaton::is_special_state`] for how to use this
+    /// method correctly.
+    fn is_dead_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a quit
+    /// state. A quit state is like a dead state (it has no transitions other
+    /// than to itself), except it indicates that the DFA failed to complete
+    /// the search. When this occurs, callers can neither accept nor reject
+    /// that a match occurred.
+    ///
+    /// In practice, the quit state always corresponds to the state immediately
+    /// following the dead state. (Which is not usually represented by `1`,
+    /// since state identifiers are pre-multiplied by the state machine's
+    /// alphabet stride, and the alphabet stride varies between DFAs.)
+    ///
+    /// By default, state machines created by this crate will never enter a
+    /// quit state. Since entering a quit state is the only way for a DFA
+    /// in this crate to fail at search time, it follows that the default
+    /// configuration can never produce a match error. Nevertheless, handling
+    /// quit states is necessary to correctly support all configurations in
+    /// this crate.
+    ///
+    /// The typical way in which a quit state can occur is when heuristic
+    /// support for Unicode word boundaries is enabled via the
+    /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
+    /// option. But other options, like the lower level
+    /// [`dense::Config::quit`](crate::dfa::dense::Config::quit)
+    /// configuration, can also result in a quit state being entered. The
+    /// purpose of the quit state is to provide a way to execute a fast DFA
+    /// in common cases while delegating to slower routines when the DFA quits.
+    ///
+    /// The default search implementations provided by this crate will return
+    /// a [`MatchError::Quit`](crate::MatchError::Quit) error when a quit state
+    /// is entered.
+    ///
+    /// # Example
+    ///
+    /// See the example for [`Automaton::is_special_state`] for how to use this
+    /// method correctly.
+    fn is_quit_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a
+    /// match state. A match state is also referred to as a "final" state and
+    /// indicates that a match has been found.
+    ///
+    /// If all you care about is whether a particular pattern matches in the
+    /// input sequence, then a search routine can quit early as soon as the
+    /// machine enters a match state. However, if you're looking for the
+    /// standard "leftmost-first" match location, then the search _must_
+    /// continue until either the end of the input or until the machine enters
+    /// a dead state. (Since either condition implies that no other useful work
+    /// can be done.) Namely, when looking for the location of a match, then
+    /// search implementations should record the most recent location in
+    /// which a match state was entered, but otherwise continue executing the
+    /// search as normal. (The search may even leave the match state.) Once
+    /// the termination condition is reached, the most recently recorded match
+    /// location should be returned.
+    ///
+    /// Finally, one additional power given to match states in this crate
+    /// is that they are always associated with a specific pattern in order
+    /// to support multi-DFAs. See [`Automaton::match_pattern`] for more
+    /// details and an example for how to query the pattern associated with a
+    /// particular match state.
+    ///
+    /// # Example
+    ///
+    /// See the example for [`Automaton::is_special_state`] for how to use this
+    /// method correctly.
+    fn is_match_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to a
+    /// start state. A start state is a state in which a DFA begins a search.
+    /// All searches begin in a start state. Moreover, since all matches are
+    /// delayed by one byte, a start state can never be a match state.
+    ///
+    /// The main role of a start state is, as mentioned, to be a starting
+    /// point for a DFA. This starting point is determined via one of
+    /// [`Automaton::start_state_forward`] or
+    /// [`Automaton::start_state_reverse`], depending on whether one is doing
+    /// a forward or a reverse search, respectively.
+    ///
+    /// A secondary use of start states is for prefix acceleration. Namely,
+    /// while executing a search, if the search is in a start state, then it
+    /// may be faster to look for the next occurrence of a prefix of the
+    /// pattern, if one exists. Since all matches must begin with that prefix,
+    /// skipping ahead to occurrences of it may be much faster than executing
+    /// the DFA.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to implement your own search routine that does
+    /// a prefix search whenever the search enters a start state.
+    ///
+    /// Note that you do not need to implement your own search routine to
+    /// make use of prefilters like this. The search routines provided
+    /// by this crate already implement prefilter support via the
+    /// [`Prefilter`](crate::util::prefilter::Prefilter) trait. The various
+    /// `find_*_at` routines on this trait support the `Prefilter` trait
+    /// through [`Scanner`](crate::util::prefilter::Scanner)s. This example is
+    /// meant to show how you might deal with prefilters in a simplified case
+    /// if you are implementing your own search routine.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     MatchError,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
+    ///     // Would be faster to use the memchr crate, but this is still
+    ///     // faster than running through the DFA.
+    ///     slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
+    /// }
+    ///
+    /// fn find_leftmost_first<A: Automaton>(
+    ///     dfa: &A,
+    ///     haystack: &[u8],
+    ///     prefix_byte: Option<u8>,
+    /// ) -> Result<Option<HalfMatch>, MatchError> {
+    ///     // See the Automaton::is_special_state example for similar code
+    ///     // with more comments.
+    ///
+    ///     let mut state = dfa.start_state_forward(
+    ///         None, haystack, 0, haystack.len(),
+    ///     );
+    ///     let mut last_match = None;
+    ///     let mut pos = 0;
+    ///     while pos < haystack.len() {
+    ///         let b = haystack[pos];
+    ///         state = dfa.next_state(state, b);
+    ///         pos += 1;
+    ///         if dfa.is_special_state(state) {
+    ///             if dfa.is_match_state(state) {
+    ///                 last_match = Some(HalfMatch::new(
+    ///                     dfa.match_pattern(state, 0),
+    ///                     pos - 1,
+    ///                 ));
+    ///             } else if dfa.is_dead_state(state) {
+    ///                 return Ok(last_match);
+    ///             } else if dfa.is_quit_state(state) {
+    ///                 // It is possible to enter into a quit state after
+    ///                 // observing a match has occurred. In that case, we
+    ///                 // should return the match instead of an error.
+    ///                 if last_match.is_some() {
+    ///                     return Ok(last_match);
+    ///                 }
+    ///                 return Err(MatchError::Quit {
+    ///                     byte: b, offset: pos - 1,
+    ///                 });
+    ///             } else if dfa.is_start_state(state) {
+    ///                 // If we're in a start state and know all matches begin
+    ///                 // with a particular byte, then we can quickly skip to
+    ///                 // candidate matches without running the DFA through
+    ///                 // every byte in between.
+    ///                 if let Some(prefix_byte) = prefix_byte {
+    ///                     pos = match find_byte(haystack, pos, prefix_byte) {
+    ///                         Some(pos) => pos,
+    ///                         None => break,
+    ///                     };
+    ///                 }
+    ///             }
+    ///         }
+    ///     }
+    ///     // Matches are always delayed by 1 byte, so we must explicitly walk
+    ///     // the special "EOI" transition at the end of the search.
+    ///     state = dfa.next_eoi_state(state);
+    ///     if dfa.is_match_state(state) {
+    ///         last_match = Some(HalfMatch::new(
+    ///             dfa.match_pattern(state, 0),
+    ///             haystack.len(),
+    ///         ));
+    ///     }
+    ///     Ok(last_match)
+    /// }
+    ///
+    /// // In this example, it's obvious that all occurrences of our pattern
+    /// // begin with 'Z', so we pass in 'Z'.
+    /// let dfa = dense::DFA::new(r"Z[a-z]+")?;
+    /// let haystack = "123 foobar Zbaz quux".as_bytes();
+    /// let mat = find_leftmost_first(&dfa, haystack, Some(b'Z'))?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 15);
+    ///
+    /// // But note that we don't need to pass in a prefix byte. If we don't,
+    /// // then the search routine does no acceleration.
+    /// let mat = find_leftmost_first(&dfa, haystack, None)?.unwrap();
+    /// assert_eq!(mat.pattern().as_usize(), 0);
+    /// assert_eq!(mat.offset(), 15);
+    ///
+    /// // However, if we pass an incorrect byte, then the prefix search will
+    /// // result in incorrect results.
+    /// assert_eq!(find_leftmost_first(&dfa, haystack, Some(b'X'))?, None);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn is_start_state(&self, id: StateID) -> bool;
+
+    /// Returns true if and only if the given identifier corresponds to an
+    /// accelerated state.
+    ///
+    /// An accelerated state is a special optimization
+    /// trick implemented by this crate. Namely, if
+    /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is
+    /// enabled (and it is by default), then DFAs generated by this crate will
+    /// tag states meeting certain characteristics as accelerated. States meet
+    /// this criterion whenever most of their transitions are self-transitions.
+    /// That is, transitions that loop back to the same state. When a small
+    /// number of transitions aren't self-transitions, then it follows that
+    /// there are only a small number of bytes that can cause the DFA to leave
+    /// that state. Thus, there is an opportunity to look for those bytes
+    /// using more optimized routines rather than continuing to run through
+    /// the DFA. This trick is similar to the prefilter idea described in
+    /// the documentation of [`Automaton::is_start_state`] with two main
+    /// differences:
+    ///
+    /// 1. It is more limited since acceleration only applies to single bytes.
+    /// This means states are rarely accelerated when Unicode mode is enabled
+    /// (which is enabled by default).
+    /// 2. It can occur anywhere in the DFA, which increases optimization
+    /// opportunities.
+    ///
+    /// Like the prefilter idea, the main downside (and a possible reason to
+    /// disable it) is that it can lead to worse performance in some cases.
+    /// Namely, if a state is accelerated for very common bytes, then the
+    /// overhead of checking for acceleration and using the more optimized
+    /// routines to look for those bytes can cause overall performance to be
+    /// worse than if acceleration wasn't enabled at all.
+    ///
+    /// A simple example of a regex that has an accelerated state is
+    /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down
+    /// into a single state where all transitions except for `a` loop back to
+    /// itself, and where `a` is the only transition (other than the special
+    /// EOI transition) that goes to some other state. Thus, this state can
+    /// be accelerated and implemented more efficiently by calling an
+    /// optimized routine like `memchr` with `a` as the needle. Notice that
+    /// the `(?-u)` to disable Unicode is necessary here, as without it,
+    /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other
+    /// than `a`. This more complicated expression compiles down to many DFA
+    /// states and the simple acceleration optimization is no longer available.
+    ///
+    /// Typically, this routine is used to guard calls to
+    /// [`Automaton::accelerator`], which returns the accelerated bytes for
+    /// the specified state.
+    fn is_accel_state(&self, id: StateID) -> bool;
+
+    /// Returns the total number of patterns compiled into this DFA.
+    ///
+    /// In the case of a DFA that contains no patterns, this must return `0`.
+    ///
+    /// This counts every pattern in the DFA. To get the number of patterns
+    /// that match in a particular match state, use
+    /// [`Automaton::match_count`] instead.
+    ///
+    /// # Example
+    ///
+    /// This example shows the pattern count for a DFA that never matches:
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense::DFA};
+    ///
+    /// let dfa: DFA<Vec<u32>> = DFA::never_match()?;
+    /// assert_eq!(dfa.pattern_count(), 0);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And another example for a DFA that matches at every position:
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense::DFA};
+    ///
+    /// let dfa: DFA<Vec<u32>> = DFA::always_match()?;
+    /// assert_eq!(dfa.pattern_count(), 1);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And finally, a DFA that was constructed from multiple patterns:
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense::DFA};
+    ///
+    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+    /// assert_eq!(dfa.pattern_count(), 3);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn pattern_count(&self) -> usize;
+
+    /// Returns the total number of patterns that match in this state.
+    ///
+    /// If the DFA was compiled with one pattern, then this must necessarily
+    /// always return `1` for all match states.
+    ///
+    /// Implementations must guarantee that [`Automaton::match_pattern`] can
+    /// be called with indices up to (but not including) the count returned by
+    /// this routine without panicking.
+    ///
+    /// # Panics
+    ///
+    /// Implementations are permitted to panic if the provided state ID does
+    /// not correspond to a match state.
+    ///
+    /// # Example
+    ///
+    /// This example shows a simple instance of implementing overlapping
+    /// matches. In particular, it shows not only how to determine how many
+    /// patterns have matched in a particular state, but also how to access
+    /// which specific patterns have matched.
+    ///
+    /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
+    /// when building the DFA. If we used
+    /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
+    /// instead, then the DFA would not be constructed in a way that supports
+    /// overlapping matches. (It would only report a single pattern that
+    /// matches at any particular point in time.)
+    ///
+    /// Another thing to take note of is the patterns used and the order in
+    /// which the pattern IDs are reported. In the example below, pattern `3`
+    /// is yielded first. Why? Because it corresponds to the match that
+    /// appears first. Namely, the `@` symbol is part of `\S+` but not part
+    /// of any of the other patterns. Since the `\S+` pattern has a match that
+    /// starts to the left of any other pattern, its ID is returned before any
+    /// other.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[
+    ///         r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
+    ///     ])?;
+    /// let haystack = "@bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut state = dfa.start_state_forward(
+    ///     None, haystack, 0, haystack.len(),
+    /// );
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     state = dfa.next_state(state, b);
+    /// }
+    /// state = dfa.next_eoi_state(state);
+    ///
+    /// assert!(dfa.is_match_state(state));
+    /// // Only three of the four patterns match: `[A-Z]+` (pattern `2`) can't
+    /// // match since the haystack contains no uppercase letters.
+    /// assert_eq!(dfa.match_count(state), 3);
+    /// // The following calls are guaranteed to not panic since `match_count`
+    /// // returned `3` above.
+    /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3);
+    /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0);
+    /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn match_count(&self, id: StateID) -> usize;
+
+    /// Returns the pattern ID corresponding to the given match index in the
+    /// given state.
+    ///
+    /// See [`Automaton::match_count`] for an example of how to use this
+    /// method correctly. Note that if you know your DFA is compiled with a
+    /// single pattern, then this routine is never necessary since it will
+    /// always return a pattern ID of `0` for an index of `0` when `id`
+    /// corresponds to a match state.
+    ///
+    /// Typically, this routine is used when implementing an overlapping
+    /// search, as the example for [`Automaton::match_count`] does.
+    ///
+    /// # Panics
+    ///
+    /// If the state ID is not a match state or if the match index is out
+    /// of bounds for the given state, then this routine may either panic
+    /// or produce an incorrect result. If the state ID is correct and the
+    /// match index is correct, then this routine must always produce a valid
+    /// `PatternID`.
+    fn match_pattern(&self, id: StateID, index: usize) -> PatternID;
+
+    /// Return a slice of bytes to accelerate for the given state, if possible.
+    ///
+    /// If the given state has no accelerator, then an empty slice must be
+    /// returned. If [`Automaton::is_accel_state`] returns true for the given
+    /// ID, then this routine _must_ return a non-empty slice. Implementations
+    /// are not required to accelerate any state, however: the default
+    /// implementation of this method never does.
+    ///
+    /// If the given ID is not a valid state ID for this automaton, then
+    /// implementations may panic or produce incorrect results.
+    ///
+    /// See [`Automaton::is_accel_state`] for more details on state
+    /// acceleration.
+    ///
+    /// By default, this method will always return an empty slice.
+    ///
+    /// # Example
+    ///
+    /// This example shows a contrived case in which we build a regex that we
+    /// know is accelerated and extract the accelerator from a state.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     util::id::StateID,
+    ///     SyntaxConfig,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     // We disable Unicode everywhere and permit the regex to match
+    ///     // invalid UTF-8. e.g., `[^abc]` matches `\xFF`, which is not valid
+    ///     // UTF-8.
+    ///     .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+    ///     // This makes the implicit `(?s:.)*?` prefix added to the regex
+    ///     // match through arbitrary bytes instead of being UTF-8 aware. This
+    ///     // isn't necessary to get acceleration to work in this case, but
+    ///     // it does make the DFA substantially simpler.
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build("[^abc]+a")?;
+    ///
+    /// // Here we just pluck out the state that we know is accelerated.
+    /// // While the stride calculations are something that can be relied
+    /// // on by callers, the specific position of the accelerated state is
+    /// // implementation defined.
+    /// //
+    /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
+    /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
+    /// let id = StateID::new(3 * dfa.stride()).unwrap();
+    /// let accelerator = dfa.accelerator(id);
+    /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
+    /// assert_eq!(accelerator, &[b'a', b'b', b'c']);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    fn accelerator(&self, _id: StateID) -> &[u8] {
+        &[]
+    }
+
+    /// Executes a forward search and returns the end position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state. This is useful for implementing boolean `is_match`-like
+    /// routines, where as little work is done as possible.
+    ///
+    /// See [`Automaton::find_earliest_fwd_at`] for additional functionality,
+    /// such as providing a prefilter, a specific pattern to match and the
+    /// bounds of the search within the haystack. This routine is meant as
+    /// a convenience for common cases where the additional functionality is
+    /// not needed.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
+    /// how the position returned might differ from what one might expect when
+    /// executing a traditional leftmost search.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::DFA::new("foo[0-9]+")?;
+    /// // Normally, the end of the leftmost first match here would be 8,
+    /// // corresponding to the end of the input. But the "earliest" semantics
+    /// // of this routine cause it to stop as soon as a match is known, which
+    /// // occurs once 'foo[0-9]' has matched.
+    /// let expected = HalfMatch::must(0, 4);
+    /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"foo12345")?);
+    ///
+    /// let dfa = dense::DFA::new("abc|a")?;
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let expected = HalfMatch::must(0, 1);
+    /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"abc")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_earliest_fwd(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_earliest_fwd_at(None, None, bytes, 0, bytes.len())
+    }
+
+    /// Executes a reverse search and returns the start position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state.
+    ///
+    /// Note that while it is not technically necessary to build a reverse
+    /// automaton to use a reverse search, it is likely that you'll want to do
+    /// so. Namely, the typical use of a reverse search is to find the starting
+    /// location of a match once its end is discovered from a forward search. A
+    /// reverse DFA automaton can be built by configuring the intermediate NFA
+    /// to be reversed via
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates
+    /// how the position returned might differ from what one might expect when
+    /// executing a traditional leftmost reverse search.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("[a-z]+[0-9]+")?;
+    /// // Normally, the start of the leftmost first match here would be 0,
+    /// // corresponding to the beginning of the input. But the "earliest"
+    /// // semantics of this routine cause it to stop as soon as a match is
+    /// // known, which occurs once '[a-z][0-9]+' has matched.
+    /// let expected = HalfMatch::must(0, 2);
+    /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"foo12345")?);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("abc|c")?;
+    /// // Normally, the start of the leftmost first match here would be 0,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let expected = HalfMatch::must(0, 2);
+    /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"abc")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_earliest_rev(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_earliest_rev_at(None, bytes, 0, bytes.len())
+    }
+
+    /// Executes a forward search and returns the end position of the leftmost
+    /// match that is found. If no match exists, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Notes for implementors
+    ///
+    /// Implementors of this trait are not required to implement any particular
+    /// match semantics (such as leftmost-first), which are instead manifest in
+    /// the DFA's transitions.
+    ///
+    /// In particular, this method must continue searching even after it enters
+    /// a match state. The search should only terminate once it has reached
+    /// the end of the input or when it has entered a dead or quit state. Upon
+    /// termination, the position of the last byte seen while still in a match
+    /// state is returned.
+    ///
+    /// Since this trait provides an implementation for this method by default,
+    /// it's unlikely that one will need to implement this.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). By default, a dense DFA uses
+    /// "leftmost first" match semantics.
+    ///
+    /// Leftmost first match semantics corresponds to the match with the
+    /// smallest starting offset, but where the end offset is determined by
+    /// preferring earlier branches in the original regular expression. For
+    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+    /// will match `Samwise` in `Samwise`.
+    ///
+    /// Generally speaking, the "leftmost first" match is how most backtracking
+    /// regular expressions tend to work. This is in contrast to POSIX-style
+    /// regular expressions that yield "leftmost longest" matches. Namely,
+    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+    /// leftmost longest semantics. (This crate does not currently support
+    /// leftmost longest semantics.)
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::DFA::new("foo[0-9]+")?;
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over later parts.
+    /// let dfa = dense::DFA::new("abc|a")?;
+    /// let expected = HalfMatch::must(0, 3);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"abc")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_leftmost_fwd(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_leftmost_fwd_at(None, None, bytes, 0, bytes.len())
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Notes for implementors
+    ///
+    /// Implementors of this trait are not required to implement any particular
+    /// match semantics (such as leftmost-first), which are instead manifest in
+    /// the DFA's transitions.
+    ///
+    /// In particular, this method must continue searching even after it enters
+    /// a match state. The search should only terminate once it has reached
+    /// the end of the input or when it has entered a dead or quit state. Upon
+    /// termination, the position of the last byte seen while still in a match
+    /// state is returned.
+    ///
+    /// Since this trait provides an implementation for this method by default,
+    /// it's unlikely that one will need to implement this.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this method with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this routine
+    /// is principally useful when used in conjunction with the
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+    /// configuration. In general, it's unlikely to be correct to use both
+    /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since any
+    /// particular DFA will only support searching in one direction with
+    /// respect to the pattern.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson,
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("foo[0-9]+")?;
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"foo12345")?);
+    ///
+    /// // Even though a match is found after reading the last byte (`c`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over later parts.
+    /// let dfa = dense::Builder::new()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("abc|c")?;
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"abc")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_leftmost_rev(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_leftmost_rev_at(None, bytes, 0, bytes.len())
+    }
+
+    /// Executes an overlapping forward search and returns the end position of
+    /// matches as they are found. If no match exists, then `None` is returned.
+    ///
+    /// This routine is principally only useful when searching for multiple
+    /// patterns on inputs where multiple patterns may match the same regions
+    /// of text. In particular, callers must preserve the automaton's search
+    /// state from prior calls so that the implementation knows where the last
+    /// match occurred.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run a basic overlapping search with a
+    /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the
+    /// automaton with a `MatchKind::All` configuration. Overlapping searches
+    /// are unlikely to work as one would expect when using the default
+    /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first
+    /// matching is fundamentally incompatible with overlapping searches.
+    /// Namely, overlapping searches need to report matches as they are seen,
+    /// whereas leftmost-first searches will continue searching even after a
+    /// match has been observed in order to find the conventional end position
+    /// of the match. More concretely, leftmost-first searches use dead states
+    /// to terminate a search after a specific match can no longer be extended.
+    /// Overlapping searches instead do the opposite by continuing the search
+    /// to find totally new matches (potentially of other patterns).
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, OverlappingState, dense},
+    ///     HalfMatch,
+    ///     MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(HalfMatch::must(1, 4));
+    /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(HalfMatch::must(0, 4));
+    /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_overlapping_fwd(
+        &self,
+        bytes: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_overlapping_fwd_at(None, None, bytes, 0, bytes.len(), state)
+    }
+
+    /// Executes a forward search and returns the end position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state. This is useful for implementing boolean `is_match`-like
+    /// routines, where as little work is done as possible.
+    ///
+    /// This is like [`Automaton::find_earliest_fwd`], except it provides some
+    /// additional control over how the search is executed:
+    ///
+    /// * `pre` is a prefilter scanner that, when given, is used whenever the
+    /// DFA enters its starting state. This is meant to speed up searches where
+    /// one or a small number of literal prefixes are known.
+    /// * `pattern_id` specifies a specific pattern in the DFA to run an
+    /// anchored search for. If not given, then a search for any pattern is
+    /// performed. For DFAs built by this crate,
+    /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern)
+    /// must be enabled to use this functionality.
+    /// * `start` and `end` permit searching a specific region of the haystack
+    /// `bytes`. This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `bytes`. (Look-around operations such as
+    /// `\b`, `^` and `$` need to take the surrounding context into
+    /// account. This cannot be done if the haystack doesn't contain that
+    /// context.)
+    ///
+    /// The examples below demonstrate each of these additional parameters.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    ///
+    /// # Example: prefilter
+    ///
+    /// This example shows how to provide a prefilter for a pattern where all
+    /// matches start with a `z` byte.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     util::prefilter::{Candidate, Prefilter, Scanner, State},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// #[derive(Debug)]
+    /// pub struct ZPrefilter;
+    ///
+    /// impl Prefilter for ZPrefilter {
+    ///     fn next_candidate(
+    ///         &self,
+    ///         _: &mut State,
+    ///         haystack: &[u8],
+    ///         at: usize,
+    ///     ) -> Candidate {
+    ///         // Try changing b'z' to b'q' and observe this test fail since
+    ///         // the prefilter will skip right over the match.
+    ///         match haystack[at..].iter().position(|&b| b == b'z') {
+    ///             None => Candidate::None,
+    ///             Some(i) => Candidate::PossibleStartOfMatch(at + i),
+    ///         }
+    ///     }
+    ///
+    ///     fn heap_bytes(&self) -> usize {
+    ///         0
+    ///     }
+    /// }
+    ///
+    /// let dfa = dense::DFA::new("z[0-9]{3}")?;
+    /// let haystack = "foobar z123 q123".as_bytes();
+    /// // A scanner executes a prefilter while tracking some state that helps
+    /// // determine whether a prefilter is still "effective" or not.
+    /// let mut scanner = Scanner::new(&ZPrefilter);
+    ///
+    /// let expected = Some(HalfMatch::must(0, 11));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     Some(&mut scanner),
+    ///     None,
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: specific pattern search
+    ///
+    /// This example shows how to build a multi-DFA that permits searching for
+    /// specific patterns.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    ///     PatternID,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().starts_for_each_pattern(true))
+    ///     .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+    /// let haystack = "foo123".as_bytes();
+    ///
+    /// // Since we are using the default leftmost-first match and both
+    /// // patterns match at the same starting position, only the first pattern
+    /// // will be returned in this case when doing a search for any of the
+    /// // patterns.
+    /// let expected = Some(HalfMatch::must(0, 6));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     None,
+    ///     None,
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // But if we want to check whether some other pattern matches, then we
+    /// // can provide its pattern ID.
+    /// let expected = Some(HalfMatch::must(1, 6));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     None,
+    ///     Some(PatternID::must(1)),
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: specifying the bounds of a search
+    ///
+    /// This example shows how providing the bounds of a search can produce
+    /// different results than simply sub-slicing the haystack.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // N.B. We disable Unicode here so that we use a simple ASCII word
+    /// // boundary. Alternatively, we could enable heuristic support for
+    /// // Unicode word boundaries.
+    /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+    /// let haystack = "foo123bar".as_bytes();
+    ///
+    /// // Since we sub-slice the haystack, the search doesn't know about the
+    /// // larger context and assumes that `123` is surrounded by word
+    /// // boundaries. And of course, the match position is reported relative
+    /// // to the sub-slice as well, which means we get `3` instead of `6`.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     None,
+    ///     None,
+    ///     &haystack[3..6],
+    ///     0,
+    ///     haystack[3..6].len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // But if we provide the bounds of the search within the context of the
+    /// // entire haystack, then the search can take the surrounding context
+    /// // into account. (And if we did find a match, it would be reported
+    /// // as a valid offset into `haystack` instead of its sub-slice.)
+    /// let expected = None;
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     None,
+    ///     None,
+    ///     haystack,
+    ///     3,
+    ///     6,
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    fn find_earliest_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_earliest_fwd(pre, self, pattern_id, bytes, start, end)
+    }
+
+    /// Executes a reverse search and returns the start position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state.
+    ///
+    /// This is like [`Automaton::find_earliest_rev`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    #[inline]
+    fn find_earliest_rev_at(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_earliest_rev(self, pattern_id, bytes, start, end)
+    }
+
+    /// Executes a forward search and returns the end position of the leftmost
+    /// match that is found. If no match exists, then `None` is returned.
+    ///
+    /// This is like [`Automaton::find_leftmost_fwd`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    #[inline]
+    fn find_leftmost_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_fwd(pre, self, pattern_id, bytes, start, end)
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// This is like [`Automaton::find_leftmost_rev`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    #[inline]
+    fn find_leftmost_rev_at(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_rev(self, pattern_id, bytes, start, end)
+    }
+
+    /// Executes an overlapping forward search and returns the end position of
+    /// matches as they are found. If no match exists, then `None` is returned.
+    ///
+    /// This routine is principally only useful when searching for multiple
+    /// patterns on inputs where multiple patterns may match the same regions
+    /// of text. In particular, callers must preserve the automaton's search
+    /// state from prior calls so that the implementation knows where the last
+    /// match occurred.
+    ///
+    /// This is like [`Automaton::find_overlapping_fwd`], except it provides
+    /// some additional control over how the search is executed. See the
+    /// documentation of [`Automaton::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// When using this routine to implement an iterator of overlapping
+    /// matches, the `start` of the search should always be set to the end
+    /// of the last match. If more patterns match at the previous location,
+    /// then they will be immediately returned. (This is tracked by the given
+    /// overlapping state.) Otherwise, the search continues at the starting
+    /// position given.
+    ///
+    /// If for some reason you want the search to forget about its previous
+    /// state and restart the search at a particular position, then setting the
+    /// state to [`OverlappingState::start`] will accomplish that.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFAs generated by this crate, this only occurs in a non-default
+    /// configuration where quit bytes are used or Unicode word boundaries are
+    /// heuristically enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine must panic if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It must also panic if the given haystack range is not valid.
+    #[inline]
+    fn find_overlapping_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_overlapping_fwd(
+            pre, self, pattern_id, bytes, start, end, state,
+        )
+    }
+}
+
+unsafe impl<'a, T: Automaton> Automaton for &'a T {
+    #[inline]
+    fn next_state(&self, current: StateID, input: u8) -> StateID {
+        (**self).next_state(current, input)
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(
+        &self,
+        current: StateID,
+        input: u8,
+    ) -> StateID {
+        (**self).next_state_unchecked(current, input)
+    }
+
+    #[inline]
+    fn next_eoi_state(&self, current: StateID) -> StateID {
+        (**self).next_eoi_state(current)
+    }
+
+    #[inline]
+    fn start_state_forward(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID {
+        (**self).start_state_forward(pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn start_state_reverse(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID {
+        (**self).start_state_reverse(pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn is_special_state(&self, id: StateID) -> bool {
+        (**self).is_special_state(id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: StateID) -> bool {
+        (**self).is_dead_state(id)
+    }
+
+    #[inline]
+    fn is_quit_state(&self, id: StateID) -> bool {
+        (**self).is_quit_state(id)
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: StateID) -> bool {
+        (**self).is_match_state(id)
+    }
+
+    #[inline]
+    fn is_start_state(&self, id: StateID) -> bool {
+        (**self).is_start_state(id)
+    }
+
+    #[inline]
+    fn is_accel_state(&self, id: StateID) -> bool {
+        (**self).is_accel_state(id)
+    }
+
+    #[inline]
+    fn pattern_count(&self) -> usize {
+        (**self).pattern_count()
+    }
+
+    #[inline]
+    fn match_count(&self, id: StateID) -> usize {
+        (**self).match_count(id)
+    }
+
+    #[inline]
+    fn match_pattern(&self, id: StateID, index: usize) -> PatternID {
+        (**self).match_pattern(id, index)
+    }
+
+    #[inline]
+    fn accelerator(&self, id: StateID) -> &[u8] {
+        (**self).accelerator(id)
+    }
+
+    #[inline]
+    fn find_earliest_fwd(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_earliest_fwd(bytes)
+    }
+
+    #[inline]
+    fn find_earliest_rev(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_earliest_rev(bytes)
+    }
+
+    #[inline]
+    fn find_leftmost_fwd(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_leftmost_fwd(bytes)
+    }
+
+    #[inline]
+    fn find_leftmost_rev(
+        &self,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_leftmost_rev(bytes)
+    }
+
+    #[inline]
+    fn find_overlapping_fwd(
+        &self,
+        bytes: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_overlapping_fwd(bytes, state)
+    }
+
+    #[inline]
+    fn find_earliest_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_earliest_fwd_at(pre, pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn find_earliest_rev_at(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_earliest_rev_at(pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn find_leftmost_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_leftmost_fwd_at(pre, pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn find_leftmost_rev_at(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self).find_leftmost_rev_at(pattern_id, bytes, start, end)
+    }
+
+    #[inline]
+    fn find_overlapping_fwd_at(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        (**self)
+            .find_overlapping_fwd_at(pre, pattern_id, bytes, start, end, state)
+    }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+    /// The ID of the state that the search was in when the call terminated.
+    /// When this is a match state, `last_match` must be set to a non-None
+    /// value.
+    ///
+    /// A `None` value indicates the start state of the corresponding
+    /// automaton. We cannot use the actual ID, since any one automaton may
+    /// have many start states, and which one is in use depends on several
+    /// search-time factors.
+    id: Option<StateID>,
+    /// Information associated with a match when `id` corresponds to a match
+    /// state.
+    last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// offset of the match and the match index.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+    /// The index into the matching patterns for the current match state.
+    pub(crate) match_index: usize,
+    /// The offset in the haystack at which the match occurred. This is used
+    /// when reporting multiple matches at the same offset. That is, when
+    /// an overlapping search runs, the first thing it checks is whether it's
+    /// already in a match state, and if so, whether there are more patterns
+    /// to report as matches in that state. If so, it increments `match_index`
+    /// and returns the pattern and this offset. Once `match_index` exceeds the
+    /// number of matching patterns in the current state, the search continues.
+    pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+    /// Create a new overlapping state that begins at the start state of any
+    /// automaton.
+    pub fn start() -> OverlappingState {
+        OverlappingState { id: None, last_match: None }
+    }
+
+    pub(crate) fn id(&self) -> Option<StateID> {
+        self.id
+    }
+
+    pub(crate) fn set_id(&mut self, id: StateID) {
+        self.id = Some(id);
+    }
+
+    pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+        self.last_match.as_mut()
+    }
+
+    pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+        self.last_match = Some(last_match);
+    }
+}
+
+/// Write a prefix "state" indicator for fmt::Debug impls.
+///
+/// Specifically, this tries to succinctly distinguish the different types of
+/// states: dead states, quit states, accelerated states, start states and
+/// match states. It even accounts for possible overlap between different
+/// state types.
+pub(crate) fn fmt_state_indicator<A: Automaton>(
+    f: &mut core::fmt::Formatter<'_>,
+    dfa: A,
+    id: StateID,
+) -> core::fmt::Result {
+    if dfa.is_dead_state(id) {
+        write!(f, "D")?;
+        if dfa.is_start_state(id) {
+            write!(f, ">")?;
+        } else {
+            write!(f, " ")?;
+        }
+    } else if dfa.is_quit_state(id) {
+        write!(f, "Q ")?;
+    } else if dfa.is_start_state(id) {
+        if dfa.is_accel_state(id) {
+            write!(f, "A>")?;
+        } else {
+            write!(f, " >")?;
+        }
+    } else if dfa.is_match_state(id) {
+        if dfa.is_accel_state(id) {
+            write!(f, "A*")?;
+        } else {
+            write!(f, " *")?;
+        }
+    } else if dfa.is_accel_state(id) {
+        write!(f, "A ")?;
+    } else {
+        write!(f, "  ")?;
+    }
+    Ok(())
+}
diff --git a/src/dfa/dense.rs b/src/dfa/dense.rs

new file mode 100644 (file)

index 0000000..07c1350
--- /dev/null
+++ b/src/dfa/dense.rs
@@ -0,0 +1,4470 @@
+/*!
+Types and routines specific to dense DFAs.
+
+This module is the home of [`dense::DFA`](DFA).
+
+This module also contains a [`dense::Builder`](Builder) and a
+[`dense::Config`](Config) for configuring and building a dense DFA.
+*/
+
+#[cfg(feature = "alloc")]
+use core::cmp;
+use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
+
+#[cfg(feature = "alloc")]
+use alloc::{
+    collections::{BTreeMap, BTreeSet},
+    vec,
+    vec::Vec,
+};
+
+#[cfg(feature = "alloc")]
+use crate::{
+    dfa::{
+        accel::Accel, determinize, error::Error, minimize::Minimizer, sparse,
+    },
+    nfa::thompson,
+    util::alphabet::ByteSet,
+    MatchKind,
+};
+use crate::{
+    dfa::{
+        accel::Accels,
+        automaton::{fmt_state_indicator, Automaton},
+        special::Special,
+        DEAD,
+    },
+    util::{
+        alphabet::{self, ByteClasses},
+        bytes::{self, DeserializeError, Endian, SerializeError},
+        id::{PatternID, StateID},
+        start::Start,
+    },
+};
+
+/// The label that is pre-pended to a serialized DFA.
+const LABEL: &str = "rust-regex-automata-dfa-dense";
+
+/// The format version of dense DFAs. This version gets incremented when a
+/// change occurs. A change may not necessarily be a breaking change, but the
+/// version does permit good error messages in the case where a breaking change
+/// is made.
+const VERSION: u32 = 2;
+
+/// The configuration used for compiling a dense DFA.
+///
+/// A dense DFA configuration is a simple data object that is typically used
+/// with [`dense::Builder::configure`](self::Builder::configure).
+///
+/// The default configuration guarantees that a search will _never_ return a
+/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a
+/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode
+/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a
+/// search to return an error. See the corresponding configuration options for
+/// more details on when those error conditions arise.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // As with other configuration types in this crate, we put all our knobs
+    // in options so that we can distinguish between "default" and "not set."
+    // This makes it possible to easily combine multiple configurations
+    // without default values overwriting explicitly specified values. See the
+    // 'overwrite' method.
+    //
+    // For docs on the fields below, see the corresponding method setters.
+    anchored: Option<bool>,
+    accelerate: Option<bool>,
+    minimize: Option<bool>,
+    match_kind: Option<MatchKind>,
+    starts_for_each_pattern: Option<bool>,
+    byte_classes: Option<bool>,
+    unicode_word_boundary: Option<bool>,
+    quit: Option<ByteSet>,
+    dfa_size_limit: Option<Option<usize>>,
+    determinize_size_limit: Option<Option<usize>>,
+}
+
+#[cfg(feature = "alloc")]
+impl Config {
+    /// Return a new default dense DFA compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Set whether matching must be anchored at the beginning of the input.
+    ///
+    /// When enabled, a match must begin at the start of a search. When
+    /// disabled, the DFA will act as if the pattern started with a `(?s:.)*?`,
+    /// which enables a match to appear anywhere.
+    ///
+    /// Note that if you want to run both anchored and unanchored
+    /// searches without building multiple automatons, you can enable the
+    /// [`Config::starts_for_each_pattern`] configuration instead. This will
+    /// permit unanchored any-pattern searches and pattern-specific anchored
+    /// searches. See the documentation for that configuration for an example.
+    ///
+    /// By default this is disabled.
+    ///
+    /// **WARNING:** this is subtly different than using a `^` at the start of
+    /// your regex. A `^` forces a regex to match exclusively at the start of
+    /// input, regardless of where you begin your search. In contrast, enabling
+    /// this option will allow your regex to match anywhere in your input,
+    /// but the match must start at the beginning of a search. (Most of the
+    /// higher level convenience search routines make "start of input" and
+    /// "start of search" equivalent, but some routines allow treating these as
+    /// orthogonal.)
+    ///
+    /// For example, consider the haystack `aba` and the following searches:
+    ///
+    /// 1. The regex `^a` is compiled with `anchored=false` and searches
+    ///    `aba` starting at position `2`. Since `^` requires the match to
+    ///    start at the beginning of the input and `2 > 0`, no match is found.
+    /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
+    ///    starting at position `2`. This reports a match at `[2, 3]` since
+    ///    the match starts where the search started. Since there is no `^`,
+    ///    there is no requirement for the match to start at the beginning of
+    ///    the input.
+    /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
+    ///    starting at position `1`. Since `b` corresponds to position `1` and
+    ///    since the regex is anchored, it finds no match.
+    /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
+    ///    starting at position `1`. Since the regex is neither anchored nor
+    ///    starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
+    ///    prefix that permits it to match anywhere. Thus, it reports a match
+    ///    at `[2, 3]`.
+    ///
+    /// # Example
+    ///
+    /// This demonstrates the differences between an anchored search and
+    /// a pattern that begins with `^` (as described in the above warning
+    /// message).
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// let haystack = "aba".as_bytes();
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(false)) // default
+    ///     .build(r"^a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
+    /// // No match is found because 2 is not the beginning of the haystack,
+    /// // which is what ^ requires.
+    /// let expected = None;
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(true))
+    ///     .build(r"a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
+    /// // An anchored search can still match anywhere in the haystack, it just
+    /// // must begin at the start of the search which is '2' in this case.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(true))
+    ///     .build(r"a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
+    /// // No match is found since we start searching at offset 1 which
+    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+    /// // is found.
+    /// let expected = None;
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().anchored(false)) // default
+    ///     .build(r"a")?;
+    /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
+    /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
+    /// // pattern. Even though the search starts at 'b', the 'match anything'
+    /// // prefix allows the search to match 'a'.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn anchored(mut self, yes: bool) -> Config {
+        self.anchored = Some(yes);
+        self
+    }
+
+    /// Enable state acceleration.
+    ///
+    /// When enabled, DFA construction will analyze each state to determine
+    /// whether it is eligible for simple acceleration. Acceleration typically
+    /// occurs when most of a state's transitions loop back to itself, leaving
+    /// only a select few bytes that will exit the state. When this occurs,
+    /// other routines like `memchr` can be used to look for those bytes which
+    /// may be much faster than traversing the DFA.
+    ///
+    /// Callers may elect to disable this if consistent performance is more
+    /// desirable than variable performance. Namely, acceleration can sometimes
+    /// make searching slower than it otherwise would be if the transitions
+    /// that leave accelerated states are traversed frequently.
+    ///
+    /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
+    /// an example.
+    ///
+    /// This is enabled by default.
+    pub fn accelerate(mut self, yes: bool) -> Config {
+        self.accelerate = Some(yes);
+        self
+    }
+
+    /// Minimize the DFA.
+    ///
+    /// When enabled, the DFA built will be minimized such that it is as small
+    /// as possible.
+    ///
+    /// Whether one enables minimization or not depends on the types of costs
+    /// you're willing to pay and how much you care about its benefits. In
+    /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
+    /// space, where `n` is the number of DFA states and `k` is the alphabet
+    /// size. In practice, minimization can be quite costly in terms of both
+    /// space and time, so it should only be done if you're willing to wait
+    /// longer to produce a DFA. In general, you might want a minimal DFA in
+    /// the following circumstances:
+    ///
+    /// 1. You would like to optimize for the size of the automaton. This can
+    ///    manifest in one of two ways. Firstly, if you're converting the
+    ///    DFA into Rust code (or a table embedded in the code), then a minimal
+    ///    DFA will translate into a corresponding reduction in code size, and
+    ///    thus, also the final compiled binary size. Secondly, if you are
+    ///    building many DFAs and putting them on the heap, you'll be able to
+    ///    fit more if they are smaller. Note though that building a minimal
+    ///    DFA itself requires additional space; you only realize the space
+    ///    savings once the minimal DFA is constructed (at which point, the
+    ///    space used for minimization is freed).
+    /// 2. You've observed that a smaller DFA results in faster match
+    ///    performance. Naively, this isn't guaranteed since there is no
+    ///    inherent difference between matching with a bigger-than-minimal
+    ///    DFA and a minimal DFA. However, a smaller DFA may make use of your
+    ///    CPU's cache more efficiently.
+    /// 3. You are trying to establish an equivalence between regular
+    ///    languages. The standard method for this is to build a minimal DFA
+    ///    for each language and then compare them. If the DFAs are equivalent
+    ///    (up to state renaming), then the languages are equivalent.
+    ///
+    /// Typically, minimization only makes sense as an offline process. That
+    /// is, one might minimize a DFA before serializing it to persistent
+    /// storage. In practical terms, minimization can take around an order of
+    /// magnitude more time than compiling the initial DFA via determinization.
+    ///
+    /// This option is disabled by default.
+    pub fn minimize(mut self, yes: bool) -> Config {
+        self.minimize = Some(yes);
+        self
+    }
+
+    /// Set the desired match semantics.
+    ///
+    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+    /// match semantics of Perl-like regex engines. That is, when multiple
+    /// patterns would match at the same leftmost position, the pattern that
+    /// appears first in the concrete syntax is chosen.
+    ///
+    /// Currently, the only other kind of match semantics supported is
+    /// [`MatchKind::All`]. This corresponds to classical DFA construction
+    /// where all possible matches are added to the DFA.
+    ///
+    /// Typically, `All` is used when one wants to execute an overlapping
+    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+    /// sense to use `All` with the various "leftmost" find routines, since the
+    /// leftmost routines depend on the `LeftmostFirst` automata construction
+    /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
+    /// as a way to terminate the search and report a match. `LeftmostFirst`
+    /// also supports non-greedy matches using this strategy whereas `All`
+    /// does not.
+    ///
+    /// # Example: overlapping search
+    ///
+    /// This example shows the typical use of `MatchKind::All`, which is to
+    /// report overlapping matches.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, OverlappingState, dense},
+    ///     HalfMatch, MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(HalfMatch::must(1, 4));
+    /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(HalfMatch::must(0, 4));
+    /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: reverse automaton to find start of match
+    ///
+    /// Another example for using `MatchKind::All` is for constructing a
+    /// reverse automaton to find the start of a match. `All` semantics are
+    /// used for this in order to find the longest possible match, which
+    /// corresponds to the leftmost starting position.
+    ///
+    /// Note that if you need the starting position then
+    /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
+    /// you, so it's usually not necessary to do this yourself.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind};
+    ///
+    /// let haystack = "123foobar456".as_bytes();
+    /// let pattern = r"[a-z]+";
+    ///
+    /// let dfa_fwd = dense::DFA::new(pattern)?;
+    /// let dfa_rev = dense::Builder::new()
+    ///     .configure(dense::Config::new()
+    ///         .anchored(true)
+    ///         .match_kind(MatchKind::All)
+    ///     )
+    ///     .build(pattern)?;
+    /// let expected_fwd = HalfMatch::must(0, 9);
+    /// let expected_rev = HalfMatch::must(0, 3);
+    /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap();
+    /// // Here we don't specify the pattern to search for since there's only
+    /// // one pattern and we're doing a leftmost search. But if this were an
+    /// // overlapping search, you'd need to specify the pattern that matched
+    /// // in the forward direction. (Otherwise, you might wind up finding the
+    /// // starting position of a match of some other pattern.) That in turn
+    /// // requires building the reverse automaton with starts_for_each_pattern
+    /// // enabled. Indeed, this is what Regex does internally.
+    /// let got_rev = dfa_rev.find_leftmost_rev_at(
+    ///     None, haystack, 0, got_fwd.offset(),
+    /// )?.unwrap();
+    /// assert_eq!(expected_fwd, got_fwd);
+    /// assert_eq!(expected_rev, got_rev);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn match_kind(mut self, kind: MatchKind) -> Config {
+        self.match_kind = Some(kind);
+        self
+    }
+
+    /// Whether to compile a separate start state for each pattern in the
+    /// automaton.
+    ///
+    /// When enabled, a separate **anchored** start state is added for each
+    /// pattern in the DFA. When this start state is used, then the DFA will
+    /// only search for matches for the pattern specified, even if there are
+    /// other patterns in the DFA.
+    ///
+    /// The main downside of this option is that it can potentially increase
+    /// the size of the DFA and/or increase the time it takes to build the DFA.
+    ///
+    /// There are a few reasons one might want to enable this (it's disabled
+    /// by default):
+    ///
+    /// 1. When looking for the start of an overlapping match (using a
+    /// reverse DFA), doing it correctly requires starting the reverse search
+    /// using the starting state of the pattern that matched in the forward
+    /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex),
+    /// it will automatically enable this option when building the reverse DFA
+    /// internally.
+    /// 2. When you want to use a DFA with multiple patterns to both search
+    /// for matches of any pattern or to search for anchored matches of one
+    /// particular pattern while using the same DFA. (Otherwise, you would need
+    /// to compile a new DFA for each pattern.)
+    /// 3. Since the start states added for each pattern are anchored, if you
+    /// compile an unanchored DFA with one pattern while also enabling this
+    /// option, then you can use the same DFA to perform anchored or unanchored
+    /// searches. The latter you get with the standard search APIs. The former
+    /// you get from the various `_at` search methods that allow you to specify
+    /// a pattern ID to search for.
+    ///
+    /// By default this is disabled.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this option to permit the same DFA to
+    /// run both anchored and unanchored searches for a single pattern.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, PatternID,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().starts_for_each_pattern(true))
+    ///     .build(r"foo[0-9]+")?;
+    /// let haystack = b"quux foo123";
+    ///
+    /// // Here's a normal unanchored search. Notice that we use 'None' for the
+    /// // pattern ID. Since the DFA was built as an unanchored machine, it
+    /// // uses its default unanchored starting state.
+    /// let expected = HalfMatch::must(0, 11);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+    ///     None, None, haystack, 0, haystack.len(),
+    /// )?);
+    /// // But now if we explicitly specify the pattern to search ('0' being
+    /// // the only pattern in the DFA), then it will use the starting state
+    /// // for that specific pattern which is always anchored. Since the
+    /// // pattern doesn't have a match at the beginning of the haystack, we
+    /// // find nothing.
+    /// assert_eq!(None, dfa.find_leftmost_fwd_at(
+    ///     None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
+    /// )?);
+    /// // And finally, an anchored search is not the same as putting a '^' at
+    /// // the beginning of the pattern. An anchored search can only match at
+    /// // the beginning of the *search*, which we can change:
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+    ///     None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
+    /// )?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+        self.starts_for_each_pattern = Some(yes);
+        self
+    }
+
+    /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+    ///
+    /// This option is enabled by default and should never be disabled unless
+    /// one is debugging a generated DFA.
+    ///
+    /// When enabled, the DFA will use a map from all possible bytes to their
+    /// corresponding equivalence class. Each equivalence class represents a
+    /// set of bytes that does not discriminate between a match and a non-match
+    /// in the DFA. For example, the pattern `[ab]+` has at least two
+    /// equivalence classes: a set containing `a` and `b` and a set containing
+    /// every byte except for `a` and `b`. `a` and `b` are in the same
+    /// equivalence classes because they never discriminate between a match
+    /// and a non-match.
+    ///
+    /// The advantage of this map is that the size of the transition table
+    /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
+    /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence
+    /// classes (rounded up to the nearest power of 2). As a result, total
+    /// space usage can decrease substantially. Moreover, since a smaller
+    /// alphabet is used, DFA compilation becomes faster as well.
+    ///
+    /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+    /// does not yield any speed advantages. Namely, even when this is
+    /// disabled, a byte class map is still used while searching. The only
+    /// difference is that every byte will be forced into its own distinct
+    /// equivalence class. This is useful for debugging the actual generated
+    /// transitions because it lets one see the transitions defined on actual
+    /// bytes instead of the equivalence classes.
+    pub fn byte_classes(mut self, yes: bool) -> Config {
+        self.byte_classes = Some(yes);
+        self
+    }
+
+    /// Heuristically enable Unicode word boundaries.
+    ///
+    /// When set, this will attempt to implement Unicode word boundaries as if
+    /// they were ASCII word boundaries. This only works when the search input
+    /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+    /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+    ///
+    /// A possible alternative to enabling this option is to simply use an
+    /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+    /// option is if you absolutely need Unicode support. This option lets one
+    /// use a fast search implementation (a DFA) for some potentially very
+    /// common cases, while providing the option to fall back to some other
+    /// regex engine to handle the general case when an error is returned.
+    ///
+    /// If the pattern provided has no Unicode word boundary in it, then this
+    /// option has no effect. (That is, quitting on a non-ASCII byte only
+    /// occurs when this option is enabled _and_ a Unicode word boundary is
+    /// present in the pattern.)
+    ///
+    /// This is almost equivalent to setting all non-ASCII bytes to be quit
+    /// bytes. The only difference is that this will cause non-ASCII bytes to
+    /// be quit bytes _only_ when a Unicode word boundary is present in the
+    /// pattern.
+    ///
+    /// When enabling this option, callers _must_ be prepared to handle
+    /// a [`MatchError`](crate::MatchError) error during search.
+    /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
+    /// to using the `try_` suite of methods. Alternatively, if
+    /// callers can guarantee that their input is ASCII only, then a
+    /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+    /// returned while searching.
+    ///
+    /// This is disabled by default.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to heuristically enable Unicode word boundaries
+    /// in a pattern. It also shows what happens when a search comes across a
+    /// non-ASCII byte.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, MatchError, MatchKind,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().unicode_word_boundary(true))
+    ///     .build(r"\b[0-9]+\b")?;
+    ///
+    /// // The match occurs before the search ever observes the snowman
+    /// // character, so no error occurs.
+    /// let haystack = "foo 123 ☃".as_bytes();
+    /// let expected = Some(HalfMatch::must(0, 7));
+    /// let got = dfa.find_leftmost_fwd(haystack)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // Notice that this search fails, even though the snowman character
+    /// // occurs after the ending match offset. This is because search
+    /// // routines read one byte past the end of the search to account for
+    /// // look-around, and indeed, this is required here to determine whether
+    /// // the trailing \b matches.
+    /// let haystack = "foo 123☃".as_bytes();
+    /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
+    /// let got = dfa.find_leftmost_fwd(haystack);
+    /// assert_eq!(Err(expected), got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
+        // We have a separate option for this instead of just setting the
+        // appropriate quit bytes here because we don't want to set quit bytes
+        // for every regex. We only want to set them when the regex contains a
+        // Unicode word boundary.
+        self.unicode_word_boundary = Some(yes);
+        self
+    }
+
+    /// Add a "quit" byte to the DFA.
+    ///
+    /// When a quit byte is seen during search time, then search will return
+    /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
+    /// offset at which the search stopped.
+    ///
+    /// A quit byte will always overrule any other aspects of a regex. For
+    /// example, if the `x` byte is added as a quit byte and the regex `\w` is
+    /// used, then observing `x` will cause the search to quit immediately
+    /// despite the fact that `x` is in the `\w` class.
+    ///
+    /// This mechanism is primarily useful for heuristically enabling certain
+    /// features like Unicode word boundaries in a DFA. Namely, if the input
+    /// to search is ASCII, then a Unicode word boundary can be implemented
+    /// via an ASCII word boundary with no change in semantics. Thus, a DFA
+    /// can attempt to match a Unicode word boundary but give up as soon as it
+    /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
+    /// to be quit bytes, then Unicode word boundaries will be permitted when
+    /// building DFAs. Of course, callers should enable
+    /// [`Config::unicode_word_boundary`] if they want this behavior instead.
+    /// (The advantage being that non-ASCII quit bytes will only be added if a
+    /// Unicode word boundary is in the pattern.)
+    ///
+    /// When enabling this option, callers _must_ be prepared to handle a
+    /// [`MatchError`](crate::MatchError) error during search. When using a
+    /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the
+    /// `try_` suite of methods.
+    ///
+    /// By default, there are no quit bytes set.
+    ///
+    /// # Panics
+    ///
+    /// This panics if heuristic Unicode word boundaries are enabled and any
+    /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
+    /// Unicode word boundaries requires setting every non-ASCII byte to a quit
+    /// byte. So if the caller attempts to undo any of that, then this will
+    /// panic.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to cause a search to terminate if it sees a
+    /// `\n` byte. This could be useful if, for example, you wanted to prevent
+    /// a user supplied pattern from matching across a line boundary.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     HalfMatch, MatchError,
+    /// };
+    ///
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().quit(b'\n', true))
+    ///     .build(r"foo\p{any}+bar")?;
+    ///
+    /// let haystack = "foo\nbar".as_bytes();
+    /// // Normally this would produce a match, since \p{any} contains '\n'.
+    /// // But since we instructed the automaton to enter a quit state if a
+    /// // '\n' is observed, this produces a match error instead.
+    /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+    /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err();
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+        if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
+            panic!(
+                "cannot set non-ASCII byte to be non-quit when \
+                 Unicode word boundaries are enabled"
+            );
+        }
+        if self.quit.is_none() {
+            self.quit = Some(ByteSet::empty());
+        }
+        if yes {
+            self.quit.as_mut().unwrap().add(byte);
+        } else {
+            self.quit.as_mut().unwrap().remove(byte);
+        }
+        self
+    }
+
+    /// Set a size limit on the total heap used by a DFA.
+    ///
+    /// This size limit is expressed in bytes and is applied during
+    /// determinization of an NFA into a DFA. If the DFA's heap usage, and only
+    /// the DFA, exceeds this configured limit, then determinization is stopped
+    /// and an error is returned.
+    ///
+    /// This limit does not apply to auxiliary storage used during
+    /// determinization that isn't part of the generated DFA.
+    ///
+    /// This limit is only applied during determinization. Currently, there is
+    /// no way to postpone this check to after minimization if minimization
+    /// was enabled.
+    ///
+    /// The total limit on heap used during determinization is the sum of the
+    /// DFA and determinization size limits.
+    ///
+    /// The default is no limit.
+    ///
+    /// # Example
+    ///
+    /// This example shows a DFA that fails to build because of a configured
+    /// size limit. This particular example also serves as a cautionary tale
+    /// demonstrating just how big DFAs with large Unicode character classes
+    /// can get.
+    ///
+    /// ```
+    /// use regex_automata::dfa::{dense, Automaton};
+    ///
+    /// // 3MB isn't enough!
+    /// dense::Builder::new()
+    ///     .configure(dense::Config::new().dfa_size_limit(Some(3_000_000)))
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 4MB probably is!
+    /// // (Note that DFA sizes aren't necessarily stable between releases.)
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new().dfa_size_limit(Some(4_000_000)))
+    ///     .build(r"\w{20}")?;
+    /// let haystack = "A".repeat(20).into_bytes();
+    /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// While one needs a little more than 3MB to represent `\w{20}`, it
+    /// turns out that you only need a little more than 4KB to represent
+    /// `(?-u:\w{20})`. So only use Unicode if you need it!
+    pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+        self.dfa_size_limit = Some(bytes);
+        self
+    }
+
+    /// Set a size limit on the total heap used by determinization.
+    ///
+    /// This size limit is expressed in bytes and is applied during
+    /// determinization of an NFA into a DFA. If the heap used for auxiliary
+    /// storage during determinization (memory that is not in the DFA but
+    /// necessary for building the DFA) exceeds this configured limit, then
+    /// determinization is stopped and an error is returned.
+    ///
+    /// This limit does not apply to heap used by the DFA itself.
+    ///
+    /// The total limit on heap used during determinization is the sum of the
+    /// DFA and determinization size limits.
+    ///
+    /// The default is no limit.
+    ///
+    /// # Example
+    ///
+    /// This example shows a DFA that fails to build because of a
+    /// configured size limit on the amount of heap space used by
+    /// determinization. This particular example complements the example for
+    /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
+    /// potentially make DFAs themselves big, but it also results in more
+    /// auxiliary storage during determinization. (Although, auxiliary storage
+    /// is still not as much as the DFA itself.)
+    ///
+    /// ```
+    /// use regex_automata::dfa::{dense, Automaton};
+    ///
+    /// // 300KB isn't enough!
+    /// dense::Builder::new()
+    ///     .configure(dense::Config::new()
+    ///         .determinize_size_limit(Some(300_000))
+    ///     )
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 400KB probably is!
+    /// // (Note that auxiliary storage sizes aren't necessarily stable between
+    /// // releases.)
+    /// let dfa = dense::Builder::new()
+    ///     .configure(dense::Config::new()
+    ///         .determinize_size_limit(Some(400_000))
+    ///     )
+    ///     .build(r"\w{20}")?;
+    /// let haystack = "A".repeat(20).into_bytes();
+    /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config {
+        self.determinize_size_limit = Some(bytes);
+        self
+    }
+
+    /// Returns whether this configuration has enabled anchored searches.
+    pub fn get_anchored(&self) -> bool {
+        self.anchored.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration has enabled simple state
+    /// acceleration.
+    pub fn get_accelerate(&self) -> bool {
+        self.accelerate.unwrap_or(true)
+    }
+
+    /// Returns whether this configuration has enabled the expensive process
+    /// of minimizing a DFA.
+    pub fn get_minimize(&self) -> bool {
+        self.minimize.unwrap_or(false)
+    }
+
+    /// Returns the match semantics set in this configuration.
+    pub fn get_match_kind(&self) -> MatchKind {
+        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+    }
+
+    /// Returns whether this configuration has enabled anchored starting states
+    /// for every pattern in the DFA.
+    pub fn get_starts_for_each_pattern(&self) -> bool {
+        self.starts_for_each_pattern.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration has enabled byte classes or not.
+    /// This is typically a debugging oriented option, as disabling it confers
+    /// no speed benefit.
+    pub fn get_byte_classes(&self) -> bool {
+        self.byte_classes.unwrap_or(true)
+    }
+
+    /// Returns whether this configuration has enabled heuristic Unicode word
+    /// boundary support. When enabled, it is possible for a search to return
+    /// an error.
+    pub fn get_unicode_word_boundary(&self) -> bool {
+        self.unicode_word_boundary.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration will instruct the DFA to enter a
+    /// quit state whenever the given byte is seen during a search. When at
+    /// least one byte has this enabled, it is possible for a search to return
+    /// an error.
+    pub fn get_quit(&self, byte: u8) -> bool {
+        self.quit.map_or(false, |q| q.contains(byte))
+    }
+
+    /// Returns the DFA size limit of this configuration if one was set.
+    /// The size limit is the total number of bytes on the heap that a DFA is
+    /// permitted to use. If the DFA exceeds this limit during construction,
+    /// then construction is stopped and an error is returned.
+    pub fn get_dfa_size_limit(&self) -> Option<usize> {
+        self.dfa_size_limit.unwrap_or(None)
+    }
+
+    /// Returns the determinization size limit of this configuration if one
+    /// was set. The size limit is the total number of bytes on the heap that
+    /// determinization is permitted to use. If determinization exceeds this
+    /// limit during construction, then construction is stopped and an error is
+    /// returned.
+    ///
+    /// This is different from the DFA size limit in that this only applies to
+    /// the auxiliary storage used during determinization. Once determinization
+    /// is complete, this memory is freed.
+    ///
+    /// The limit on the total heap memory used is the sum of the DFA and
+    /// determinization size limits.
+    pub fn get_determinize_size_limit(&self) -> Option<usize> {
+        self.determinize_size_limit.unwrap_or(None)
+    }
+
+    /// Overwrite the default configuration such that the options in `o` are
+    /// always used. If an option in `o` is not set, then the corresponding
+    /// option in `self` is used. If it's not set in `self` either, then it
+    /// remains not set.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            anchored: o.anchored.or(self.anchored),
+            accelerate: o.accelerate.or(self.accelerate),
+            minimize: o.minimize.or(self.minimize),
+            match_kind: o.match_kind.or(self.match_kind),
+            starts_for_each_pattern: o
+                .starts_for_each_pattern
+                .or(self.starts_for_each_pattern),
+            byte_classes: o.byte_classes.or(self.byte_classes),
+            unicode_word_boundary: o
+                .unicode_word_boundary
+                .or(self.unicode_word_boundary),
+            quit: o.quit.or(self.quit),
+            dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
+            determinize_size_limit: o
+                .determinize_size_limit
+                .or(self.determinize_size_limit),
+        }
+    }
+}
+
+/// A builder for constructing a deterministic finite automaton from regular
+/// expressions.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match. For that information, use a
+/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
+/// use a DFA directly is if the end location of a match is enough for your use
+/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
+/// second reverse DFA is needed to find the start of a match.
+///
+/// Note that if one wants to build a sparse DFA, you must first build a dense
+/// DFA and convert that to a sparse DFA. There is no way to build a sparse
+/// DFA without first building a dense DFA.
+///
+/// # Example
+///
+/// This example shows how to build a DFA, with minimization explicitly
+/// turned off, that completely disables Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+///   and `\b` are ASCII-only while `.` matches any byte except for `\n`
+///   (instead of any UTF-8 encoding of a Unicode scalar value except for
+///   `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+///   things like `[^a]` that match any byte except for `a` are permitted.
+/// * Unanchored patterns can search through invalid UTF-8. That is, for
+///   unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
+///   `(?s:.)*?`.
+///
+/// ```
+/// use regex_automata::{
+///     dfa::{Automaton, dense},
+///     nfa::thompson,
+///     HalfMatch, SyntaxConfig,
+/// };
+///
+/// let dfa = dense::Builder::new()
+///     .configure(dense::Config::new().minimize(false))
+///     .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+///     .thompson(thompson::Config::new().utf8(false))
+///     .build(r"foo[^b]ar.*")?;
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.find_leftmost_fwd(haystack)?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+    /// The dense DFA configuration consulted by `build_from_nfa`.
+    config: Config,
+    /// The Thompson NFA builder used by `build_many` to compile patterns
+    /// into an NFA before a DFA is built from it.
+    thompson: thompson::Builder,
+}
+
+#[cfg(feature = "alloc")]
+impl Builder {
+    /// Create a new dense DFA builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder {
+            config: Config::default(),
+            thompson: thompson::Builder::new(),
+        }
+    }
+
+    /// Build a DFA from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<OwnedDFA, Error> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a DFA from the given patterns.
+    ///
+    /// When matches are returned, the pattern ID corresponds to the index of
+    /// the pattern in the slice given.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<OwnedDFA, Error> {
+        let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?;
+        self.build_from_nfa(&nfa)
+    }
+
+    /// Build a DFA from the given NFA.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to build a DFA if you already have an NFA in
+    /// hand.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, dense},
+    ///     nfa::thompson,
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let haystack = "foo123bar".as_bytes();
+    ///
+    /// // This shows how to set non-default options for building an NFA.
+    /// let nfa = thompson::Builder::new()
+    ///     .configure(thompson::Config::new().shrink(false))
+    ///     .build(r"[0-9]+")?;
+    /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
+    /// let expected = Some(HalfMatch::must(0, 6));
+    /// let got = dfa.find_leftmost_fwd(haystack)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn build_from_nfa(
+        &self,
+        nfa: &thompson::NFA,
+    ) -> Result<OwnedDFA, Error> {
+        let mut quit = self.config.quit.unwrap_or(ByteSet::empty());
+        if self.config.get_unicode_word_boundary()
+            && nfa.has_word_boundary_unicode()
+        {
+            for b in 0x80..=0xFF {
+                quit.add(b);
+            }
+        }
+        let classes = if !self.config.get_byte_classes() {
+            // DFAs will always use the equivalence class map, but enabling
+            // this option is useful for debugging. Namely, this will cause all
+            // transitions to be defined over their actual bytes instead of an
+            // opaque equivalence class identifier. The former is much easier
+            // to grok as a human.
+            ByteClasses::singletons()
+        } else {
+            let mut set = nfa.byte_class_set().clone();
+            // It is important to distinguish any "quit" bytes from all other
+            // bytes. Otherwise, a non-quit byte may end up in the same class
+            // as a quit byte, and thus cause the DFA stop when it shouldn't.
+            if !quit.is_empty() {
+                set.add_set(&quit);
+            }
+            set.byte_classes()
+        };
+
+        let mut dfa = DFA::initial(
+            classes,
+            nfa.pattern_len(),
+            self.config.get_starts_for_each_pattern(),
+        )?;
+        determinize::Config::new()
+            .anchored(self.config.get_anchored())
+            .match_kind(self.config.get_match_kind())
+            .quit(quit)
+            .dfa_size_limit(self.config.get_dfa_size_limit())
+            .determinize_size_limit(self.config.get_determinize_size_limit())
+            .run(nfa, &mut dfa)?;
+        if self.config.get_minimize() {
+            dfa.minimize();
+        }
+        if self.config.get_accelerate() {
+            dfa.accelerate();
+        }
+        Ok(dfa)
+    }
+
+    /// Apply the given dense DFA configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    ///
+    /// These settings only apply when constructing a DFA directly from a
+    /// pattern.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.thompson.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+    ///
+    /// This permits setting things like whether the DFA should match the regex
+    /// in reverse or if additional time should be spent shrinking the size of
+    /// the NFA.
+    ///
+    /// These settings only apply when constructing a DFA directly from a
+    /// pattern.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.thompson.configure(config);
+        self
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl Default for Builder {
+    fn default() -> Builder {
+        Builder::new()
+    }
+}
+
+/// A convenience alias for an owned DFA, i.e., a [`DFA`] instantiated with
+/// `Vec<u32>`. We use this particular instantiation a lot in this crate, so
+/// it's worth giving it a name. This instantiation is commonly used for
+/// mutable APIs on the DFA while building it. The main reason for making
+/// DFAs generic is no_std support, and more generally, making it possible to
+/// load a DFA from an arbitrary slice of bytes.
+#[cfg(feature = "alloc")]
+pub(crate) type OwnedDFA = DFA<Vec<u32>>;
+
+/// A dense table-based deterministic finite automaton (DFA).
+///
+/// All dense DFAs have one or more start states, zero or more match states
+/// and a transition table that maps the current state and the current byte
+/// of input to the next state. A DFA can use this information to implement
+/// fast searching. In particular, the use of a dense DFA generally makes the
+/// trade off that match speed is the most valuable characteristic, even if
+/// building the DFA may take significant time *and* space. (More concretely,
+/// building a DFA takes time and space that is exponential in the size of the
+/// pattern in the worst case.) As such, the processing of every byte of input
+/// is done with a small constant number of operations that does not vary with
+/// the pattern, its size or the size of the alphabet. If your needs don't line
+/// up with this trade off, then a dense DFA may not be an adequate solution to
+/// your problem.
+///
+/// In contrast, a [`sparse::DFA`] makes the opposite
+/// trade off: it uses less space but will execute a variable number of
+/// instructions per byte at match time, which makes it slower for matching.
+/// (Note that space usage is still exponential in the size of the pattern in
+/// the worst case.)
+///
+/// A DFA can be built using the default configuration via the
+/// [`DFA::new`] constructor. Otherwise, one can
+/// configure various aspects via [`dense::Builder`](Builder).
+///
+/// A single DFA fundamentally supports the following operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a DFA with multiple patterns, which pattern matched is
+///    reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::dfa::regex::Regex).
+///
+/// # Type parameters
+///
+/// A `DFA` has one type parameter, `T`, which is used to represent state IDs,
+/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`.
+///
+/// # The `Automaton` trait
+///
+/// This type implements the [`Automaton`] trait, which means it can be used
+/// for searching. For example:
+///
+/// ```
+/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let expected = HalfMatch::must(0, 8);
+/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA<T> {
+    /// The transition table for this DFA. This includes the transitions
+    /// themselves, along with the stride, number of states and the equivalence
+    /// class mapping.
+    tt: TransitionTable<T>,
+    /// The set of starting state identifiers for this DFA. The starting state
+    /// IDs act as pointers into the transition table. The specific starting
+    /// state chosen for each search is dependent on the context at which the
+    /// search begins.
+    st: StartTable<T>,
+    /// The set of match states and the patterns that match for each
+    /// corresponding match state.
+    ///
+    /// This structure is technically only needed because of support for
+    /// multi-regexes. Namely, multi-regexes require answering not just whether
+    /// a match exists, but _which_ patterns match. So we need to store the
+    /// matching pattern IDs for each match state. We do this even when there
+    /// is only one pattern for the sake of simplicity. In practice, this uses
+    /// up very little space for the case of one pattern.
+    ms: MatchStates<T>,
+    /// Information about which states are "special." Special states are states
+    /// that are dead, quit, matching, starting or accelerated. For more info,
+    /// see the docs for `Special`.
+    special: Special,
+    /// The accelerators for this DFA.
+    ///
+    /// If a state is accelerated, then there exist only a small number of
+    /// bytes that can cause the DFA to leave the state. This permits searching
+    /// to use optimized routines to find those specific bytes instead of using
+    /// the transition table.
+    ///
+    /// All accelerated states exist in a contiguous range in the DFA's
+    /// transition table. See dfa/special.rs for more details on how states are
+    /// arranged.
+    accels: Accels<T>,
+}
+
+#[cfg(feature = "alloc")]
+impl OwnedDFA {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding DFA.
+    ///
+    /// If you want a non-default configuration, then use the
+    /// [`dense::Builder`](Builder) to set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// let dfa = dense::DFA::new("foo[0-9]+bar")?;
+    /// let expected = HalfMatch::must(0, 11);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<OwnedDFA, Error> {
+        Builder::new().build(pattern)
+    }
+
+    /// Parse the given regular expressions using a default configuration and
+    /// return the corresponding multi-DFA.
+    ///
+    /// If you want a non-default configuration, then use the
+    /// [`dense::Builder`](Builder) to set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+    /// let expected = HalfMatch::must(1, 3);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<OwnedDFA, Error> {
+        Builder::new().build_many(patterns)
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl OwnedDFA {
+    /// Create a new DFA that matches every input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// let dfa = dense::DFA::always_match()?;
+    ///
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn always_match() -> Result<OwnedDFA, Error> {
+        let nfa = thompson::NFA::always_match();
+        Builder::new().build_from_nfa(&nfa)
+    }
+
+    /// Create a new DFA that never matches any input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, dense};
+    ///
+    /// let dfa = dense::DFA::never_match()?;
+    /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
+    /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn never_match() -> Result<OwnedDFA, Error> {
+        let nfa = thompson::NFA::never_match();
+        Builder::new().build_from_nfa(&nfa)
+    }
+
+    /// Create an initial DFA with the given equivalence classes, pattern count
+    /// and whether anchored starting states are enabled for each pattern. An
+    /// initial DFA can be further mutated via determinization.
+    fn initial(
+        classes: ByteClasses,
+        pattern_count: usize,
+        starts_for_each_pattern: bool,
+    ) -> Result<OwnedDFA, Error> {
+        let start_pattern_count =
+            if starts_for_each_pattern { pattern_count } else { 0 };
+        Ok(DFA {
+            tt: TransitionTable::minimal(classes),
+            st: StartTable::dead(start_pattern_count)?,
+            ms: MatchStates::empty(pattern_count),
+            special: Special::new(),
+            accels: Accels::empty(),
+        })
+    }
+}
+
+impl<T: AsRef<[u32]>> DFA<T> {
+    /// Cheaply return a borrowed version of this dense DFA. Specifically,
+    /// the DFA returned always uses `&[u32]` for its transition table.
+    pub fn as_ref(&self) -> DFA<&'_ [u32]> {
+        DFA {
+            tt: self.tt.as_ref(),
+            st: self.st.as_ref(),
+            ms: self.ms.as_ref(),
+            special: self.special,
+            accels: self.accels(),
+        }
+    }
+
+    /// Return an owned version of this dense DFA. Specifically, the DFA
+    /// returned always uses `Vec<u32>` for its transition table.
+    ///
+    /// Effectively, this returns a dense DFA whose transition table lives on
+    /// the heap.
+    #[cfg(feature = "alloc")]
+    pub fn to_owned(&self) -> OwnedDFA {
+        DFA {
+            tt: self.tt.to_owned(),
+            st: self.st.to_owned(),
+            ms: self.ms.to_owned(),
+            special: self.special,
+            accels: self.accels().to_owned(),
+        }
+    }
+
+    /// Returns true only if this DFA has starting states for each pattern.
+    ///
+    /// When a DFA has starting states for each pattern, then a search with the
+    /// DFA can be configured to only look for anchored matches of a specific
+    /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
+    /// can accept a non-None `pattern_id` if and only if this method returns
+    /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+    ///
+    /// Note that if the DFA has no patterns, this always returns false.
+    pub fn has_starts_for_each_pattern(&self) -> bool {
+        self.st.patterns > 0 // `initial` passes 0 if not enabled
+    }
+
+    /// Returns the total number of elements in the alphabet for this DFA.
+    ///
+    /// That is, this returns the total number of transitions that each state
+    /// in this DFA must have. Typically, a normal byte oriented DFA would
+    /// always have an alphabet size of 256, corresponding to the number of
+    /// unique values in a single byte. However, this implementation has two
+    /// peculiarities that impact the alphabet length:
+    ///
+    /// * Every state has a special "EOI" transition that is only followed
+    /// after the end of some haystack is reached. This EOI transition is
+    /// necessary to account for one byte of look-ahead when implementing
+    /// things like `\b` and `$`.
+    /// * Bytes are grouped into equivalence classes such that no two bytes in
+    /// the same class can distinguish a match from a non-match. For example,
+    /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the
+    /// same equivalence class. This leads to a massive space savings.
+    ///
+    /// Note though that the alphabet length does _not_ necessarily equal the
+    /// total stride space taken up by a single DFA state in the transition
+    /// table. Namely, for performance reasons, the stride is always the
+    /// smallest power of two that is greater than or equal to the alphabet
+    /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
+    /// often more useful. The alphabet length is typically useful only for
+    /// informational purposes.
+    pub fn alphabet_len(&self) -> usize {
+        self.tt.alphabet_len() // tracked by the transition table
+    }
+
+    /// Returns the total stride for every state in this DFA, expressed as the
+    /// exponent of a power of 2. The stride is the amount of space each state
+    /// takes up in the transition table, expressed as a number of transitions.
+    /// (Unused transitions map to dead states.)
+    ///
+    /// The stride of a DFA is always equivalent to the smallest power of 2
+    /// that is greater than or equal to the DFA's alphabet length. This
+    /// definition uses extra space, but permits faster translation between
+    /// premultiplied state identifiers and contiguous indices (by using shifts
+    /// instead of relying on integer division).
+    ///
+    /// For example, if the DFA's stride is 16 transitions, then its `stride2`
+    /// is `4` since `2^4 = 16`.
+    ///
+    /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+    /// while the maximum `stride2` value is `9` (corresponding to a stride of
+    /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
+    /// when accounting for the special EOI transition. However, an alphabet
+    /// length of that size is exceptionally rare since the alphabet is shrunk
+    /// into equivalence classes.
+    pub fn stride2(&self) -> usize {
+        self.tt.stride2 // stored directly on the transition table
+    }
+
+    /// Returns the total stride for every state in this DFA. This corresponds
+    /// to the total number of transitions used by each state in this DFA's
+    /// transition table.
+    ///
+    /// Please see [`DFA::stride2`] for more information. In particular, this
+    /// returns the stride as the number of transitions, whereas `stride2`
+    /// returns it as the exponent of a power of 2.
+    pub fn stride(&self) -> usize {
+        self.tt.stride()
+    }
+
+    /// Returns the "universal" start state for this DFA.
+    ///
+    /// A universal start state occurs only when all of the starting states
+    /// for this DFA are precisely the same. This occurs when there are no
+    /// look-around assertions at the beginning (or end for a reverse DFA) of
+    /// the pattern.
+    ///
+    /// Using this as a starting state for a DFA without a universal starting
+    /// state has unspecified behavior. This condition is not checked, so the
+    /// caller must guarantee it themselves.
+    pub(crate) fn universal_start_state(&self) -> StateID {
+        // Which start configuration we ask for doesn't really matter here.
+        // We choose 'NonWordByte' only because it is the 'main' starting
+        // configuration used in determinization.
+        //
+        // TODO: We might consider exposing this routine, but it seems a
+        // little tricky to use correctly. Maybe if we also expose a
+        // 'has_universal_start_state' method?
+        self.st.start(Start::NonWordByte, None)
+    }
+
+    /// Returns the memory usage, in bytes, of this DFA.
+    ///
+    /// The memory usage is computed based on the number of bytes used to
+    /// represent this DFA.
+    ///
+    /// This does **not** include the stack size used up by this DFA. To
+    /// compute that, use `std::mem::size_of::<dense::DFA>()`.
+    pub fn memory_usage(&self) -> usize {
+        self.tt.memory_usage()
+            + self.st.memory_usage()
+            + self.ms.memory_usage()
+            + self.accels.memory_usage()
+    }
+}
+
+/// Routines for converting a dense DFA to other representations, such as
+/// sparse DFAs or raw bytes suitable for persistent storage.
+impl<T: AsRef<[u32]>> DFA<T> {
+    /// Convert this dense DFA to a sparse DFA.
+    ///
+    /// If a `StateID` is too small to represent all states in the sparse
+    /// DFA, then this returns an error. In most cases, if a dense DFA is
+    /// constructable with `StateID` then a sparse DFA will be as well.
+    /// However, it is not guaranteed.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// let dense = dense::DFA::new("foo[0-9]+")?;
+    /// let sparse = dense.to_sparse()?;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, Error> {
+        sparse::DFA::from_dense(self) // conversion lives in `sparse`
+    }
+
+    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+    /// format. Upon success, the `Vec<u8>` and the initial padding length are
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+    /// an address that does not have the same alignment as `u32`. The padding
+    /// corresponds to the number of leading bytes written to the returned
+    /// `Vec<u8>`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using to_bytes_little_endian would work on a little endian target.
+    /// let (buf, _) = original_dfa.to_bytes_native_endian();
+    /// // Even if buf has initial padding, DFA::from_bytes will automatically
+    /// // ignore it.
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) {
+        self.to_bytes::<bytes::LE>() // shared impl, little endian
+    }
+
+    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+    /// format. Upon success, the `Vec<u8>` and the initial padding length are
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+    /// an address that does not have the same alignment as `u32`. The padding
+    /// corresponds to the number of leading bytes written to the returned
+    /// `Vec<u8>`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using to_bytes_big_endian would work on a big endian target.
+    /// let (buf, _) = original_dfa.to_bytes_native_endian();
+    /// // Even if buf has initial padding, DFA::from_bytes will automatically
+    /// // ignore it.
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
+        self.to_bytes::<bytes::BE>() // shared impl, big endian
+    }
+
+    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+    /// format. Upon success, the `Vec<u8>` and the initial padding length are
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+    /// an address that does not have the same alignment as `u32`. The padding
+    /// corresponds to the number of leading bytes written to the returned
+    /// `Vec<u8>`.
+    ///
+    /// Generally speaking, native endian format should only be used when
+    /// you know that the target you're compiling the DFA for matches the
+    /// endianness of the target on which you're compiling DFA. For example,
+    /// if serialization and deserialization happen in the same process or on
+    /// the same machine. Otherwise, when serializing a DFA for use in a
+    /// portable environment, you'll almost certainly want to serialize _both_
+    /// a little endian and a big endian version and then load the correct one
+    /// based on the target's configuration.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// let (buf, _) = original_dfa.to_bytes_native_endian();
+    /// // Even if buf has initial padding, DFA::from_bytes will automatically
+    /// // ignore it.
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
+        self.to_bytes::<bytes::NE>() // shared impl, native endian
+    }
+
+    /// The implementation of the public `to_bytes` serialization methods,
+    /// which is generic over endianness.
+    #[cfg(feature = "alloc")]
+    fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) {
+        let len = self.write_to_len();
+        let (mut buf, padding) = bytes::alloc_aligned_buffer::<u32>(len);
+        // This should always succeed since the only possible serialization
+        // error is providing a buffer that's too small, but we've ensured that
+        // `buf` is big enough here.
+        self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap();
+        (buf, padding)
+    }
+
+    /// Serialize this DFA as raw bytes to the given slice, in little endian
+    /// format. Upon success, the total number of bytes written to `dst` is
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Note that unlike the various `to_bytes_*` routines, this does not write
+    /// any padding. Callers are responsible for handling alignment correctly.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error if the given destination slice is not big enough
+    /// to contain the full serialized DFA. If an error occurs, then nothing
+    /// is written to `dst`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA without
+    /// dynamic memory allocation.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Create a 4KB buffer on the stack to store our serialized DFA.
+    /// let mut buf = [0u8; 4 * (1<<10)];
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using write_to_little_endian would work on a little endian target.
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_little_endian(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        self.as_ref().write_to::<bytes::LE>(dst) // borrowed view, LE order
+    }
+
+    /// Serialize this DFA as raw bytes to the given slice, in big endian
+    /// format. Upon success, the total number of bytes written to `dst` is
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Note that unlike the various `to_bytes_*` routines, this does not write
+    /// any padding. Callers are responsible for handling alignment correctly.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error if the given destination slice is not big enough
+    /// to contain the full serialized DFA. If an error occurs, then nothing
+    /// is written to `dst`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA without
+    /// dynamic memory allocation.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Create a 4KB buffer on the stack to store our serialized DFA.
+    /// let mut buf = [0u8; 4 * (1<<10)];
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using write_to_big_endian would work on a big endian target.
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_big_endian(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        self.as_ref().write_to::<bytes::BE>(dst) // borrowed view, BE order
+    }
+
+    /// Serialize this DFA as raw bytes to the given slice, in native endian
+    /// format. Upon success, the total number of bytes written to `dst` is
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Generally speaking, native endian format should only be used when
+    /// you know that the target you're compiling the DFA for matches the
+    /// endianness of the target on which you're compiling DFA. For example,
+    /// if serialization and deserialization happen in the same process or on
+    /// the same machine. Otherwise, when serializing a DFA for use in a
+    /// portable environment, you'll almost certainly want to serialize _both_
+    /// a little endian and a big endian version and then load the correct one
+    /// based on the target's configuration.
+    ///
+    /// Note that unlike the various `to_bytes_*` routines, this does not write
+    /// any padding. Callers are responsible for handling alignment correctly.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error if the given destination slice is not big enough
+    /// to contain the full serialized DFA. If an error occurs, then nothing
+    /// is written to `dst`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA without
+    /// dynamic memory allocation.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Create a 4KB buffer on the stack to store our serialized DFA.
+    /// let mut buf = [0u8; 4 * (1<<10)];
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_native_endian(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        self.as_ref().write_to::<bytes::NE>(dst) // borrowed view, NE order
+    }
+
+    /// Return the total number of bytes required to serialize this DFA.
+    ///
+    /// This is useful for determining the size of the buffer required to pass
+    /// to one of the serialization routines:
+    ///
+    /// * [`DFA::write_to_little_endian`]
+    /// * [`DFA::write_to_big_endian`]
+    /// * [`DFA::write_to_native_endian`]
+    ///
+    /// Passing a buffer smaller than the size returned by this method will
+    /// result in a serialization error. Serialization routines are guaranteed
+    /// to succeed when the buffer is big enough.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to dynamically allocate enough room to serialize
+    /// a DFA.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// let mut buf = vec![0; original_dfa.write_to_len()];
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// Note that this example isn't actually guaranteed to work! In
+    /// particular, if `buf` is not aligned to a 4-byte boundary, then the
+    /// `DFA::from_bytes` call will fail. If you need this to work, then you
+    /// either need to deal with adding some initial padding yourself, or use
+    /// one of the `to_bytes` methods, which will do it for you.
+    pub fn write_to_len(&self) -> usize {
+        bytes::write_label_len(LABEL)
+        + bytes::write_endianness_check_len()
+        + bytes::write_version_len()
+        + size_of::<u32>() // unused, intended for future flexibility
+        + self.tt.write_to_len()
+        + self.st.write_to_len()
+        + self.ms.write_to_len()
+        + self.special.write_to_len()
+        + self.accels.write_to_len()
+    }
+}
+
+impl<'a> DFA<&'a [u32]> {
+    /// Safely deserialize a DFA with a specific state identifier
+    /// representation. Upon success, this returns both the deserialized DFA
+    /// and the number of bytes read from the given slice. Namely, the contents
+    /// of the slice beyond the DFA are not read.
+    ///
+    /// Deserializing a DFA using this routine will never allocate heap memory.
+    /// For safety purposes, the DFA's transition table will be verified such
+    /// that every transition points to a valid state. If this verification is
+    /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
+    /// will always execute in constant time.
+    ///
+    /// The bytes given must be generated by one of the serialization APIs
+    /// of a `DFA` using a semver compatible release of this crate. Those
+    /// include:
+    ///
+    /// * [`DFA::to_bytes_little_endian`]
+    /// * [`DFA::to_bytes_big_endian`]
+    /// * [`DFA::to_bytes_native_endian`]
+    /// * [`DFA::write_to_little_endian`]
+    /// * [`DFA::write_to_big_endian`]
+    /// * [`DFA::write_to_native_endian`]
+    ///
+    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along
+    /// with handling alignment correctly. The `write_to` methods do not
+    /// allocate and write to an existing slice (which may be on the stack).
+    /// Since deserialization always uses the native endianness of the target
+    /// platform, the serialization API you use should match the endianness of
+    /// the target platform. (It's often a good idea to generate serialized
+    /// DFAs for both forms of endianness and then load the correct one based
+    /// on endianness.)
+    ///
+    /// # Errors
+    ///
+    /// Generally speaking, it's easier to state the conditions in which an
+    /// error is _not_ returned. All of the following must be true:
+    ///
+    /// * The bytes given must be produced by one of the serialization APIs
+    ///   on this DFA, as mentioned above.
+    /// * The endianness of the target platform matches the endianness used to
+    ///   serialize the provided DFA.
+    /// * The slice given must have the same alignment as `u32`.
+    ///
+    /// If any of the above are not true, then an error will be returned.
+    ///
+    /// # Panics
+    ///
+    /// This routine will never panic for any input.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize a DFA to raw bytes, deserialize it
+    /// and then use it for searching.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// let initial = DFA::new("foo[0-9]+")?;
+    /// let (bytes, _) = initial.to_bytes_native_endian();
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: dealing with alignment and padding
+    ///
+    /// In the above example, we used the `to_bytes_native_endian` method to
+    /// serialize a DFA, but we ignored part of its return value corresponding
+    /// to padding added to the beginning of the serialized DFA. This is OK
+    /// because deserialization will skip this initial padding. What matters
+    /// is that the address immediately following the padding has an alignment
+    /// that matches `u32`. That is, the following is an equivalent but
+    /// alternative way to write the above example:
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// let initial = DFA::new("foo[0-9]+")?;
+    /// // Serialization returns the number of leading padding bytes added to
+    /// // the returned Vec<u8>.
+    /// let (bytes, pad) = initial.to_bytes_native_endian();
+    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// This padding is necessary because Rust's standard library does
+    /// not expose any safe and robust way of creating a `Vec<u8>` with a
+    /// guaranteed alignment other than 1. Now, in practice, the underlying
+    /// allocator is likely to provide a `Vec<u8>` that meets our alignment
+    /// requirements, which means `pad` is zero in practice most of the time.
+    ///
+    /// The purpose of exposing the padding like this is flexibility for the
+    /// caller. For example, if one wants to embed a serialized DFA into a
+    /// compiled program, then it's important to guarantee that it starts at a
+    /// `u32`-aligned address. The simplest way to do this is to discard the
+    /// padding bytes and set it up so that the serialized DFA itself begins at
+    /// a properly aligned address. We can show this in two parts. The first
+    /// part is serializing the DFA to a file:
+    ///
+    /// ```no_run
+    /// use regex_automata::dfa::{Automaton, dense::DFA};
+    ///
+    /// let dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// let (bytes, pad) = dfa.to_bytes_big_endian();
+    /// // Write the contents of the DFA *without* the initial padding.
+    /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?;
+    ///
+    /// // Do it again, but this time for little endian.
+    /// let (bytes, pad) = dfa.to_bytes_little_endian();
+    /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And now the second part is embedding the DFA into the compiled program
+    /// and deserializing it at runtime on first use. We use conditional
+    /// compilation to choose the correct endianness.
+    ///
+    /// ```no_run
+    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
+    ///
+    /// type S = u32;
+    /// type DFA = dense::DFA<&'static [S]>;
+    ///
+    /// fn get_foo() -> &'static DFA {
+    ///     use std::cell::Cell;
+    ///     use std::mem::MaybeUninit;
+    ///     use std::sync::Once;
+    ///
+    ///     // This struct with a generic B is used to permit unsizing
+    ///     // coercions, specifically, where B winds up being a [u8]. We also
+    ///     // need repr(C) to guarantee that _align comes first, which forces
+    ///     // a correct alignment.
+    ///     #[repr(C)]
+    ///     struct Aligned<B: ?Sized> {
+    ///         _align: [S; 0],
+    ///         bytes: B,
+    ///     }
+    ///
+    ///     # const _: &str = stringify! {
+    ///     // This assignment is made possible (implicitly) via the
+    ///     // CoerceUnsized trait.
+    ///     static ALIGNED: &Aligned<[u8]> = &Aligned {
+    ///         _align: [],
+    ///         #[cfg(target_endian = "big")]
+    ///         bytes: *include_bytes!("foo.bigendian.dfa"),
+    ///         #[cfg(target_endian = "little")]
+    ///         bytes: *include_bytes!("foo.littleendian.dfa"),
+    ///     };
+    ///     # };
+    ///     # static ALIGNED: &Aligned<[u8]> = &Aligned {
+    ///     #     _align: [],
+    ///     #     bytes: [],
+    ///     # };
+    ///
+    ///     struct Lazy(Cell<MaybeUninit<DFA>>);
+    ///     // SAFETY: This is safe because DFA impls Sync.
+    ///     unsafe impl Sync for Lazy {}
+    ///
+    ///     static INIT: Once = Once::new();
+    ///     static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
+    ///
+    ///     INIT.call_once(|| {
+    ///         let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+    ///             .expect("serialized DFA should be valid");
+    ///         // SAFETY: This is guaranteed to only execute once, and all
+    ///         // we do with the pointer is write the DFA to it.
+    ///         unsafe {
+    ///             (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
+    ///         }
+    ///     });
+    ///     // SAFETY: DFA is guaranteed to be initialized via INIT and is
+    ///     // stored in static memory.
+    ///     unsafe {
+    ///         let dfa = (*DFA.0.as_ptr()).as_ptr();
+    ///         std::mem::transmute::<*const DFA, &'static DFA>(dfa)
+    ///     }
+    /// }
+    ///
+    /// let dfa = get_foo();
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+    /// ```
+    ///
+    /// Alternatively, consider using
+    /// [`lazy_static`](https://crates.io/crates/lazy_static)
+    /// or
+    /// [`once_cell`](https://crates.io/crates/once_cell),
+    /// which will guarantee safety for you. You will still need to use the
+    /// `Aligned` trick above to force correct alignment, but this is safe to
+    /// do and `from_bytes` will return an error if you get it wrong.
+    pub fn from_bytes(
+        slice: &'a [u8],
+    ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
+        // SAFETY: This is safe because we validate the transition table,
+        // start table, match states and accelerators below. If any of those
+        // validations fails, then we return an error.
+        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
+        dfa.tt.validate()?;
+        dfa.st.validate(&dfa.tt)?;
+        dfa.ms.validate(&dfa)?;
+        dfa.accels.validate()?;
+        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
+        // so it has already been validated.
+        Ok((dfa, nread))
+    }
+
+    /// Deserialize a DFA with a specific state identifier representation in
+    /// constant time by omitting the verification of the validity of the
+    /// transition table and other data inside the DFA.
+    ///
+    /// This is just like [`DFA::from_bytes`], except it can potentially return
+    /// a DFA that exhibits undefined behavior if its transition table contains
+    /// invalid state identifiers.
+    ///
+    /// This routine is useful if you need to deserialize a DFA cheaply
+    /// and cannot afford the transition table validation performed by
+    /// `from_bytes`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
+    ///
+    /// let initial = DFA::new("foo[0-9]+")?;
+    /// let (bytes, _) = initial.to_bytes_native_endian();
+    /// // SAFETY: This is guaranteed to be safe since the bytes given come
+    /// // directly from a compatible serialization routine.
+    /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub unsafe fn from_bytes_unchecked(
+        slice: &'a [u8],
+    ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
+        let mut nr = 0;
+
+        nr += bytes::skip_initial_padding(slice);
+        bytes::check_alignment::<StateID>(&slice[nr..])?;
+        nr += bytes::read_label(&slice[nr..], LABEL)?;
+        nr += bytes::read_endianness_check(&slice[nr..])?;
+        nr += bytes::read_version(&slice[nr..], VERSION)?;
+
+        let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+        nr += size_of::<u32>();
+
+        let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
+        nr += nread;
+
+        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
+        nr += nread;
+
+        let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?;
+        nr += nread;
+
+        let (special, nread) = Special::from_bytes(&slice[nr..])?;
+        nr += nread;
+        special.validate_state_count(tt.count(), tt.stride2)?;
+
+        let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
+        nr += nread;
+
+        Ok((DFA { tt, st, ms, special, accels }, nr))
+    }
+
+    /// The implementation of the public `write_to` serialization methods,
+    /// which is generic over endianness.
+    ///
+    /// This is defined only for &[u32] to reduce binary size/compilation time.
+    fn write_to<E: Endian>(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("dense DFA"));
+        }
+        dst = &mut dst[..nwrite];
+
+        let mut nw = 0;
+        nw += bytes::write_label(LABEL, &mut dst[nw..])?;
+        nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
+        nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
+        nw += {
+            // Currently unused, intended for future flexibility
+            E::write_u32(0, &mut dst[nw..]);
+            size_of::<u32>()
+        };
+        nw += self.tt.write_to::<E>(&mut dst[nw..])?;
+        nw += self.st.write_to::<E>(&mut dst[nw..])?;
+        nw += self.ms.write_to::<E>(&mut dst[nw..])?;
+        nw += self.special.write_to::<E>(&mut dst[nw..])?;
+        nw += self.accels.write_to::<E>(&mut dst[nw..])?;
+        Ok(nw)
+    }
+}
+
+/// The following methods implement mutable routines on the internal
+/// representation of a DFA. As such, we must fix the first type parameter to a
+/// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
+/// can get away with this because these methods are internal to the crate and
+/// are exclusively used during construction of the DFA.
+#[cfg(feature = "alloc")]
+impl OwnedDFA {
+    /// Add a start state of this DFA.
+    pub(crate) fn set_start_state(
+        &mut self,
+        index: Start,
+        pattern_id: Option<PatternID>,
+        id: StateID,
+    ) {
+        assert!(self.tt.is_valid(id), "invalid start state");
+        self.st.set_start(index, pattern_id, id);
+    }
+
+    /// Set the given transition to this DFA. Both the `from` and `to` states
+    /// must already exist.
+    pub(crate) fn set_transition(
+        &mut self,
+        from: StateID,
+        byte: alphabet::Unit,
+        to: StateID,
+    ) {
+        self.tt.set(from, byte, to);
+    }
+
+    /// An an empty state (a state where all transitions lead to a dead state)
+    /// and return its identifier. The identifier returned is guaranteed to
+    /// not point to any other existing state.
+    ///
+    /// If adding a state would exceed `StateID::LIMIT`, then this returns an
+    /// error.
+    pub(crate) fn add_empty_state(&mut self) -> Result<StateID, Error> {
+        self.tt.add_empty_state()
+    }
+
+    /// Swap the two states given in the transition table.
+    ///
+    /// This routine does not do anything to check the correctness of this
+    /// swap. Callers must ensure that other states pointing to id1 and id2 are
+    /// updated appropriately.
+    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
+        self.tt.swap(id1, id2);
+    }
+
+    /// Truncate the states in this DFA to the given count.
+    ///
+    /// This routine does not do anything to check the correctness of this
+    /// truncation. Callers must ensure that other states pointing to truncated
+    /// states are updated appropriately.
+    pub(crate) fn truncate_states(&mut self, count: usize) {
+        self.tt.truncate(count);
+    }
+
+    /// Return a mutable representation of the state corresponding to the given
+    /// id. This is useful for implementing routines that manipulate DFA states
+    /// (e.g., swapping states).
+    pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+        self.tt.state_mut(id)
+    }
+
+    /// Minimize this DFA in place using Hopcroft's algorithm.
+    ///
+    /// The work is delegated to a `Minimizer` that is constructed from a
+    /// mutable borrow of this DFA and then run immediately.
+    pub(crate) fn minimize(&mut self) {
+        Minimizer::new(self).run();
+    }
+
+    /// Updates the match state pattern ID map to use the one provided.
+    ///
+    /// This is useful when it's convenient to manipulate matching states
+    /// (and their corresponding pattern IDs) as a map. In particular, the
+    /// representation used by a DFA for this map is not amenable to mutation,
+    /// so if things need to be changed (like when shuffling states), it's
+    /// often easier to work with the map form.
+    pub(crate) fn set_pattern_map(
+        &mut self,
+        map: &BTreeMap<StateID, Vec<PatternID>>,
+    ) -> Result<(), Error> {
+        self.ms = self.ms.new_with_map(map)?;
+        Ok(())
+    }
+
+    /// Find states that have a small number of non-loop transitions and mark
+    /// them as candidates for acceleration during search.
+    ///
+    /// States that can be accelerated are shuffled (via pairwise swaps) so
+    /// that they form a single contiguous range of state IDs. The accelerated
+    /// range in `self.special` is updated to match and the accelerators
+    /// themselves are recorded in `self.accels`.
+    ///
+    /// This is a no-op when the DFA has at most two states or when no state
+    /// can be accelerated.
+    ///
+    /// This panics if the special state ranges fail validation after
+    /// shuffling, or if the accelerated states do not end up contiguous.
+    pub(crate) fn accelerate(&mut self) {
+        // dead and quit states can never be accelerated.
+        if self.state_count() <= 2 {
+            return;
+        }
+
+        // Go through every state and record their accelerator, if possible.
+        let mut accels = BTreeMap::new();
+        // Count the number of accelerated match, start and non-match/start
+        // states.
+        let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0);
+        for state in self.states() {
+            if let Some(accel) = state.accelerate(self.byte_classes()) {
+                accels.insert(state.id(), accel);
+                if self.is_match_state(state.id()) {
+                    cmatch += 1;
+                } else if self.is_start_state(state.id()) {
+                    cstart += 1;
+                } else {
+                    assert!(!self.is_dead_state(state.id()));
+                    assert!(!self.is_quit_state(state.id()));
+                    cnormal += 1;
+                }
+            }
+        }
+        // If no states were able to be accelerated, then we're done.
+        if accels.is_empty() {
+            return;
+        }
+        // Remember how many accelerators were found before any shuffling.
+        // This is checked against the final accelerated range below.
+        let original_accels_len = accels.len();
+
+        // A remapper keeps track of state ID changes. Once we're done
+        // shuffling, the remapper is used to rewrite all transitions in the
+        // DFA based on the new positions of states.
+        let mut remapper = Remapper::from_dfa(self);
+
+        // As we swap states, if they are match states, we need to swap their
+        // pattern ID lists too (for multi-regexes). We do this by converting
+        // the lists to an easily swappable map, and then convert back to
+        // MatchStates once we're done.
+        let mut new_matches = self.ms.to_map(self);
+
+        // There is at least one state that gets accelerated, so these are
+        // guaranteed to get set to sensible values below.
+        self.special.min_accel = StateID::MAX;
+        self.special.max_accel = StateID::ZERO;
+        // Widens the accelerated range in 'special' to include 'accel_id'.
+        let update_special_accel =
+            |special: &mut Special, accel_id: StateID| {
+                special.min_accel = cmp::min(special.min_accel, accel_id);
+                special.max_accel = cmp::max(special.max_accel, accel_id);
+            };
+
+        // Start by shuffling match states. Any match states that are
+        // accelerated get moved to the end of the match state range.
+        if cmatch > 0 && self.special.matches() {
+            // N.B. special.{min,max}_match do not need updating, since the
+            // range/number of match states does not change. Only the ordering
+            // of match states may change.
+            let mut next_id = self.special.max_match;
+            let mut cur_id = next_id;
+            while cur_id >= self.special.min_match {
+                if let Some(accel) = accels.remove(&cur_id) {
+                    accels.insert(next_id, accel);
+                    update_special_accel(&mut self.special, next_id);
+
+                    // No need to do any actual swapping for equivalent IDs.
+                    if cur_id != next_id {
+                        remapper.swap(self, cur_id, next_id);
+
+                        // Swap pattern IDs for match states.
+                        let cur_pids = new_matches.remove(&cur_id).unwrap();
+                        let next_pids = new_matches.remove(&next_id).unwrap();
+                        new_matches.insert(cur_id, next_pids);
+                        new_matches.insert(next_id, cur_pids);
+                    }
+                    next_id = self.tt.prev_state_id(next_id);
+                }
+                cur_id = self.tt.prev_state_id(cur_id);
+            }
+        }
+
+        // This is where it gets tricky. Without acceleration, start states
+        // normally come right after match states. But we want accelerated
+        // states to be a single contiguous range (to make it very fast
+        // to determine whether a state *is* accelerated), while also keeping
+        // match and starting states as contiguous ranges for the same reason.
+        // So what we do here is shuffle states such that it looks like this:
+        //
+        //     DQMMMMAAAAASSSSSSNNNNNNN
+        //         |         |
+        //         |---------|
+        //      accelerated states
+        //
+        // Where:
+        //   D - dead state
+        //   Q - quit state
+        //   M - match state (may be accelerated)
+        //   A - normal state that is accelerated
+        //   S - start state (may be accelerated)
+        //   N - normal state that is NOT accelerated
+        //
+        // We implement this by shuffling states, which is done by a sequence
+        // of pairwise swaps. We start by looking at all normal states to be
+        // accelerated. When we find one, we swap it with the earliest starting
+        // state, and then swap that with the earliest normal state. This
+        // preserves the contiguous property.
+        //
+        // Once we're done looking for accelerated normal states, now we look
+        // for accelerated starting states by moving them to the beginning
+        // of the starting state range (just like we moved accelerated match
+        // states to the end of the matching state range).
+        //
+        // For a more detailed/different perspective on this, see the docs
+        // in dfa/special.rs.
+        if cnormal > 0 {
+            // our next available starting and normal states for swapping.
+            let mut next_start_id = self.special.min_start;
+            let mut cur_id = self.from_index(self.state_count() - 1);
+            // This is guaranteed to exist since cnormal > 0.
+            let mut next_norm_id =
+                self.tt.next_state_id(self.special.max_start);
+            while cur_id >= next_norm_id {
+                if let Some(accel) = accels.remove(&cur_id) {
+                    remapper.swap(self, next_start_id, cur_id);
+                    remapper.swap(self, next_norm_id, cur_id);
+                    // Keep our accelerator map updated with new IDs if the
+                    // states we swapped were also accelerated.
+                    if let Some(accel2) = accels.remove(&next_norm_id) {
+                        accels.insert(cur_id, accel2);
+                    }
+                    if let Some(accel2) = accels.remove(&next_start_id) {
+                        accels.insert(next_norm_id, accel2);
+                    }
+                    accels.insert(next_start_id, accel);
+                    update_special_accel(&mut self.special, next_start_id);
+                    // Our start range shifts one to the right now.
+                    self.special.min_start =
+                        self.tt.next_state_id(self.special.min_start);
+                    self.special.max_start =
+                        self.tt.next_state_id(self.special.max_start);
+                    next_start_id = self.tt.next_state_id(next_start_id);
+                    next_norm_id = self.tt.next_state_id(next_norm_id);
+                }
+                // This is pretty tricky, but if our 'next_norm_id' state also
+                // happened to be accelerated, then the result is that it is
+                // now in the position of cur_id, so we need to consider it
+                // again. This loop is still guaranteed to terminate though,
+                // because when accels contains cur_id, we're guaranteed to
+                // increment next_norm_id even if cur_id remains unchanged.
+                if !accels.contains_key(&cur_id) {
+                    cur_id = self.tt.prev_state_id(cur_id);
+                }
+            }
+        }
+        // Just like we did for match states, but we want to move accelerated
+        // start states to the beginning of the range instead of the end.
+        if cstart > 0 {
+            // N.B. special.{min,max}_start do not need updating, since the
+            // range/number of start states does not change at this point. Only
+            // the ordering of start states may change.
+            let mut next_id = self.special.min_start;
+            let mut cur_id = next_id;
+            while cur_id <= self.special.max_start {
+                if let Some(accel) = accels.remove(&cur_id) {
+                    remapper.swap(self, cur_id, next_id);
+                    accels.insert(next_id, accel);
+                    update_special_accel(&mut self.special, next_id);
+                    next_id = self.tt.next_state_id(next_id);
+                }
+                cur_id = self.tt.next_state_id(cur_id);
+            }
+        }
+
+        // Remap all transitions in our DFA and assert some things.
+        remapper.remap(self);
+        // This unwrap is OK because acceleration never changes the number of
+        // match states or patterns in those match states. Since acceleration
+        // runs after the pattern map has been set at least once, we know that
+        // our match states cannot error.
+        self.set_pattern_map(&new_matches).unwrap();
+        self.special.set_max();
+        self.special.validate().expect("special state ranges should validate");
+        self.special
+            .validate_state_count(self.state_count(), self.stride2())
+            .expect(
+                "special state ranges should be consistent with state count",
+            );
+        assert_eq!(
+            self.special.accel_len(self.stride()),
+            // We record the number of accelerated states initially detected
+            // since the accels map is itself mutated in the process above.
+            // If mutated incorrectly, its size may change, and thus can't be
+            // trusted as a source of truth of how many accelerated states we
+            // expected there to be.
+            original_accels_len,
+            "mismatch with expected number of accelerated states",
+        );
+
+        // And finally record our accelerators. We kept our accels map updated
+        // as we shuffled states above, so the accelerators should now
+        // correspond to a contiguous range in the state ID space. (Which we
+        // assert.)
+        let mut prev: Option<StateID> = None;
+        for (id, accel) in accels {
+            assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id));
+            prev = Some(id);
+            self.accels.add(accel);
+        }
+    }
+
+    /// Shuffle the states in this DFA so that starting states, match
+    /// states and accelerated states are all contiguous.
+    ///
+    /// `matches` maps the ID of every match state to the pattern IDs that
+    /// match in that state. Match states are moved to come right after the
+    /// dead and quit states, and start states are moved to come right after
+    /// the match states. The match and start ranges in `self.special` are
+    /// set accordingly.
+    ///
+    /// An error is returned if the match states cannot be rebuilt from the
+    /// shuffled pattern map. This panics if a start state is the dead state
+    /// or is also a match state.
+    ///
+    /// See dfa/special.rs for more details.
+    pub(crate) fn shuffle(
+        &mut self,
+        mut matches: BTreeMap<StateID, Vec<PatternID>>,
+    ) -> Result<(), Error> {
+        // The determinizer always adds a quit state and it is always second.
+        self.special.quit_id = self.from_index(1);
+        // If all we have are the dead and quit states, then we're done and
+        // the DFA will never produce a match.
+        if self.state_count() <= 2 {
+            self.special.set_max();
+            return Ok(());
+        }
+
+        // Collect all our start states into a convenient set and confirm there
+        // is no overlap with match states. In the classical DFA construction,
+        // start states can be match states. But because of look-around, we
+        // delay all matches by a byte, which prevents start states from being
+        // match states.
+        let mut is_start: BTreeSet<StateID> = BTreeSet::new();
+        for (start_id, _, _) in self.starts() {
+            // While there's nothing theoretically wrong with setting a start
+            // state to a dead ID (indeed, it could be an optimization!), the
+            // shuffling code below assumes that start states aren't dead. If
+            // this assumption is violated, the dead state could be shuffled
+            // to a new location, which must never happen. So if we do want
+            // to allow start states to be dead, then this assert should be
+            // removed and the code below fixed.
+            //
+            // N.B. Minimization can cause start states to be dead, but that
+            // happens after states are shuffled, so it's OK. Also, start
+            // states are dead for the DFA that never matches anything, but
+            // in that case, there are no states to shuffle.
+            assert_ne!(start_id, DEAD, "start state cannot be dead");
+            assert!(
+                !matches.contains_key(&start_id),
+                "{:?} is both a start and a match state, which is not allowed",
+                start_id,
+            );
+            is_start.insert(start_id);
+        }
+
+        // We implement shuffling by a sequence of pairwise swaps of states.
+        // Since we have a number of things referencing states via their
+        // IDs and swapping them changes their IDs, we need to record every
+        // swap we make so that we can remap IDs. The remapper handles this
+        // book-keeping for us.
+        let mut remapper = Remapper::from_dfa(self);
+
+        // Shuffle matching states.
+        if matches.is_empty() {
+            self.special.min_match = DEAD;
+            self.special.max_match = DEAD;
+        } else {
+            // The determinizer guarantees that the first two states are the
+            // dead and quit states, respectively. We want our match states to
+            // come right after quit.
+            let mut next_id = self.from_index(2);
+            let mut new_matches = BTreeMap::new();
+            self.special.min_match = next_id;
+            for (id, pids) in matches {
+                remapper.swap(self, next_id, id);
+                new_matches.insert(next_id, pids);
+                // If we swapped a start state, then update our set.
+                if is_start.contains(&next_id) {
+                    is_start.remove(&next_id);
+                    is_start.insert(id);
+                }
+                next_id = self.tt.next_state_id(next_id);
+            }
+            // From here on, 'matches' is keyed by the new match state IDs.
+            matches = new_matches;
+            self.special.max_match = cmp::max(
+                self.special.min_match,
+                self.tt.prev_state_id(next_id),
+            );
+        }
+
+        // Shuffle starting states.
+        {
+            // Start states begin right after the last match state, or right
+            // after the quit state when there are no match states.
+            let mut next_id = self.from_index(2);
+            if self.special.matches() {
+                next_id = self.tt.next_state_id(self.special.max_match);
+            }
+            self.special.min_start = next_id;
+            for id in is_start {
+                remapper.swap(self, next_id, id);
+                next_id = self.tt.next_state_id(next_id);
+            }
+            self.special.max_start = cmp::max(
+                self.special.min_start,
+                self.tt.prev_state_id(next_id),
+            );
+        }
+
+        // Finally remap all transitions in our DFA.
+        remapper.remap(self);
+        self.set_pattern_map(&matches)?;
+        self.special.set_max();
+        self.special.validate().expect("special state ranges should validate");
+        self.special
+            .validate_state_count(self.state_count(), self.stride2())
+            .expect(
+                "special state ranges should be consistent with state count",
+            );
+        Ok(())
+    }
+}
+
+/// A variety of generic internal methods for accessing DFA internals.
+impl<T: AsRef<[u32]>> DFA<T> {
+    /// Return the byte classes used by this DFA.
+    pub(crate) fn byte_classes(&self) -> &ByteClasses {
+        &self.tt.classes
+    }
+
+    /// Return the info about special states.
+    pub(crate) fn special(&self) -> &Special {
+        &self.special
+    }
+
+    /// Return the info about special states as a mutable borrow.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn special_mut(&mut self) -> &mut Special {
+        &mut self.special
+    }
+
+    /// Returns an iterator over all states in this DFA.
+    ///
+    /// This iterator yields a tuple for each state. The first element of the
+    /// tuple corresponds to a state's identifier, and the second element
+    /// corresponds to the state itself (comprised of its transitions).
+    pub(crate) fn states(&self) -> StateIter<'_, T> {
+        self.tt.states()
+    }
+
+    /// Return the total number of states in this DFA. Every DFA has at least
+    /// 1 state, even the empty DFA.
+    pub(crate) fn state_count(&self) -> usize {
+        self.tt.count()
+    }
+
+    /// Return an iterator over all pattern IDs for the given match state.
+    ///
+    /// If the given state is not a match state, then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
+        assert!(self.is_match_state(id));
+        self.ms.pattern_id_slice(self.match_state_index(id))
+    }
+
+    /// Return the total number of pattern IDs for the given match state.
+    ///
+    /// If the given state is not a match state, then this panics.
+    pub(crate) fn match_pattern_len(&self, id: StateID) -> usize {
+        assert!(self.is_match_state(id));
+        self.ms.pattern_len(self.match_state_index(id))
+    }
+
+    /// Returns the total number of patterns matched by this DFA.
+    pub(crate) fn pattern_count(&self) -> usize {
+        self.ms.patterns
+    }
+
+    /// Returns a map from match state ID to a list of pattern IDs that match
+    /// in that state.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
+        self.ms.to_map(self)
+    }
+
+    /// Returns the ID of the quit state for this DFA.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn quit_id(&self) -> StateID {
+        self.from_index(1)
+    }
+
+    /// Convert the given state identifier to the state's index. The state's
+    /// index corresponds to the position in which it appears in the transition
+    /// table. When a DFA is NOT premultiplied, then a state's identifier is
+    /// also its index. When a DFA is premultiplied, then a state's identifier
+    /// is equal to `index * alphabet_len`. This routine reverses that.
+    pub(crate) fn to_index(&self, id: StateID) -> usize {
+        self.tt.to_index(id)
+    }
+
+    /// Convert an index to a state (in the range 0..self.state_count()) to an
+    /// actual state identifier.
+    ///
+    /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+    /// to some other information (such as a remapped state ID).
+    #[cfg(feature = "alloc")]
+    pub(crate) fn from_index(&self, index: usize) -> StateID {
+        self.tt.from_index(index)
+    }
+
+    /// Return the table of state IDs for this DFA's start states.
+    pub(crate) fn starts(&self) -> StartStateIter<'_> {
+        self.st.iter()
+    }
+
+    /// Returns the index of the match state for the given ID. If the
+    /// given ID does not correspond to a match state, then this may
+    /// panic or produce an incorrect result.
+    fn match_state_index(&self, id: StateID) -> usize {
+        debug_assert!(self.is_match_state(id));
+        // This is one of the places where we rely on the fact that match
+        // states are contiguous in the transition table. Namely, that the
+        // first match state ID always corresponds to dfa.special.min_start.
+        // From there, since we know the stride, we can compute the overall
+        // index of any match state given the match state's ID.
+        let min = self.special().min_match.as_usize();
+        // CORRECTNESS: We're allowed to produce an incorrect result or panic,
+        // so both the subtraction and the unchecked StateID construction is
+        // OK.
+        self.to_index(StateID::new_unchecked(id.as_usize() - min))
+    }
+
+    /// Returns the index of the accelerator state for the given ID. If the
+    /// given ID does not correspond to an accelerator state, then this may
+    /// panic or produce an incorrect result.
+    fn accelerator_index(&self, id: StateID) -> usize {
+        let min = self.special().min_accel.as_usize();
+        // CORRECTNESS: We're allowed to produce an incorrect result or panic,
+        // so both the subtraction and the unchecked StateID construction is
+        // OK.
+        self.to_index(StateID::new_unchecked(id.as_usize() - min))
+    }
+
+    /// Return the accelerators for this DFA.
+    fn accels(&self) -> Accels<&[u32]> {
+        self.accels.as_ref()
+    }
+
+    /// Return this DFA's transition table as a slice.
+    fn trans(&self) -> &[StateID] {
+        self.tt.table()
+    }
+}
+
+impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "dense::DFA(")?;
+        for state in self.states() {
+            fmt_state_indicator(f, self, state.id())?;
+            let id = if f.alternate() {
+                state.id().as_usize()
+            } else {
+                self.to_index(state.id())
+            };
+            write!(f, "{:06?}: ", id)?;
+            state.fmt(f)?;
+            write!(f, "\n")?;
+        }
+        writeln!(f, "")?;
+        for (i, (start_id, sty, pid)) in self.starts().enumerate() {
+            let id = if f.alternate() {
+                start_id.as_usize()
+            } else {
+                self.to_index(start_id)
+            };
+            if i % self.st.stride == 0 {
+                match pid {
+                    None => writeln!(f, "START-GROUP(ALL)")?,
+                    Some(pid) => {
+                        writeln!(f, "START_GROUP(pattern: {:?})", pid)?
+                    }
+                }
+            }
+            writeln!(f, "  {:?} => {:06?}", sty, id)?;
+        }
+        if self.pattern_count() > 1 {
+            writeln!(f, "")?;
+            for i in 0..self.ms.count() {
+                let id = self.ms.match_state_id(self, i);
+                let id = if f.alternate() {
+                    id.as_usize()
+                } else {
+                    self.to_index(id)
+                };
+                write!(f, "MATCH({:06?}): ", id)?;
+                for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate()
+                {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    write!(f, "{:?}", pid)?;
+                }
+                writeln!(f, "")?;
+            }
+        }
+        writeln!(f, "state count: {:?}", self.state_count())?;
+        writeln!(f, "pattern count: {:?}", self.pattern_count())?;
+        writeln!(f, ")")?;
+        Ok(())
+    }
+}
+
+unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
+    #[inline]
+    fn is_special_state(&self, id: StateID) -> bool {
+        self.special.is_special_state(id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: StateID) -> bool {
+        self.special.is_dead_state(id)
+    }
+
+    #[inline]
+    fn is_quit_state(&self, id: StateID) -> bool {
+        self.special.is_quit_state(id)
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: StateID) -> bool {
+        self.special.is_match_state(id)
+    }
+
+    #[inline]
+    fn is_start_state(&self, id: StateID) -> bool {
+        self.special.is_start_state(id)
+    }
+
+    #[inline]
+    fn is_accel_state(&self, id: StateID) -> bool {
+        self.special.is_accel_state(id)
+    }
+
+    #[inline]
+    fn next_state(&self, current: StateID, input: u8) -> StateID {
+        let input = self.byte_classes().get(input);
+        let o = current.as_usize() + usize::from(input);
+        self.trans()[o]
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(
+        &self,
+        current: StateID,
+        input: u8,
+    ) -> StateID {
+        let input = self.byte_classes().get_unchecked(input);
+        let o = current.as_usize() + usize::from(input);
+        *self.trans().get_unchecked(o)
+    }
+
+    #[inline]
+    fn next_eoi_state(&self, current: StateID) -> StateID {
+        let eoi = self.byte_classes().eoi().as_usize();
+        let o = current.as_usize() + eoi;
+        self.trans()[o]
+    }
+
+    #[inline]
+    fn pattern_count(&self) -> usize {
+        self.ms.patterns
+    }
+
+    #[inline]
+    fn match_count(&self, id: StateID) -> usize {
+        self.match_pattern_len(id)
+    }
+
+    #[inline]
+    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
+        // This is an optimization for the very common case of a DFA with a
+        // single pattern. This conditional avoids a somewhat more costly path
+        // that finds the pattern ID from the state machine, which requires
+        // a bit of slicing/pointer-chasing. This optimization tends to only
+        // matter when matches are frequent.
+        if self.ms.patterns == 1 {
+            return PatternID::ZERO;
+        }
+        let state_index = self.match_state_index(id);
+        self.ms.pattern_id(state_index, match_index)
+    }
+
+    #[inline]
+    fn start_state_forward(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID {
+        let index = Start::from_position_fwd(bytes, start, end);
+        self.st.start(index, pattern_id)
+    }
+
+    #[inline]
+    fn start_state_reverse(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID {
+        let index = Start::from_position_rev(bytes, start, end);
+        self.st.start(index, pattern_id)
+    }
+
+    #[inline(always)]
+    fn accelerator(&self, id: StateID) -> &[u8] {
+        if !self.is_accel_state(id) {
+            return &[];
+        }
+        self.accels.needles(self.accelerator_index(id))
+    }
+}
+
+/// The transition table portion of a dense DFA.
+///
+/// The transition table is the core part of the DFA in that it describes how
+/// to move from one state to another based on the input sequence observed.
+#[derive(Clone)]
+pub(crate) struct TransitionTable<T> {
+    /// A contiguous region of memory representing the transition table in
+    /// row-major order. The representation is dense. That is, every state
+    /// has precisely the same number of transitions. The maximum number of
+    /// transitions per state is 257 (256 for each possible byte value, plus 1
+    /// for the special EOI transition). If a DFA has been instructed to use
+    /// byte classes (the default), then the number of transitions is usually
+    /// substantially fewer.
+    ///
+    /// State IDs are premultiplied by the stride. That is, a state's ID is
+    /// the offset in this table at which its transitions begin, rather than
+    /// the index of the state itself. (See the comments in `add_empty_state`
+    /// for the rationale.)
+    ///
+    /// In practice, T is either `Vec<u32>` or `&[u32]`.
+    table: T,
+    /// A set of equivalence classes, where a single equivalence class
+    /// represents a set of bytes that never discriminate between a match
+    /// and a non-match in the DFA. Each equivalence class corresponds to a
+    /// single character in this DFA's alphabet, where the maximum number of
+    /// characters is 257 (each possible value of a byte plus the special
+    /// EOI transition). Consequently, the number of equivalence classes
+    /// corresponds to the number of transitions for each DFA state. Note
+    /// though that the *space* used by each DFA state in the transition table
+    /// may be larger. The total space used by each DFA state is known as the
+    /// stride.
+    ///
+    /// The only time the number of equivalence classes is fewer than 257 is if
+    /// the DFA's kind uses byte classes (which is the default). Equivalence
+    /// classes should generally only be disabled when debugging, so that
+    /// the transitions themselves aren't obscured. Disabling them has no
+    /// other benefit, since the equivalence class map is always used while
+    /// searching. In the vast majority of cases, the number of equivalence
+    /// classes is substantially smaller than 257, particularly when large
+    /// Unicode classes aren't used.
+    classes: ByteClasses,
+    /// The stride of each DFA state, expressed as a power-of-two exponent.
+    ///
+    /// The stride of a DFA corresponds to the total amount of space used by
+    /// each DFA state in the transition table. This may be bigger than the
+    /// size of a DFA's alphabet, since the stride is always the smallest
+    /// power of two greater than or equal to the alphabet size.
+    ///
+    /// While this wastes space, this avoids the need for integer division
+    /// to convert between premultiplied state IDs and their corresponding
+    /// indices. Instead, we can use simple bit-shifts.
+    ///
+    /// See the docs for the `stride2` method for more details.
+    ///
+    /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+    /// while the maximum `stride2` value is `9` (corresponding to a stride of
+    /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
+    /// when accounting for the special EOI transition. However, an alphabet
+    /// length of that size is exceptionally rare since the alphabet is shrunk
+    /// into equivalence classes.
+    ///
+    /// Deserialization (`from_bytes_unchecked`) rejects any value outside of
+    /// the range `1..=9`.
+    stride2: usize,
+}
+
+impl<'a> TransitionTable<&'a [u32]> {
+    /// Deserialize a transition table starting at the beginning of `slice`.
+    /// Upon success, return the total number of bytes read along with the
+    /// transition table.
+    ///
+    /// If there was a problem deserializing any part of the transition table,
+    /// then this returns an error. Notably, if the given slice does not have
+    /// the same alignment as `StateID`, then this will return an error (among
+    /// other possible errors).
+    ///
+    /// This is guaranteed to execute in constant time.
+    ///
+    /// # Safety
+    ///
+    /// This routine is not safe because it does not check the validity of the
+    /// transition table itself. In particular, the transition table can be
+    /// quite large, so checking its validity can be somewhat expensive. An
+    /// invalid transition table is not safe because other code may rely on the
+    /// transition table being correct (such as explicit bounds check elision).
+    /// Therefore, an invalid transition table can lead to undefined behavior.
+    ///
+    /// Callers that use this function must either pass on the safety invariant
+    /// or guarantee that the bytes given contain a valid transition table.
+    /// This guarantee is upheld by the bytes written by `write_to`.
+    unsafe fn from_bytes_unchecked(
+        mut slice: &'a [u8],
+    ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
+        // Remember where we started so that the number of bytes consumed can
+        // be computed at the end.
+        let slice_start = slice.as_ptr() as usize;
+
+        // The header is read in the same order that `write_to` emits it:
+        // state count, stride2 and then the byte class map.
+        let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?;
+        slice = &slice[nr..];
+
+        let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?;
+        slice = &slice[nr..];
+
+        let (classes, nr) = ByteClasses::from_bytes(slice)?;
+        slice = &slice[nr..];
+
+        // stride2 is the base-2 exponent of the stride. The biggest possible
+        // alphabet has 257 elements, for which the smallest sufficient power
+        // of two is 512 = 2^9. Anything bigger than 9 is therefore invalid.
+        if stride2 > 9 {
+            return Err(DeserializeError::generic(
+                "dense DFA has invalid stride2 (too big)",
+            ));
+        }
+        // It also cannot be zero, since even a DFA that never matches anything
+        // has a non-zero number of states with at least two equivalence
+        // classes: one for all 256 byte values and another for the EOI
+        // sentinel.
+        if stride2 < 1 {
+            return Err(DeserializeError::generic(
+                "dense DFA has invalid stride2 (too small)",
+            ));
+        }
+        // This is OK since 1 <= stride2 <= 9.
+        let stride =
+            1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
+        // The alphabet length (determined by the byte class map) cannot be
+        // bigger than the stride (total space used by each DFA state).
+        if classes.alphabet_len() > stride {
+            return Err(DeserializeError::generic(
+                "alphabet size cannot be bigger than transition table stride",
+            ));
+        }
+
+        // The total number of transitions is `count * stride`, and the table
+        // occupies `StateID::SIZE` bytes per transition. Both computations
+        // use fallible helpers (note the `?`), presumably so that a size that
+        // does not fit in a `usize` is reported as an error.
+        let trans_count =
+            bytes::shl(count, stride2, "dense table transition count")?;
+        let table_bytes_len = bytes::mul(
+            trans_count,
+            StateID::SIZE,
+            "dense table state byte count",
+        )?;
+        bytes::check_slice_len(slice, table_bytes_len, "transition table")?;
+        bytes::check_alignment::<StateID>(slice)?;
+        let table_bytes = &slice[..table_bytes_len];
+        slice = &slice[table_bytes_len..];
+        // SAFETY: Since StateID is always representable as a u32, all we need
+        // to do is ensure that we have the proper length and alignment. We've
+        // checked both above, so the cast below is safe.
+        //
+        // N.B. This is the only not-safe code in this function, so we mark
+        // it explicitly to call it out, even though it is technically
+        // superfluous.
+        #[allow(unused_unsafe)]
+        let table = unsafe {
+            core::slice::from_raw_parts(
+                table_bytes.as_ptr() as *const u32,
+                trans_count,
+            )
+        };
+        let tt = TransitionTable { table, classes, stride2 };
+        // `slice` now begins just past the table, so the distance from the
+        // original start is the number of bytes read.
+        Ok((tt, slice.as_ptr() as usize - slice_start))
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl TransitionTable<Vec<u32>> {
+    /// Create a minimal transition table with just two states: a dead state
+    /// and a quit state. The alphabet length and stride of the transition
+    /// table is determined by the given set of equivalence classes.
+    fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
+        let mut tt = TransitionTable {
+            table: vec![],
+            classes,
+            stride2: classes.stride2(),
+        };
+        // Two states, regardless of alphabet size, can always fit into u32.
+        tt.add_empty_state().unwrap(); // dead state
+        tt.add_empty_state().unwrap(); // quit state
+        tt
+    }
+
+    /// Set a transition in this table. Both the `from` and `to` states must
+    /// already exist, otherwise this panics. `unit` should correspond to the
+    /// transition out of `from` to set to `to`.
+    fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
+        assert!(self.is_valid(from), "invalid 'from' state");
+        assert!(self.is_valid(to), "invalid 'to' state");
+        self.table[from.as_usize() + self.classes.get_by_unit(unit)] =
+            to.as_u32();
+    }
+
+    /// Add an empty state (a state where all transitions lead to a dead state)
+    /// and return its identifier. The identifier returned is guaranteed to
+    /// not point to any other existing state.
+    ///
+    /// If adding a state would exhaust the state identifier space, then this
+    /// returns an error.
+    fn add_empty_state(&mut self) -> Result<StateID, Error> {
+        // Normally, to get a fresh state identifier, we would just
+        // take the index of the next state added to the transition
+        // table. However, we actually perform an optimization here
+        // that premultiplies state IDs by the stride, such that they
+        // point immediately at the beginning of their transitions in
+        // the transition table. This avoids an extra multiplication
+        // instruction for state lookup at search time.
+        //
+        // Premultiplied identifiers means that instead of your matching
+        // loop looking something like this:
+        //
+        //   state = dfa.start
+        //   for byte in haystack:
+        //       next = dfa.transitions[state * stride + byte]
+        //       if dfa.is_match(next):
+        //           return true
+        //   return false
+        //
+        // it can instead look like this:
+        //
+        //   state = dfa.start
+        //   for byte in haystack:
+        //       next = dfa.transitions[state + byte]
+        //       if dfa.is_match(next):
+        //           return true
+        //   return false
+        //
+        // In other words, we save a multiplication instruction in the
+        // critical path. This turns out to be a decent performance win.
+        // The cost of using premultiplied state ids is that they can
+        // require a bigger state id representation. (And they also make
+        // the code a bit more complex, especially during minimization and
+        // when reshuffling states, as one needs to convert back and forth
+        // between state IDs and state indices.)
+        //
+        // To do this, we simply take the index of the state into the
+        // entire transition table, rather than the index of the state
+        // itself. e.g., If the stride is 64, then the ID of the 3rd state
+        // is 192, not 2.
+        let next = self.table.len();
+        let id = StateID::new(next).map_err(|_| Error::too_many_states())?;
+        self.table.extend(iter::repeat(0).take(self.stride()));
+        Ok(id)
+    }
+
+    /// Swap the two states given in this transition table.
+    ///
+    /// This routine does not do anything to check the correctness of this
+    /// swap. Callers must ensure that other states pointing to id1 and id2 are
+    /// updated appropriately.
+    ///
+    /// Both id1 and id2 must point to valid states, otherwise this panics.
+    fn swap(&mut self, id1: StateID, id2: StateID) {
+        assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
+        assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
+        // We only need to swap the parts of the state that are used. So if the
+        // stride is 64, but the alphabet length is only 33, then we save a lot
+        // of work.
+        for b in 0..self.classes.alphabet_len() {
+            self.table.swap(id1.as_usize() + b, id2.as_usize() + b);
+        }
+    }
+
+    /// Truncate the states in this transition table to the given count.
+    ///
+    /// This routine does not do anything to check the correctness of this
+    /// truncation. Callers must ensure that other states pointing to truncated
+    /// states are updated appropriately.
+    fn truncate(&mut self, count: usize) {
+        self.table.truncate(count << self.stride2);
+    }
+
+    /// Return a mutable representation of the state corresponding to the given
+    /// id. This is useful for implementing routines that manipulate DFA states
+    /// (e.g., swapping states).
+    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+        let alphabet_len = self.alphabet_len();
+        let i = id.as_usize();
+        StateMut {
+            id,
+            stride2: self.stride2,
+            transitions: &mut self.table_mut()[i..i + alphabet_len],
+        }
+    }
+}
+
+impl<T: AsRef<[u32]>> TransitionTable<T> {
+    /// Writes a serialized form of this transition table to the buffer given.
+    /// If the buffer is too small, then an error is returned. To determine
+    /// how big the buffer must be, use `write_to_len`.
+    fn write_to<E: Endian>(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("transition table"));
+        }
+        dst = &mut dst[..nwrite];
+
+        // write state count
+        // Unwrap is OK since number of states is guaranteed to fit in a u32.
+        E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write state stride (as power of 2)
+        // Unwrap is OK since stride2 is guaranteed to be <= 9.
+        E::write_u32(u32::try_from(self.stride2).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write byte class map
+        let n = self.classes.write_to(dst)?;
+        dst = &mut dst[n..];
+
+        // write actual transitions
+        for &sid in self.table() {
+            let n = bytes::write_state_id::<E>(sid, &mut dst);
+            dst = &mut dst[n..];
+        }
+        Ok(nwrite)
+    }
+
+    /// Returns the number of bytes the serialized form of this transition
+    /// table will use.
+    fn write_to_len(&self) -> usize {
+        size_of::<u32>()   // state count
+        + size_of::<u32>() // stride2
+        + self.classes.write_to_len()
+        + (self.table().len() * StateID::SIZE)
+    }
+
+    /// Validates that every state ID in this transition table is valid.
+    ///
+    /// That is, every state ID can be used to correctly index a state in this
+    /// table.
+    fn validate(&self) -> Result<(), DeserializeError> {
+        for state in self.states() {
+            for (_, to) in state.transitions() {
+                if !self.is_valid(to) {
+                    return Err(DeserializeError::generic(
+                        "found invalid state ID in transition table",
+                    ));
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Converts this transition table to a borrowed value.
+    fn as_ref(&self) -> TransitionTable<&'_ [u32]> {
+        TransitionTable {
+            table: self.table.as_ref(),
+            classes: self.classes.clone(),
+            stride2: self.stride2,
+        }
+    }
+
+    /// Converts this transition table to an owned value.
+    #[cfg(feature = "alloc")]
+    fn to_owned(&self) -> TransitionTable<Vec<u32>> {
+        TransitionTable {
+            table: self.table.as_ref().to_vec(),
+            classes: self.classes.clone(),
+            stride2: self.stride2,
+        }
+    }
+
+    /// Return the state for the given ID. If the given ID is not valid, then
+    /// this panics.
+    fn state(&self, id: StateID) -> State<'_> {
+        assert!(self.is_valid(id));
+
+        let i = id.as_usize();
+        State {
+            id,
+            stride2: self.stride2,
+            transitions: &self.table()[i..i + self.alphabet_len()],
+        }
+    }
+
+    /// Returns an iterator over all states in this transition table.
+    ///
+    /// This iterator yields a tuple for each state. The first element of the
+    /// tuple corresponds to a state's identifier, and the second element
+    /// corresponds to the state itself (comprised of its transitions).
+    fn states(&self) -> StateIter<'_, T> {
+        StateIter {
+            tt: self,
+            it: self.table().chunks(self.stride()).enumerate(),
+        }
+    }
+
+    /// Convert a state identifier to an index to a state (in the range
+    /// 0..self.count()).
+    ///
+    /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+    /// to some other information (such as a remapped state ID).
+    ///
+    /// If the given ID is not valid, then this may panic or produce an
+    /// incorrect index.
+    fn to_index(&self, id: StateID) -> usize {
+        id.as_usize() >> self.stride2
+    }
+
+    /// Convert an index to a state (in the range 0..self.count()) to an actual
+    /// state identifier.
+    ///
+    /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+    /// to some other information (such as a remapped state ID).
+    ///
+    /// If the given index is not in the specified range, then this may panic
+    /// or produce an incorrect state ID.
+    fn from_index(&self, index: usize) -> StateID {
+        // CORRECTNESS: If the given index is not valid, then it is not
+        // required for this to panic or return a valid state ID.
+        StateID::new_unchecked(index << self.stride2)
+    }
+
+    /// Returns the state ID for the state immediately following the one given.
+    ///
+    /// This does not check whether the state ID returned is invalid. In fact,
+    /// if the state ID given is the last state in this DFA, then the state ID
+    /// returned is guaranteed to be invalid.
+    #[cfg(feature = "alloc")]
+    fn next_state_id(&self, id: StateID) -> StateID {
+        self.from_index(self.to_index(id).checked_add(1).unwrap())
+    }
+
+    /// Returns the state ID for the state immediately preceding the one given.
+    ///
+    /// If the dead ID given (which is zero), then this panics.
+    #[cfg(feature = "alloc")]
+    fn prev_state_id(&self, id: StateID) -> StateID {
+        self.from_index(self.to_index(id).checked_sub(1).unwrap())
+    }
+
+    /// Returns the table as a slice of state IDs.
+    fn table(&self) -> &[StateID] {
+        let integers = self.table.as_ref();
+        // SAFETY: This is safe because StateID is guaranteed to be
+        // representable as a u32.
+        unsafe {
+            core::slice::from_raw_parts(
+                integers.as_ptr() as *const StateID,
+                integers.len(),
+            )
+        }
+    }
+
+    /// Returns the total number of states in this transition table.
+    ///
+    /// Note that a DFA always has at least two states: the dead and quit
+    /// states. In particular, the dead state always has ID 0 and is
+    /// correspondingly always the first state. The dead state is never a match
+    /// state.
+    fn count(&self) -> usize {
+        self.table().len() >> self.stride2
+    }
+
+    /// Returns the total stride for every state in this DFA. This corresponds
+    /// to the total number of transitions used by each state in this DFA's
+    /// transition table.
+    fn stride(&self) -> usize {
+        1 << self.stride2
+    }
+
+    /// Returns the total number of elements in the alphabet for this
+    /// transition table. This is always less than or equal to `self.stride()`.
+    /// It is only equal when the alphabet length is a power of 2. Otherwise,
+    /// it is always strictly less.
+    fn alphabet_len(&self) -> usize {
+        self.classes.alphabet_len()
+    }
+
+    /// Returns true if and only if the given state ID is valid for this
+    /// transition table. Validity in this context means that the given ID can
+    /// be used as a valid offset with `self.stride()` to index this transition
+    /// table.
+    fn is_valid(&self, id: StateID) -> bool {
+        let id = id.as_usize();
+        id < self.table().len() && id % self.stride() == 0
+    }
+
+    /// Return the memory usage, in bytes, of this transition table.
+    ///
+    /// This does not include the size of a `TransitionTable` value itself.
+    fn memory_usage(&self) -> usize {
+        self.table().len() * StateID::SIZE
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u32]>> TransitionTable<T> {
+    /// Returns the table as a slice of state IDs.
+    fn table_mut(&mut self) -> &mut [StateID] {
+        let integers = self.table.as_mut();
+        // SAFETY: This is safe because StateID is guaranteed to be
+        // representable as a u32.
+        unsafe {
+            core::slice::from_raw_parts_mut(
+                integers.as_mut_ptr() as *mut StateID,
+                integers.len(),
+            )
+        }
+    }
+}
+
+/// The set of all possible starting states in a DFA.
+///
+/// The set of starting states corresponds to the possible choices one can make
+/// in terms of starting a DFA. That is, before following the first transition,
+/// you first need to select the state that you start in.
+///
+/// Normally, a DFA converted from an NFA that has a single starting state
+/// would itself just have one starting state. However, our support for look
+/// around generally requires more starting states. The correct starting state
+/// is chosen based on certain properties of the position at which we begin
+/// our search.
+///
+/// Before listing those properties, we first must define two terms:
+///
+/// * `haystack` - The bytes to execute the search. The search always starts
+///   at the beginning of `haystack` and ends before or at the end of
+///   `haystack`.
+/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
+///   must be contained within `context` such that `context` is at least as big
+///   as `haystack`.
+///
+/// This split is crucial for dealing with look-around. For example, consider
+/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
+/// regex should _not_ match the haystack since `bar` does not appear at the
+/// beginning of the input. Similarly, the regex `\Bbar\B` should match the
+/// haystack because `bar` is not surrounded by word boundaries. But a search
+/// that does not take context into account would not permit `\B` to match
+/// since the beginning of any string matches a word boundary. Similarly, a
+/// search that does not take context into account when searching `^bar$` in
+/// the haystack `bar` would produce a match when it shouldn't.
+///
+/// Thus, it follows that the starting state is chosen based on the following
+/// criteria, derived from the position at which the search starts in the
+/// `context` (corresponding to the start of `haystack`):
+///
+/// 1. If the search starts at the beginning of `context`, then the `Text`
+///    start state is used. (Since `^` corresponds to
+///    `hir::Anchor::StartText`.)
+/// 2. If the search starts at a position immediately following a line
+///    terminator, then the `Line` start state is used. (Since `(?m:^)`
+///    corresponds to `hir::Anchor::StartLine`.)
+/// 3. If the search starts at a position immediately following a byte
+///    classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
+///    start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
+/// 4. Otherwise, if the search starts at a position immediately following
+///    a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
+///    then the `NonWordByte` start state is used. (Since `(?-u:\B)`
+///    corresponds to a not-word-boundary.)
+///
+/// (N.B. Unicode word boundaries are not supported by the DFA because they
+/// require multi-byte look-around and this is difficult to support in a DFA.)
+///
+/// To further complicate things, we also support constructing individual
+/// anchored start states for each pattern in the DFA. (Which is required to
+/// implement overlapping regexes correctly, but is also generally useful.)
+/// Thus, when individual start states for each pattern are enabled, then the
+/// total number of start states represented is `4 + (4 * #patterns)`, where
+/// the 4 comes from each of the 4 possibilities above. The first 4 represents
+/// the starting states for the entire DFA, which support searching for
+/// multiple patterns simultaneously (possibly unanchored).
+///
+/// If individual start states are disabled, then this will only store 4
+/// start states. Typically, individual start states are only enabled when
+/// constructing the reverse DFA for regex matching. But they are also useful
+/// for building DFAs that can search for a specific pattern or even to support
+/// both anchored and unanchored searches with the same DFA.
+///
+/// Note though that while the start table always has either `4` or
+/// `4 + (4 * #patterns)` starting state *ids*, the total number of states
+/// might be considerably smaller. That is, many of the IDs may be duplicative.
+/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no
+/// reason to generate a unique starting state for handling word boundaries.
+/// Similarly for start/end anchors.)
+#[derive(Clone)]
+pub(crate) struct StartTable<T> {
+    /// The initial start state IDs.
+    ///
+    /// In practice, T is either `Vec<u32>` or `&[u32]`.
+    ///
+    /// The first `stride` (currently always 4) entries always correspond to
+    /// the start states for the entire DFA. After that, there are
+    /// `stride * patterns` state IDs, where `patterns` may be zero in the
+    /// case of a DFA with no patterns or in the case where the DFA was built
+    /// without enabling starting states for each pattern.
+    ///
+    /// The total length is therefore always `stride + (stride * patterns)`.
+    table: T,
+    /// The number of starting state IDs per pattern.
+    ///
+    /// This is always equal to `Start::count()`. Deserialization rejects any
+    /// other value.
+    stride: usize,
+    /// The total number of patterns for which starting states are encoded.
+    /// This may be zero for non-empty DFAs when the DFA was built without
+    /// start states for each pattern. Thus, one cannot use this field to
+    /// say how many patterns are in the DFA in all cases. It is specific to
+    /// how many patterns are represented in this start table.
+    ///
+    /// This never exceeds `PatternID::LIMIT`.
+    patterns: usize,
+}
+
+#[cfg(feature = "alloc")]
+impl StartTable<Vec<u32>> {
+    /// Create a valid set of start states all pointing to the dead state.
+    ///
+    /// When the corresponding DFA is constructed with start states for each
+    /// pattern, then `patterns` should be the number of patterns. Otherwise,
+    /// it should be zero.
+    ///
+    /// If the total table size could exceed the allocatable limit, then this
+    /// returns an error. In practice, this is unlikely to be able to occur,
+    /// since it's likely that allocation would have failed long before it got
+    /// to this point.
+    fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> {
+        assert!(patterns <= PatternID::LIMIT);
+        let stride = Start::count();
+        let pattern_starts_len = match stride.checked_mul(patterns) {
+            Some(x) => x,
+            None => return Err(Error::too_many_start_states()),
+        };
+        let table_len = match stride.checked_add(pattern_starts_len) {
+            Some(x) => x,
+            None => return Err(Error::too_many_start_states()),
+        };
+        if table_len > core::isize::MAX as usize {
+            return Err(Error::too_many_start_states());
+        }
+        let table = vec![DEAD.as_u32(); table_len];
+        Ok(StartTable { table, stride, patterns })
+    }
+}
+
+impl<'a> StartTable<&'a [u32]> {
+    /// Deserialize a table of start state IDs starting at the beginning of
+    /// `slice`. Upon success, return the total number of bytes read along with
+    /// the table of starting state IDs.
+    ///
+    /// If there was a problem deserializing any part of the starting IDs,
+    /// then this returns an error. Notably, if the given slice does not have
+    /// the same alignment as `StateID`, then this will return an error (among
+    /// other possible errors).
+    ///
+    /// This is guaranteed to execute in constant time.
+    ///
+    /// # Safety
+    ///
+    /// This routine is not safe because it does not check the validity of the
+    /// starting state IDs themselves. In particular, the number of starting
+    /// IDs can be of variable length, so it's possible that checking their
+    /// validity cannot be done in constant time. An invalid starting state
+    /// ID is not safe because other code may rely on the starting IDs being
+    /// correct (such as explicit bounds check elision). Therefore, an invalid
+    /// start ID can lead to undefined behavior.
+    ///
+    /// Callers that use this function must either pass on the safety invariant
+    /// or guarantee that the bytes given contain valid starting state IDs.
+    /// This guarantee is upheld by the bytes written by `write_to`.
+    unsafe fn from_bytes_unchecked(
+        mut slice: &'a [u8],
+    ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
+        // Remember where we started so that the number of bytes consumed can
+        // be computed at the end.
+        let slice_start = slice.as_ptr() as usize;
+
+        // The header is read in the same order that `write_to` emits it:
+        // the stride and then the number of patterns.
+        let (stride, nr) =
+            bytes::try_read_u32_as_usize(slice, "start table stride")?;
+        slice = &slice[nr..];
+
+        let (patterns, nr) =
+            bytes::try_read_u32_as_usize(slice, "start table patterns")?;
+        slice = &slice[nr..];
+
+        // The stride is not configurable. It must be exactly the value that
+        // this build uses for its own start tables.
+        if stride != Start::count() {
+            return Err(DeserializeError::generic(
+                "invalid starting table stride",
+            ));
+        }
+        if patterns > PatternID::LIMIT {
+            return Err(DeserializeError::generic(
+                "invalid number of patterns",
+            ));
+        }
+        // The number of start state IDs dedicated to individual patterns.
+        // This is zero when `patterns` is zero.
+        let pattern_table_size =
+            bytes::mul(stride, patterns, "invalid pattern count")?;
+        // Our start states always start with a single stride of start states
+        // for the entire automaton which permit it to match any pattern. What
+        // follows it are an optional set of start states for each pattern.
+        let start_state_count = bytes::add(
+            stride,
+            pattern_table_size,
+            "invalid 'any' pattern starts size",
+        )?;
+        // Each start state ID occupies `StateID::SIZE` bytes.
+        let table_bytes_len = bytes::mul(
+            start_state_count,
+            StateID::SIZE,
+            "pattern table bytes length",
+        )?;
+        bytes::check_slice_len(slice, table_bytes_len, "start ID table")?;
+        bytes::check_alignment::<StateID>(slice)?;
+        let table_bytes = &slice[..table_bytes_len];
+        slice = &slice[table_bytes_len..];
+        // SAFETY: Since StateID is always representable as a u32, all we need
+        // to do is ensure that we have the proper length and alignment. We've
+        // checked both above, so the cast below is safe.
+        //
+        // N.B. This is the only not-safe code in this function, so we mark
+        // it explicitly to call it out, even though it is technically
+        // superfluous.
+        #[allow(unused_unsafe)]
+        let table = unsafe {
+            core::slice::from_raw_parts(
+                table_bytes.as_ptr() as *const u32,
+                start_state_count,
+            )
+        };
+        let st = StartTable { table, stride, patterns };
+        // `slice` now begins just past the table, so the distance from the
+        // original start is the number of bytes read.
+        Ok((st, slice.as_ptr() as usize - slice_start))
+    }
+}
+
+impl<T: AsRef<[u32]>> StartTable<T> {
+    /// Writes a serialized form of this start table to the buffer given. If
+    /// the buffer is too small, then an error is returned. To determine how
+    /// big the buffer must be, use `write_to_len`.
+    fn write_to<E: Endian>(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small(
+                "starting table ids",
+            ));
+        }
+        dst = &mut dst[..nwrite];
+
+        // write stride
+        // Unwrap is OK since the stride is always 4 (currently).
+        E::write_u32(u32::try_from(self.stride).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+        // write pattern count
+        // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+        // write start IDs
+        for &sid in self.table() {
+            let n = bytes::write_state_id::<E>(sid, &mut dst);
+            dst = &mut dst[n..];
+        }
+        Ok(nwrite)
+    }
+
+    /// Returns the number of bytes the serialized form of this start ID table
+    /// will use.
+    fn write_to_len(&self) -> usize {
+        size_of::<u32>()   // stride
+        + size_of::<u32>() // # patterns
+        + (self.table().len() * StateID::SIZE)
+    }
+
+    /// Validates that every state ID in this start table is valid by checking
+    /// it against the given transition table (which must be for the same DFA).
+    ///
+    /// That is, every state ID can be used to correctly index a state.
+    fn validate(
+        &self,
+        tt: &TransitionTable<T>,
+    ) -> Result<(), DeserializeError> {
+        for &id in self.table() {
+            if !tt.is_valid(id) {
+                return Err(DeserializeError::generic(
+                    "found invalid starting state ID",
+                ));
+            }
+        }
+        Ok(())
+    }
+
+    /// Converts this start list to a borrowed value.
+    fn as_ref(&self) -> StartTable<&'_ [u32]> {
+        StartTable {
+            table: self.table.as_ref(),
+            stride: self.stride,
+            patterns: self.patterns,
+        }
+    }
+
+    /// Converts this start list to an owned value.
+    #[cfg(feature = "alloc")]
+    fn to_owned(&self) -> StartTable<Vec<u32>> {
+        StartTable {
+            table: self.table.as_ref().to_vec(),
+            stride: self.stride,
+            patterns: self.patterns,
+        }
+    }
+
+    /// Return the start state for the given start index and pattern ID. If the
+    /// pattern ID is None, then the corresponding start state for the entire
+    /// DFA is returned. If the pattern ID is not None, then the corresponding
+    /// starting state for the given pattern is returned. If this start table
+    /// does not have individual starting states for each pattern, then this
+    /// panics.
+    fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
+        let start_index = index.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                let pid = pid.as_usize();
+                assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+                self.stride + (self.stride * pid) + start_index
+            }
+        };
+        self.table()[index]
+    }
+
+    /// Returns an iterator over all start state IDs in this table.
+    ///
+    /// Each item is a triple of: start state ID, the start state type and the
+    /// pattern ID (if any).
+    fn iter(&self) -> StartStateIter<'_> {
+        StartStateIter { st: self.as_ref(), i: 0 }
+    }
+
+    /// Returns the table as a slice of state IDs.
+    fn table(&self) -> &[StateID] {
+        let integers = self.table.as_ref();
+        // SAFETY: This is safe because StateID is guaranteed to be
+        // representable as a u32.
+        unsafe {
+            core::slice::from_raw_parts(
+                integers.as_ptr() as *const StateID,
+                integers.len(),
+            )
+        }
+    }
+
+    /// Return the memory usage, in bytes, of this start list.
+    ///
+    /// This does not include the size of a `StartList` value itself.
+    fn memory_usage(&self) -> usize {
+        self.table().len() * StateID::SIZE
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u32]>> StartTable<T> {
+    /// Set the start state for the given index and pattern.
+    ///
+    /// If the pattern ID or state ID are not valid, then this will panic.
+    fn set_start(
+        &mut self,
+        index: Start,
+        pattern_id: Option<PatternID>,
+        id: StateID,
+    ) {
+        let start_index = index.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => self
+                .stride
+                .checked_mul(pid.as_usize())
+                .unwrap()
+                .checked_add(self.stride)
+                .unwrap()
+                .checked_add(start_index)
+                .unwrap(),
+        };
+        self.table_mut()[index] = id;
+    }
+
+    /// Returns the table as a mutable slice of state IDs.
+    fn table_mut(&mut self) -> &mut [StateID] {
+        let integers = self.table.as_mut();
+        // SAFETY: This is safe because StateID is guaranteed to be
+        // representable as a u32.
+        unsafe {
+            core::slice::from_raw_parts_mut(
+                integers.as_mut_ptr() as *mut StateID,
+                integers.len(),
+            )
+        }
+    }
+}
+
+/// An iterator over start state IDs.
+///
+/// This iterator yields a triple of start state ID, the start state type
+/// and the pattern ID (if any). The pattern ID is None for start states
+/// corresponding to the entire DFA and non-None for start states corresponding
+/// to a specific pattern. The latter only occurs when the DFA is compiled with
+/// start states for each pattern.
+pub(crate) struct StartStateIter<'a> {
+    /// A borrowed view of the start table being iterated over.
+    st: StartTable<&'a [u32]>,
+    /// The position in the start table of the next entry to yield.
+    i: usize,
+}
+
+impl<'a> Iterator for StartStateIter<'a> {
+    type Item = (StateID, Start, Option<PatternID>);
+
+    fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+        let i = self.i;
+        let table = self.st.table();
+        if i >= table.len() {
+            return None;
+        }
+        self.i += 1;
+
+        // This unwrap is okay since the stride of the starting state table
+        // must always match the number of start state types.
+        let start_type = Start::from_usize(i % self.st.stride).unwrap();
+        let pid = if i < self.st.stride {
+            None
+        } else {
+            Some(
+                PatternID::new((i - self.st.stride) / self.st.stride).unwrap(),
+            )
+        };
+        Some((table[i], start_type, pid))
+    }
+}
+
+/// This type represents the patterns that should be reported whenever a DFA
+/// enters a match state. This structure exists to support DFAs that search for
+/// matches for multiple regexes.
+///
+/// This structure relies on the fact that all match states in a DFA occur
+/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
+/// detailed breakdown of the representation.) Namely, when a match occurs, we
+/// know its state ID. Since we know the start and end of the contiguous region
+/// of match states, we can use that to compute the position at which the match
+/// state occurs. That in turn is used as an offset into this structure.
+#[derive(Clone, Debug)]
+struct MatchStates<T> {
+    /// Slices is a flattened sequence of pairs, where each pair points to a
+    /// sub-slice of pattern_ids. The first element of the pair is an offset
+    /// into pattern_ids and the second element of the pair is the number
+    /// of 32-bit pattern IDs starting at that position. That is, each pair
+    /// corresponds to a single DFA match state and its corresponding match
+    /// IDs. The number of pairs always corresponds to the number of distinct
+    /// DFA match states.
+    ///
+    /// In practice, T is either Vec<u32> or &[u32].
+    slices: T,
+    /// A flattened sequence of pattern IDs for each DFA match state. The only
+    /// way to correctly read this sequence is indirectly via `slices`.
+    ///
+    /// In practice, T is either Vec<u32> or &[u32].
+    pattern_ids: T,
+    /// The total number of unique patterns represented by these match states.
+    patterns: usize,
+}
+
+impl<'a> MatchStates<&'a [u32]> {
+    /// Deserializes match states from the beginning of `slice`, borrowing
+    /// the offset pairs and pattern IDs directly from it.
+    ///
+    /// The data is read in this order: the match state count (`u32`),
+    /// `2 * count` offset/length entries, the number of unique patterns
+    /// (`u32`), the number of pattern IDs (`u32`) and then that many pattern
+    /// IDs. On success, the match states are returned along with the total
+    /// number of bytes read from `slice`.
+    ///
+    /// An error is returned if any of the reads or length/alignment checks
+    /// below fail.
+    ///
+    /// Only lengths and alignment are checked here. The offsets, lengths and
+    /// pattern IDs themselves are not inspected by this routine.
+    /// NOTE(review): presumably callers are expected to run `validate`
+    /// afterwards (or otherwise trust the input) -- confirm against callers.
+    unsafe fn from_bytes_unchecked(
+        mut slice: &'a [u8],
+    ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
+        // Remember where we started so that the number of bytes consumed can
+        // be computed at the end.
+        let slice_start = slice.as_ptr() as usize;
+
+        // Read the total number of match states.
+        let (count, nr) =
+            bytes::try_read_u32_as_usize(slice, "match state count")?;
+        slice = &slice[nr..];
+
+        // Read the slice start/length pairs.
+        let pair_count = bytes::mul(2, count, "match state offset pairs")?;
+        let slices_bytes_len = bytes::mul(
+            pair_count,
+            PatternID::SIZE,
+            "match state slice offset byte length",
+        )?;
+        bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?;
+        bytes::check_alignment::<PatternID>(slice)?;
+        let slices_bytes = &slice[..slices_bytes_len];
+        slice = &slice[slices_bytes_len..];
+        // SAFETY: Since PatternID is always representable as a u32, all we
+        // need to do is ensure that we have the proper length and alignment.
+        // We've checked both above, so the cast below is safe.
+        //
+        // N.B. This is one of the few not-safe snippets in this function, so
+        // we mark it explicitly to call it out, even though it is technically
+        // superfluous.
+        #[allow(unused_unsafe)]
+        let slices = unsafe {
+            core::slice::from_raw_parts(
+                slices_bytes.as_ptr() as *const u32,
+                pair_count,
+            )
+        };
+
+        // Read the total number of unique pattern IDs (which is always 1 more
+        // than the maximum pattern ID in this automaton, since pattern IDs are
+        // handed out contiguously starting at 0).
+        let (patterns, nr) =
+            bytes::try_read_u32_as_usize(slice, "pattern count")?;
+        slice = &slice[nr..];
+
+        // Now read the pattern ID count. We don't need to store this
+        // explicitly, but we need it to know how many pattern IDs to read.
+        let (idcount, nr) =
+            bytes::try_read_u32_as_usize(slice, "pattern ID count")?;
+        slice = &slice[nr..];
+
+        // Read the actual pattern IDs.
+        let pattern_ids_len =
+            bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?;
+        bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
+        bytes::check_alignment::<PatternID>(slice)?;
+        let pattern_ids_bytes = &slice[..pattern_ids_len];
+        slice = &slice[pattern_ids_len..];
+        // SAFETY: Since PatternID is always representable as a u32, all we
+        // need to do is ensure that we have the proper length and alignment.
+        // We've checked both above, so the cast below is safe.
+        //
+        // N.B. This is one of the few not-safe snippets in this function, so
+        // we mark it explicitly to call it out, even though it is technically
+        // superfluous.
+        #[allow(unused_unsafe)]
+        let pattern_ids = unsafe {
+            core::slice::from_raw_parts(
+                pattern_ids_bytes.as_ptr() as *const u32,
+                idcount,
+            )
+        };
+
+        let ms = MatchStates { slices, pattern_ids, patterns };
+        // The number of bytes read is the distance between where the slice
+        // now begins and where it began on entry.
+        Ok((ms, slice.as_ptr() as usize - slice_start))
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl MatchStates<Vec<u32>> {
+    fn empty(pattern_count: usize) -> MatchStates<Vec<u32>> {
+        assert!(pattern_count <= PatternID::LIMIT);
+        MatchStates {
+            slices: vec![],
+            pattern_ids: vec![],
+            patterns: pattern_count,
+        }
+    }
+
+    fn new(
+        matches: &BTreeMap<StateID, Vec<PatternID>>,
+        pattern_count: usize,
+    ) -> Result<MatchStates<Vec<u32>>, Error> {
+        let mut m = MatchStates::empty(pattern_count);
+        for (_, pids) in matches.iter() {
+            let start = PatternID::new(m.pattern_ids.len())
+                .map_err(|_| Error::too_many_match_pattern_ids())?;
+            m.slices.push(start.as_u32());
+            // This is always correct since the number of patterns in a single
+            // match state can never exceed maximum number of allowable
+            // patterns. Why? Because a pattern can only appear once in a
+            // particular match state, by construction. (And since our pattern
+            // ID limit is one less than u32::MAX, we're guaranteed that the
+            // length fits in a u32.)
+            m.slices.push(u32::try_from(pids.len()).unwrap());
+            for &pid in pids {
+                m.pattern_ids.push(pid.as_u32());
+            }
+        }
+        m.patterns = pattern_count;
+        Ok(m)
+    }
+
+    fn new_with_map(
+        &self,
+        matches: &BTreeMap<StateID, Vec<PatternID>>,
+    ) -> Result<MatchStates<Vec<u32>>, Error> {
+        MatchStates::new(matches, self.patterns)
+    }
+}
+
+impl<T: AsRef<[u32]>> MatchStates<T> {
+    /// Writes a serialized form of these match states to the buffer given. If
+    /// the buffer is too small, then an error is returned. To determine how
+    /// big the buffer must be, use `write_to_len`.
+    fn write_to<E: Endian>(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("match states"));
+        }
+        dst = &mut dst[..nwrite];
+
+        // write state ID count
+        // Unwrap is OK since number of states is guaranteed to fit in a u32.
+        E::write_u32(u32::try_from(self.count()).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write slice offset pairs
+        for &pid in self.slices() {
+            let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+            dst = &mut dst[n..];
+        }
+
+        // write unique pattern ID count
+        // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write pattern ID count
+        // Unwrap is OK since we check at construction (and deserialization)
+        // that the number of patterns is representable as a u32.
+        E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write pattern IDs
+        for &pid in self.pattern_ids() {
+            let n = bytes::write_pattern_id::<E>(pid, &mut dst);
+            dst = &mut dst[n..];
+        }
+
+        Ok(nwrite)
+    }
+
+    /// Returns the number of bytes the serialized form of this transition
+    /// table will use.
+    fn write_to_len(&self) -> usize {
+        size_of::<u32>()   // match state count
+        + (self.slices().len() * PatternID::SIZE)
+        + size_of::<u32>() // unique pattern ID count
+        + size_of::<u32>() // pattern ID count
+        + (self.pattern_ids().len() * PatternID::SIZE)
+    }
+
+    /// Valides that the match state info is itself internally consistent and
+    /// consistent with the recorded match state region in the given DFA.
+    fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
+        if self.count() != dfa.special.match_len(dfa.stride()) {
+            return Err(DeserializeError::generic(
+                "match state count mismatch",
+            ));
+        }
+        for si in 0..self.count() {
+            let start = self.slices()[si * 2].as_usize();
+            let len = self.slices()[si * 2 + 1].as_usize();
+            if start >= self.pattern_ids().len() {
+                return Err(DeserializeError::generic(
+                    "invalid pattern ID start offset",
+                ));
+            }
+            if start + len > self.pattern_ids().len() {
+                return Err(DeserializeError::generic(
+                    "invalid pattern ID length",
+                ));
+            }
+            for mi in 0..len {
+                let pid = self.pattern_id(si, mi);
+                if pid.as_usize() >= self.patterns {
+                    return Err(DeserializeError::generic(
+                        "invalid pattern ID",
+                    ));
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Converts these match states back into their map form. This is useful
+    /// when shuffling states, as the normal MatchStates representation is not
+    /// amenable to easy state swapping. But with this map, to swap id1 and
+    /// id2, all you need to do is:
+    ///
+    /// if let Some(pids) = map.remove(&id1) {
+    ///     map.insert(id2, pids);
+    /// }
+    ///
+    /// Once shuffling is done, use MatchStates::new to convert back.
+    #[cfg(feature = "alloc")]
+    fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
+        let mut map = BTreeMap::new();
+        for i in 0..self.count() {
+            let mut pids = vec![];
+            for j in 0..self.pattern_len(i) {
+                pids.push(self.pattern_id(i, j));
+            }
+            map.insert(self.match_state_id(dfa, i), pids);
+        }
+        map
+    }
+
+    /// Converts these match states to a borrowed value.
+    fn as_ref(&self) -> MatchStates<&'_ [u32]> {
+        MatchStates {
+            slices: self.slices.as_ref(),
+            pattern_ids: self.pattern_ids.as_ref(),
+            patterns: self.patterns,
+        }
+    }
+
+    /// Converts these match states to an owned value.
+    #[cfg(feature = "alloc")]
+    fn to_owned(&self) -> MatchStates<Vec<u32>> {
+        MatchStates {
+            slices: self.slices.as_ref().to_vec(),
+            pattern_ids: self.pattern_ids.as_ref().to_vec(),
+            patterns: self.patterns,
+        }
+    }
+
+    /// Returns the match state ID given the match state index. (Where the
+    /// first match state corresponds to index 0.)
+    ///
+    /// This panics if there is no match state at the given index.
+    fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
+        assert!(dfa.special.matches(), "no match states to index");
+        // This is one of the places where we rely on the fact that match
+        // states are contiguous in the transition table. Namely, that the
+        // first match state ID always corresponds to dfa.special.min_start.
+        // From there, since we know the stride, we can compute the ID of any
+        // match state given its index.
+        let stride2 = u32::try_from(dfa.stride2()).unwrap();
+        let offset = index.checked_shl(stride2).unwrap();
+        let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
+        let sid = StateID::new(id).unwrap();
+        assert!(dfa.is_match_state(sid));
+        sid
+    }
+
+    /// Returns the pattern ID at the given match index for the given match
+    /// state.
+    ///
+    /// The match state index is the state index minus the state index of the
+    /// first match state in the DFA.
+    ///
+    /// The match index is the index of the pattern ID for the given state.
+    /// The index must be less than `self.pattern_len(state_index)`.
+    fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
+        self.pattern_id_slice(state_index)[match_index]
+    }
+
+    /// Returns the number of patterns in the given match state.
+    ///
+    /// The match state index is the state index minus the state index of the
+    /// first match state in the DFA.
+    fn pattern_len(&self, state_index: usize) -> usize {
+        self.slices()[state_index * 2 + 1].as_usize()
+    }
+
+    /// Returns all of the pattern IDs for the given match state index.
+    ///
+    /// The match state index is the state index minus the state index of the
+    /// first match state in the DFA.
+    fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
+        let start = self.slices()[state_index * 2].as_usize();
+        let len = self.pattern_len(state_index);
+        &self.pattern_ids()[start..start + len]
+    }
+
+    /// Returns the pattern ID offset slice of u32 as a slice of PatternID.
+    fn slices(&self) -> &[PatternID] {
+        let integers = self.slices.as_ref();
+        // SAFETY: This is safe because PatternID is guaranteed to be
+        // representable as a u32.
+        unsafe {
+            core::slice::from_raw_parts(
+                integers.as_ptr() as *const PatternID,
+                integers.len(),
+            )
+        }
+    }
+
+    /// Returns the total number of match states.
+    fn count(&self) -> usize {
+        assert_eq!(0, self.slices().len() % 2);
+        self.slices().len() / 2
+    }
+
+    /// Returns the pattern ID slice of u32 as a slice of PatternID.
+    fn pattern_ids(&self) -> &[PatternID] {
+        let integers = self.pattern_ids.as_ref();
+        // SAFETY: This is safe because PatternID is guaranteed to be
+        // representable as a u32.
+        unsafe {
+            core::slice::from_raw_parts(
+                integers.as_ptr() as *const PatternID,
+                integers.len(),
+            )
+        }
+    }
+
+    /// Return the memory usage, in bytes, of these match pairs.
+    fn memory_usage(&self) -> usize {
+        (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE
+    }
+}
+
+/// An iterator over all states in a DFA.
+///
+/// This iterator yields a `State` for each state. A `State` carries both the
+/// state's identifier and the state's transitions.
+///
+/// `'a` corresponds to the lifetime of the original DFA, `T` corresponds to
+/// the type of the transition table itself.
+pub(crate) struct StateIter<'a, T> {
+    /// The transition table from which each yielded state is looked up.
+    tt: &'a TransitionTable<T>,
+    /// The underlying enumerated chunk iterator. Only the index of each
+    /// chunk is used when yielding a state.
+    it: iter::Enumerate<slice::Chunks<'a, StateID>>,
+}
+
+impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
+    type Item = State<'a>;
+
+    fn next(&mut self) -> Option<State<'a>> {
+        self.it.next().map(|(index, _)| {
+            let id = self.tt.from_index(index);
+            self.tt.state(id)
+        })
+    }
+}
+
+/// An immutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+pub(crate) struct State<'a> {
+    /// The identifier of this state.
+    id: StateID,
+    /// The number of bits that transition state IDs are shifted right by
+    /// when this state is printed via `Debug` in non-alternate mode.
+    stride2: usize,
+    /// The transitions out of this state, one entry per alphabet unit.
+    transitions: &'a [StateID],
+}
+
+impl<'a> State<'a> {
+    /// Return an iterator over all transitions in this state. This yields
+    /// a number of transitions equivalent to the alphabet length of the
+    /// corresponding DFA.
+    ///
+    /// Each transition is represented by a tuple. The first element is
+    /// the input byte for that transition and the second element is the
+    /// transitions itself.
+    pub(crate) fn transitions(&self) -> StateTransitionIter<'_> {
+        StateTransitionIter {
+            len: self.transitions.len(),
+            it: self.transitions.iter().enumerate(),
+        }
+    }
+
+    /// Return an iterator over a sparse representation of the transitions in
+    /// this state. Only non-dead transitions are returned.
+    ///
+    /// The "sparse" representation in this case corresponds to a sequence of
+    /// triples. The first two elements of the triple comprise an inclusive
+    /// byte range while the last element corresponds to the transition taken
+    /// for all bytes in the range.
+    ///
+    /// This is somewhat more condensed than the classical sparse
+    /// representation (where you have an element for every non-dead
+    /// transition), but in practice, checking if a byte is in a range is very
+    /// cheap and using ranges tends to conserve quite a bit more space.
+    pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> {
+        StateSparseTransitionIter { dense: self.transitions(), cur: None }
+    }
+
+    /// Returns the identifier for this state.
+    pub(crate) fn id(&self) -> StateID {
+        self.id
+    }
+
+    /// Analyzes this state to determine whether it can be accelerated. If so,
+    /// it returns an accelerator that contains at least one byte.
+    #[cfg(feature = "alloc")]
+    fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
+        // We just try to add bytes to our accelerator. Once adding fails
+        // (because we've added too many bytes), then give up.
+        let mut accel = Accel::new();
+        for (class, id) in self.transitions() {
+            if id == self.id() {
+                continue;
+            }
+            for unit in classes.elements(class) {
+                if let Some(byte) = unit.as_u8() {
+                    if !accel.add(byte) {
+                        return None;
+                    }
+                }
+            }
+        }
+        if accel.is_empty() {
+            None
+        } else {
+            Some(accel)
+        }
+    }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        for (i, (start, end, id)) in self.sparse_transitions().enumerate() {
+            let index = if f.alternate() {
+                id.as_usize()
+            } else {
+                id.as_usize() >> self.stride2
+            };
+            if i > 0 {
+                write!(f, ", ")?;
+            }
+            if start == end {
+                write!(f, "{:?} => {:?}", start, index)?;
+            } else {
+                write!(f, "{:?}-{:?} => {:?}", start, end, index)?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// A mutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+#[cfg(feature = "alloc")]
+pub(crate) struct StateMut<'a> {
+    /// The identifier of this state.
+    id: StateID,
+    /// Passed along to `State` when this state is printed via `Debug`.
+    stride2: usize,
+    /// The mutable transitions out of this state.
+    transitions: &'a mut [StateID],
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> StateMut<'a> {
+    /// Return an iterator over all transitions in this state. This yields
+    /// a number of transitions equivalent to the alphabet length of the
+    /// corresponding DFA.
+    ///
+    /// Each transition is represented by a tuple. The first element is the
+    /// input byte for that transition and the second element is a mutable
+    /// reference to the transition itself.
+    pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> {
+        StateTransitionIterMut {
+            len: self.transitions.len(),
+            it: self.transitions.iter_mut().enumerate(),
+        }
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> fmt::Debug for StateMut<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(
+            &State {
+                id: self.id,
+                stride2: self.stride2,
+                transitions: self.transitions,
+            },
+            f,
+        )
+    }
+}
+
+/// An iterator over all transitions in a single DFA state. This yields
+/// a number of transitions equivalent to the alphabet length of the
+/// corresponding DFA.
+///
+/// Each transition is represented by a tuple. The first element is the input
+/// unit for that transition (either a byte or the special end-of-input unit)
+/// and the second element is the transition itself.
+#[derive(Debug)]
+pub(crate) struct StateTransitionIter<'a> {
+    /// The total number of transitions in the state. The transition at index
+    /// `len - 1` is reported with the end-of-input unit.
+    len: usize,
+    /// The underlying enumerated iterator over the state's transitions.
+    it: iter::Enumerate<slice::Iter<'a, StateID>>,
+}
+
+impl<'a> Iterator for StateTransitionIter<'a> {
+    type Item = (alphabet::Unit, StateID);
+
+    fn next(&mut self) -> Option<(alphabet::Unit, StateID)> {
+        self.it.next().map(|(i, &id)| {
+            let unit = if i + 1 == self.len {
+                alphabet::Unit::eoi(i)
+            } else {
+                let b = u8::try_from(i)
+                    .expect("raw byte alphabet is never exceeded");
+                alphabet::Unit::u8(b)
+            };
+            (unit, id)
+        })
+    }
+}
+
+/// A mutable iterator over all transitions in a DFA state.
+///
+/// Each transition is represented by a tuple. The first element is the
+/// input unit for that transition (either a byte or the special end-of-input
+/// unit) and the second element is a mutable reference to the transition
+/// itself.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub(crate) struct StateTransitionIterMut<'a> {
+    /// The total number of transitions in the state. The transition at index
+    /// `len - 1` is reported with the end-of-input unit.
+    len: usize,
+    /// The underlying enumerated iterator over the state's transitions.
+    it: iter::Enumerate<slice::IterMut<'a, StateID>>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for StateTransitionIterMut<'a> {
+    type Item = (alphabet::Unit, &'a mut StateID);
+
+    fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> {
+        self.it.next().map(|(i, id)| {
+            let unit = if i + 1 == self.len {
+                alphabet::Unit::eoi(i)
+            } else {
+                let b = u8::try_from(i)
+                    .expect("raw byte alphabet is never exceeded");
+                alphabet::Unit::u8(b)
+            };
+            (unit, id)
+        })
+    }
+}
+
+/// An iterator over all non-DEAD transitions in a single DFA state using a
+/// sparse representation.
+///
+/// Each transition is represented by a triple. The first two elements of the
+/// triple comprise an inclusive byte range while the last element corresponds
+/// to the transition taken for all bytes in the range.
+///
+/// As a convenience, this always returns `alphabet::Unit` values of the same
+/// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte,
+/// byte) and (EOI, EOI) values are yielded.
+#[derive(Debug)]
+pub(crate) struct StateSparseTransitionIter<'a> {
+    /// The dense iterator whose consecutive transitions are grouped into
+    /// ranges.
+    dense: StateTransitionIter<'a>,
+    /// The range currently being built, as (start, end, next state), or
+    /// `None` if no transition has been consumed yet or the final range has
+    /// already been taken.
+    cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>,
+}
+
+impl<'a> Iterator for StateSparseTransitionIter<'a> {
+    type Item = (alphabet::Unit, alphabet::Unit, StateID);
+
+    fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> {
+        while let Some((unit, next)) = self.dense.next() {
+            let (prev_start, prev_end, prev_next) = match self.cur {
+                Some(t) => t,
+                None => {
+                    self.cur = Some((unit, unit, next));
+                    continue;
+                }
+            };
+            if prev_next == next && !unit.is_eoi() {
+                self.cur = Some((prev_start, unit, prev_next));
+            } else {
+                self.cur = Some((unit, unit, next));
+                if prev_next != DEAD {
+                    return Some((prev_start, prev_end, prev_next));
+                }
+            }
+        }
+        if let Some((start, end, next)) = self.cur.take() {
+            if next != DEAD {
+                return Some((start, end, next));
+            }
+        }
+        None
+    }
+}
+
+/// An iterator over pattern IDs for a single match state.
+///
+/// This wraps a slice iterator and yields each pattern ID by value.
+#[derive(Debug)]
+pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>);
+
+impl<'a> Iterator for PatternIDIter<'a> {
+    type Item = PatternID;
+
+    fn next(&mut self) -> Option<PatternID> {
+        self.0.next().copied()
+    }
+}
+
+/// Remapper is an abstraction that manages the remapping of state IDs in a
+/// dense DFA. This is useful when one wants to shuffle states into different
+/// positions in the DFA.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` should be called, which will rewrite
+/// all pertinent transitions to updated state IDs.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+struct Remapper {
+    /// A map from the index of a state to its pre-multiplied identifier.
+    ///
+    /// When a state is swapped with another, then their corresponding
+    /// locations in this map are also swapped. Thus, its new position will
+    /// still point to its old pre-multiplied StateID.
+    ///
+    /// While there is a bit more to it, this then allows us to rewrite the
+    /// state IDs in a DFA's transition table in a single pass. This is done
+    /// by iterating over every ID in this map, then iterating over each
+    /// transition for the state at that ID and re-mapping the transition from
+    /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+    /// in this map where `old_id` *started*, and set it to where it ended up
+    /// after all swaps have been completed.
+    map: Vec<StateID>,
+}
+
+#[cfg(feature = "alloc")]
+impl Remapper {
+    fn from_dfa(dfa: &OwnedDFA) -> Remapper {
+        Remapper {
+            map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(),
+        }
+    }
+
+    fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) {
+        dfa.swap_states(id1, id2);
+        self.map.swap(dfa.to_index(id1), dfa.to_index(id2));
+    }
+
+    /// Finalizes all swaps recorded so far by rewriting the state IDs stored
+    /// in `dfa`: every transition of every state listed in the map, and every
+    /// entry of the `dfa.st` table, is replaced by its remapped ID.
+    ///
+    /// This consumes the remapper, so it is meant to be called once, after
+    /// the last call to `swap`.
+    fn remap(mut self, dfa: &mut OwnedDFA) {
+        // Update the map to account for states that have been swapped
+        // multiple times. For example, if (A, C) and (C, G) are swapped, then
+        // transitions previously pointing to A should now point to G. But if
+        // we don't update our map, they will erroneously be set to C. All we
+        // do is follow the swaps in our map until we see our original state
+        // ID.
+        let oldmap = self.map.clone();
+        for i in 0..dfa.state_count() {
+            let cur_id = dfa.from_index(i);
+            let mut new = oldmap[i];
+            // A state that maps to itself was never moved.
+            if cur_id == new {
+                continue;
+            }
+            // Chase the chain of swaps through the unmodified copy of the
+            // map until it leads back to this state's own ID. The last ID
+            // seen before that is where this state finally ended up.
+            loop {
+                let id = oldmap[dfa.to_index(new)];
+                if cur_id == id {
+                    self.map[i] = new;
+                    break;
+                }
+                new = id;
+            }
+        }
+
+        // To work around the borrow checker for converting state IDs to
+        // indices. We cannot borrow self while mutably iterating over a
+        // state's transitions. Otherwise, we'd just use dfa.to_index(..).
+        let stride2 = dfa.stride2();
+        let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
+
+        // Now that we've finished shuffling, we need to remap all of our
+        // transitions. We don't need to handle re-mapping accelerated states
+        // since `accels` is only populated after shuffling.
+        for &id in self.map.iter() {
+            for (_, next_id) in dfa.state_mut(id).iter_mut() {
+                *next_id = self.map[to_index(*next_id)];
+            }
+        }
+        // The IDs stored in the `dfa.st` table need the same treatment.
+        for start_id in dfa.st.table_mut().iter_mut() {
+            *start_id = self.map[to_index(*start_id)];
+        }
+    }
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn errors_with_unicode_word_boundary() {
+        let pattern = r"\b";
+        assert!(Builder::new().build(pattern).is_err());
+    }
+
+    #[test]
+    fn roundtrip_never_match() {
+        let dfa = DFA::never_match().unwrap();
+        let (buf, _) = dfa.to_bytes_native_endian();
+        let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
+
+        assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap());
+    }
+
+    #[test]
+    fn roundtrip_always_match() {
+        use crate::HalfMatch;
+
+        let dfa = DFA::always_match().unwrap();
+        let (buf, _) = dfa.to_bytes_native_endian();
+        let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
+
+        assert_eq!(
+            Some(HalfMatch::must(0, 0)),
+            dfa.find_leftmost_fwd(b"foo12345").unwrap()
+        );
+    }
+}
diff --git a/src/dfa/determinize.rs b/src/dfa/determinize.rs

new file mode 100644 (file)

index 0000000..6160348
--- /dev/null
+++ b/src/dfa/determinize.rs
@@ -0,0 +1,547 @@
+use alloc::{
+    collections::BTreeMap,
+    vec::{self, Vec},
+};
+
+use crate::{
+    dfa::{dense, Error, DEAD},
+    nfa::thompson,
+    util::{
+        self,
+        alphabet::{self, ByteSet},
+        determinize::{State, StateBuilderEmpty, StateBuilderNFA},
+        id::{PatternID, StateID},
+        matchtypes::MatchKind,
+        sparse_set::{SparseSet, SparseSets},
+        start::Start,
+    },
+};
+
+/// A builder for configuring and running a DFA determinizer.
+#[derive(Clone, Debug)]
+pub(crate) struct Config {
+    /// When true, the NFA's anchored start state is used to start the DFA.
+    /// Otherwise, the NFA's unanchored start state is used.
+    anchored: bool,
+    /// The match semantics used during determinization.
+    match_kind: MatchKind,
+    /// The bytes that send every DFA state to the quit state.
+    quit: ByteSet,
+    /// An optional limit, in bytes, on the memory usage reported by the DFA
+    /// being built. `None` means no limit is enforced.
+    dfa_size_limit: Option<usize>,
+    /// An optional limit, in bytes, on the memory usage reported by the
+    /// determinizer itself. `None` means no limit is enforced.
+    determinize_size_limit: Option<usize>,
+}
+
+impl Config {
+    /// Create a new default config for a determinizer. The determinizer may be
+    /// configured before calling `run`.
+    pub fn new() -> Config {
+        Config {
+            anchored: false,
+            match_kind: MatchKind::LeftmostFirst,
+            quit: ByteSet::empty(),
+            dfa_size_limit: None,
+            determinize_size_limit: None,
+        }
+    }
+
+    /// Run determinization on the given NFA and write the resulting DFA into
+    /// the one given. The DFA given should be initialized but otherwise empty.
+    /// "Initialized" means that it is setup to handle the NFA's byte classes,
+    /// number of patterns and whether to build start states for each pattern.
+    pub fn run(
+        &self,
+        nfa: &thompson::NFA,
+        dfa: &mut dense::OwnedDFA,
+    ) -> Result<(), Error> {
+        let dead = State::dead();
+        let quit = State::dead();
+        let mut cache = StateMap::default();
+        // We only insert the dead state here since its representation is
+        // identical to the quit state. And we never want anything pointing
+        // to the quit state other than specific transitions derived from the
+        // determinizer's configured "quit" bytes.
+        //
+        // We do put the quit state into 'builder_states' below. This ensures
+        // that a proper DFA state ID is allocated for it, and that no other
+        // DFA state uses the "location after the DEAD state." That is, it
+        // is assumed that the quit state is always the state immediately
+        // following the DEAD state.
+        cache.insert(dead.clone(), DEAD);
+
+        let runner = Runner {
+            config: self.clone(),
+            nfa,
+            dfa,
+            builder_states: alloc::vec![dead, quit],
+            cache,
+            memory_usage_state: 0,
+            sparses: SparseSets::new(nfa.len()),
+            stack: alloc::vec![],
+            scratch_state_builder: StateBuilderEmpty::new(),
+        };
+        runner.run()
+    }
+
+    /// Whether to build an anchored DFA or not. When disabled (the default),
+    /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
+    /// the anchored start state of the NFA is used to start the DFA.
+    pub fn anchored(&mut self, yes: bool) -> &mut Config {
+        self.anchored = yes;
+        self
+    }
+
+    /// The match semantics to use for determinization.
+    ///
+    /// MatchKind::All corresponds to the standard textbook construction.
+    /// All possible match states are represented in the DFA.
+    /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
+    /// simulate the match semantics of backtracking regex engines. Namely,
+    /// only a subset of match states are built, and dead states are used to
+    /// stop searches with an unanchored prefix.
+    ///
+    /// The default is MatchKind::LeftmostFirst.
+    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+        self.match_kind = kind;
+        self
+    }
+
+    /// The set of bytes to use that will cause the DFA to enter a quit state,
+    /// stop searching and return an error. By default, this is empty.
+    pub fn quit(&mut self, set: ByteSet) -> &mut Config {
+        self.quit = set;
+        self
+    }
+
+    /// The limit, in bytes of the heap, that the DFA is permitted to use. This
+    /// does not include the auxiliary heap storage used by determinization.
+    pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
+        self.dfa_size_limit = bytes;
+        self
+    }
+
+    /// The limit, in bytes of the heap, that determinization itself is allowed
+    /// to use. This does not include the size of the DFA being built.
+    pub fn determinize_size_limit(
+        &mut self,
+        bytes: Option<usize>,
+    ) -> &mut Config {
+        self.determinize_size_limit = bytes;
+        self
+    }
+}
+
+/// The actual implementation of determinization that converts an NFA to a DFA
+/// through powerset construction.
+///
+/// This determinizer roughly follows the typical powerset construction, where
+/// each DFA state is comprised of one or more NFA states. In the worst case,
+/// there is one DFA state for every possible combination of NFA states. In
+/// practice, this only happens in certain conditions, typically when there are
+/// bounded repetitions.
+///
+/// The main differences between this implementation and typical
+/// determinization are that this implementation delays matches by one state
+/// and hackily makes look-around work. Comments below attempt to explain this.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
+/// whichever is shorter.
+#[derive(Debug)]
+struct Runner<'a> {
+    /// The configuration used to initialize determinization.
+    config: Config,
+    /// The NFA we're converting into a DFA.
+    nfa: &'a thompson::NFA,
+    /// The DFA we're building.
+    dfa: &'a mut dense::OwnedDFA,
+    /// Each DFA state being built is defined as an *ordered* set of NFA
+    /// states, along with some meta facts about the ordered set of NFA states.
+    ///
+    /// This is never empty. The first state is always a dummy state such that
+    /// a state id == 0 corresponds to a dead state. The second state is always
+    /// the quit state.
+    ///
+    /// Why do we have states in both a `Vec` and in a cache map below?
+    /// Well, they serve two different roles based on access patterns.
+    /// `builder_states` is the canonical home of each state, and provides
+    /// constant random access by a DFA state's ID. The cache map below, on
+    /// the other hand, provides a quick way of searching for identical DFA
+    /// states by using the DFA state as a key in the map. Of course, we use
+    /// reference counting to avoid actually duplicating the state's data
+    /// itself. (Although this has never been benchmarked.) Note that the cache
+    /// map does not give us full minimization; it just lets us avoid some very
+    /// obvious redundant states.
+    ///
+    /// Note that the index into this Vec isn't quite the DFA's state ID.
+    /// Rather, it's just an index. To get the state ID, you have to multiply
+    /// it by the DFA's stride. That's done by self.dfa.from_index. And the
+    /// inverse is self.dfa.to_index.
+    ///
+    /// Moreover, DFA states don't usually retain the IDs assigned to them
+    /// by their position in this Vec. After determinization completes,
+    /// states are shuffled around to support other optimizations. See the
+    /// sibling 'special' module for more details on that. (The reason for
+    /// mentioning this is that if you print out the DFA for debugging during
+    /// determinization, and then print out the final DFA after it is fully
+    /// built, then the state IDs likely won't match up.)
+    builder_states: Vec<State>,
+    /// A cache of DFA states that already exist and can be easily looked up
+    /// via ordered sets of NFA states.
+    ///
+    /// See `builder_states` docs for why we store states in two different
+    /// ways.
+    cache: StateMap,
+    /// The memory usage, in bytes, used by builder_states and cache. We track
+    /// this as new states are added since states use a variable amount of
+    /// heap. Tracking this as we add states makes it possible to compute the
+    /// total amount of memory used by the determinizer in constant time.
+    memory_usage_state: usize,
+    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
+    /// These are reused throughout determinization. A bounded sparse set
+    /// gives us constant time insertion, membership testing and clearing.
+    sparses: SparseSets,
+    /// Scratch space for a stack of NFA states to visit, for depth first
+    /// visiting without recursion.
+    stack: Vec<StateID>,
+    /// Scratch space for storing an ordered sequence of NFA states, for
+    /// amortizing allocation. This is principally useful for when we avoid
+    /// adding a new DFA state since it already exists. In order to detect this
+    /// case though, we still need an ordered set of NFA state IDs. So we use
+    /// this space to stage that ordered set before we know whether we need to
+    /// create a new DFA state or not.
+    scratch_state_builder: StateBuilderEmpty,
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// ones based on FNV, have not yet been benchmarked.) Without std, a
+/// `BTreeMap` is used instead.
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, StateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = BTreeMap<State, StateID>;
+
+impl<'a> Runner<'a> {
+    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+    /// the chosen state identifier representation is too small), then an error
+    /// is returned.
+    fn run(mut self) -> Result<(), Error> {
+        if self.nfa.has_word_boundary_unicode()
+            && !self.config.quit.contains_range(0x80, 0xFF)
+        {
+            return Err(Error::unsupported_dfa_word_boundary_unicode());
+        }
+
+        // A sequence of "representative" bytes drawn from each equivalence
+        // class. These representative bytes are fed to the NFA to compute
+        // state transitions. This allows us to avoid re-computing state
+        // transitions for bytes that are guaranteed to produce identical
+        // results.
+        let representatives: Vec<alphabet::Unit> =
+            self.dfa.byte_classes().representatives().collect();
+        // The set of all DFA state IDs that still need to have their
+        // transitions set. We start by seeding this with all starting states.
+        let mut uncompiled = alloc::vec![];
+        self.add_all_starts(&mut uncompiled)?;
+        while let Some(dfa_id) = uncompiled.pop() {
+            for &unit in &representatives {
+                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
+                {
+                    continue;
+                }
+                // In many cases, the state we transition to has already been
+                // computed. 'cached_state' will do the minimal amount of work
+                // to check this, and if it exists, immediately return an
+                // already existing state ID.
+                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
+                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
+                // If the state ID we got back is newly created, then we need
+                // to compile it, so add it to our uncompiled frontier.
+                if is_new {
+                    uncompiled.push(next_dfa_id);
+                }
+            }
+        }
+        trace!(
+            "determinization complete, memory usage: {}, dense DFA size: {}",
+            self.memory_usage(),
+            self.dfa.memory_usage(),
+        );
+
+        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
+        // ID corresponds to a distinct regex pattern that matches in the state
+        // corresponding to the key.
+        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
+        self.cache.clear();
+        #[allow(unused_variables)]
+        let mut total_pat_count = 0;
+        for (i, state) in self.builder_states.into_iter().enumerate() {
+            if let Some(pat_ids) = state.match_pattern_ids() {
+                let id = self.dfa.from_index(i);
+                total_pat_count += pat_ids.len();
+                matches.insert(id, pat_ids);
+            }
+        }
+        log! {
+            use core::mem::size_of;
+            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
+            let pats = total_pat_count * size_of::<PatternID>();
+            let mem = (matches.len() * per_elem) + pats;
+            log::trace!("matches map built, memory usage: {}", mem);
+        }
+        // At this point, we shuffle the "special" states in the final DFA.
+        // This permits a DFA's match loop to detect a match condition (among
+        // other things) by merely inspecting the current state's identifier,
+        // and avoids the need for any additional auxiliary storage.
+        self.dfa.shuffle(matches)?;
+        Ok(())
+    }
+
+    /// Return the identifier for the next DFA state given an existing DFA
+    /// state and an input byte. If the next DFA state already exists, then
+    /// return its identifier from the cache. Otherwise, build the state, cache
+    /// it and return its identifier.
+    ///
+    /// This routine returns a boolean indicating whether a new state was
+    /// built. If a new state is built, then the caller needs to add it to its
+    /// frontier of uncompiled DFA states to compute transitions for.
+    fn cached_state(
+        &mut self,
+        dfa_id: StateID,
+        unit: alphabet::Unit,
+    ) -> Result<(StateID, bool), Error> {
+        // Compute the set of all reachable NFA states, including epsilons.
+        let empty_builder = self.get_state_builder();
+        let builder = util::determinize::next(
+            self.nfa,
+            self.config.match_kind,
+            &mut self.sparses,
+            &mut self.stack,
+            &self.builder_states[self.dfa.to_index(dfa_id)],
+            unit,
+            empty_builder,
+        );
+        self.maybe_add_state(builder)
+    }
+
+    /// Compute the set of DFA start states and add their identifiers in
+    /// 'dfa_state_ids' (no duplicates are added).
+    fn add_all_starts(
+        &mut self,
+        dfa_state_ids: &mut Vec<StateID>,
+    ) -> Result<(), Error> {
+        // Always add the (possibly unanchored) start states for matching any
+        // of the patterns in this DFA.
+        self.add_start_group(None, dfa_state_ids)?;
+        // We only need to compute anchored start states for each pattern if it
+        // was requested to do so.
+        if self.dfa.has_starts_for_each_pattern() {
+            for pid in PatternID::iter(self.dfa.pattern_count()) {
+                self.add_start_group(Some(pid), dfa_state_ids)?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Add a group of start states for the given match pattern ID. Any new
+    /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
+    /// pushed.)
+    ///
+    /// When pattern_id is None, then this will compile a group of unanchored
+    /// start states (if the DFA is unanchored). When the pattern_id is
+    /// present, then this will compile a group of anchored start states that
+    /// only match the given pattern.
+    fn add_start_group(
+        &mut self,
+        pattern_id: Option<PatternID>,
+        dfa_state_ids: &mut Vec<StateID>,
+    ) -> Result<(), Error> {
+        let nfa_start = match pattern_id {
+            Some(pid) => self.nfa.start_pattern(pid),
+            None if self.config.anchored => self.nfa.start_anchored(),
+            None => self.nfa.start_unanchored(),
+        };
+
+        // When compiling start states, we're careful not to build additional
+        // states that aren't necessary. For example, if the NFA has no word
+        // boundary assertion, then there's no reason to have distinct start
+        // states for 'NonWordByte' and 'WordByte' starting configurations.
+        // Instead, the 'WordByte' starting configuration can just point
+        // directly to the start state for the 'NonWordByte' config.
+
+        let (id, is_new) =
+            self.add_one_start(nfa_start, Start::NonWordByte)?;
+        self.dfa.set_start_state(Start::NonWordByte, pattern_id, id);
+        if is_new {
+            dfa_state_ids.push(id);
+        }
+
+        if !self.nfa.has_word_boundary() {
+            self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+        } else {
+            let (id, is_new) =
+                self.add_one_start(nfa_start, Start::WordByte)?;
+            self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+            if is_new {
+                dfa_state_ids.push(id);
+            }
+        }
+        if !self.nfa.has_any_anchor() {
+            self.dfa.set_start_state(Start::Text, pattern_id, id);
+            self.dfa.set_start_state(Start::Line, pattern_id, id);
+        } else {
+            let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
+            self.dfa.set_start_state(Start::Text, pattern_id, id);
+            if is_new {
+                dfa_state_ids.push(id);
+            }
+
+            let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?;
+            self.dfa.set_start_state(Start::Line, pattern_id, id);
+            if is_new {
+                dfa_state_ids.push(id);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Add a new DFA start state corresponding to the given starting NFA
+    /// state, and the starting search configuration. (The starting search
+    /// configuration essentially tells us which look-behind assertions are
+    /// true for this particular state.)
+    ///
+    /// The boolean returned indicates whether the state ID returned is a newly
+    /// created state, or a previously cached state.
+    fn add_one_start(
+        &mut self,
+        nfa_start: StateID,
+        start: Start,
+    ) -> Result<(StateID, bool), Error> {
+        // Compute the look-behind assertions that are true in this starting
+        // configuration, and then determine the epsilon closure. While
+        // computing the epsilon closure, we only follow conditional epsilon
+        // transitions that satisfy the look-behind assertions recorded in
+        // the builder (read back below via 'look_have').
+        let mut builder_matches = self.get_state_builder().into_matches();
+        util::determinize::set_lookbehind_from_start(
+            &start,
+            &mut builder_matches,
+        );
+        // 'set1' is scratch space shared across calls, so it must be emptied
+        // before the closure is written into it.
+        self.sparses.set1.clear();
+        util::determinize::epsilon_closure(
+            self.nfa,
+            nfa_start,
+            *builder_matches.look_have(),
+            &mut self.stack,
+            &mut self.sparses.set1,
+        );
+        // Copy the NFA states collected in 'set1' into the state builder and
+        // then either reuse an identical cached state or add a new one.
+        let mut builder = builder_matches.into_nfa();
+        util::determinize::add_nfa_states(
+            &self.nfa,
+            &self.sparses.set1,
+            &mut builder,
+        );
+        self.maybe_add_state(builder)
+    }
+
+    /// Adds the given state to the DFA being built depending on whether it
+    /// already exists in this determinizer's cache.
+    ///
+    /// If it does exist, then the memory used by 'state' is put back into the
+    /// determinizer and the previously created state's ID is returned. (Along
+    /// with 'false', indicating that no new state was added.)
+    ///
+    /// If it does not exist, then the state is added to the DFA being built
+    /// and a fresh ID is allocated (if ID allocation fails, then an error is
+    /// returned) and returned. (Along with 'true', indicating that a new state
+    /// was added.)
+    fn maybe_add_state(
+        &mut self,
+        builder: StateBuilderNFA,
+    ) -> Result<(StateID, bool), Error> {
+        if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
+            // Since we have a cached state, put the constructed state's
+            // memory back into our scratch space, so that it can be reused.
+            self.put_state_builder(builder);
+            return Ok((cached_id, false));
+        }
+        self.add_state(builder).map(|sid| (sid, true))
+    }
+
+    /// Add the given state to the DFA and make it available in the cache.
+    ///
+    /// The state initially has no transitions. That is, it transitions to the
+    /// dead state for all possible inputs, and transitions to the quit state
+    /// for all quit bytes.
+    ///
+    /// If adding the state would exceed the maximum value for StateID, then an
+    /// error is returned.
+    fn add_state(
+        &mut self,
+        builder: StateBuilderNFA,
+    ) -> Result<StateID, Error> {
+        let id = self.dfa.add_empty_state()?;
+        if !self.config.quit.is_empty() {
+            for b in self.config.quit.iter() {
+                self.dfa.set_transition(
+                    id,
+                    alphabet::Unit::u8(b),
+                    self.dfa.quit_id(),
+                );
+            }
+        }
+        let state = builder.to_state();
+        // States use reference counting internally, so we only need to count
+        // their memroy usage once.
+        self.memory_usage_state += state.memory_usage();
+        self.builder_states.push(state.clone());
+        self.cache.insert(state, id);
+        self.put_state_builder(builder);
+        if let Some(limit) = self.config.dfa_size_limit {
+            if self.dfa.memory_usage() > limit {
+                return Err(Error::dfa_exceeded_size_limit(limit));
+            }
+        }
+        if let Some(limit) = self.config.determinize_size_limit {
+            if self.memory_usage() > limit {
+                return Err(Error::determinize_exceeded_size_limit(limit));
+            }
+        }
+        Ok(id)
+    }
+
+    /// Returns a state builder from this determinizer that might have existing
+    /// capacity. This helps avoid allocs in cases where a state is built that
+    /// turns out to already be cached.
+    ///
+    /// Callers must put the state builder back with 'put_state_builder',
+    /// otherwise the allocation reuse won't work.
+    fn get_state_builder(&mut self) -> StateBuilderEmpty {
+        core::mem::replace(
+            &mut self.scratch_state_builder,
+            StateBuilderEmpty::new(),
+        )
+    }
+
+    /// Puts the given state builder back into this determinizer for reuse.
+    ///
+    /// Note that building a 'State' from a builder always creates a new
+    /// alloc, so callers should always put the builder back.
+    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+        let _ = core::mem::replace(
+            &mut self.scratch_state_builder,
+            builder.clear(),
+        );
+    }
+
+    /// Return the memory usage, in bytes, of this determinizer at the current
+    /// point in time. This does not include memory used by the NFA or the
+    /// dense DFA itself.
+    fn memory_usage(&self) -> usize {
+        use core::mem::size_of;
+
+        self.builder_states.len() * size_of::<State>()
+        // Maps likely use more memory than this, but it's probably close.
+        + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
+        + self.memory_usage_state
+        + self.stack.capacity() * size_of::<StateID>()
+        + self.scratch_state_builder.capacity()
+    }
+}
diff --git a/src/dfa/error.rs b/src/dfa/error.rs

new file mode 100644 (file)

index 0000000..6497a4c
--- /dev/null
+++ b/src/dfa/error.rs
@@ -0,0 +1,162 @@
+use crate::{
+    nfa,
+    util::{
+        id::{PatternID, StateID},
+        start::Start,
+    },
+};
+
+/// An error that occurred during the construction of a DFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`nfa::thompson::Error`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building a DFA directly from a pattern string.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct Error {
+    /// The specific kind of failure. This is private, so callers can only
+    /// observe it through the `Display` and `source` implementations.
+    kind: ErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a DFA.
+///
+/// Note that this error is non-exhaustive. Adding new variants is not
+/// considered a breaking change.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+    /// An error that occurred while constructing an NFA as a precursor step
+    /// before a DFA is compiled.
+    NFA(nfa::thompson::Error),
+    /// An error that occurred because an unsupported regex feature was used.
+    /// The message string describes which unsupported feature was used.
+    ///
+    /// The primary regex feature that is unsupported by DFAs is the Unicode
+    /// word boundary look-around assertion (`\b`). This can be worked around
+    /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the
+    /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary)
+    /// option when building a DFA.
+    Unsupported(&'static str),
+    /// An error that occurs if too many states are produced while building a
+    /// DFA.
+    TooManyStates,
+    /// An error that occurs if too many start states are needed while building
+    /// a DFA.
+    ///
+    /// This is a kind of oddball error that occurs when building a DFA with
+    /// start states enabled for each pattern and enough patterns to cause
+    /// the table of start states to overflow `usize`.
+    TooManyStartStates,
+    /// This is another oddball error that can occur if there are too many
+    /// patterns spread out across too many match states.
+    TooManyMatchPatternIDs,
+    /// An error that occurs if the DFA got too big during determinization.
+    /// `limit` is the configured limit, in bytes, that was exceeded.
+    DFAExceededSizeLimit { limit: usize },
+    /// An error that occurs if auxiliary storage (not the DFA) used during
+    /// determinization got too big. `limit` is the configured limit, in
+    /// bytes, that was exceeded.
+    DeterminizeExceededSizeLimit { limit: usize },
+}
+
+impl Error {
+    /// Return the kind of this error.
+    fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    pub(crate) fn nfa(err: nfa::thompson::Error) -> Error {
+        Error { kind: ErrorKind::NFA(err) }
+    }
+
+    pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error {
+        let msg = "cannot build DFAs for regexes with Unicode word \
+                   boundaries; switch to ASCII word boundaries, or \
+                   heuristically enable Unicode word boundaries or use a \
+                   different regex engine";
+        Error { kind: ErrorKind::Unsupported(msg) }
+    }
+
+    pub(crate) fn too_many_states() -> Error {
+        Error { kind: ErrorKind::TooManyStates }
+    }
+
+    pub(crate) fn too_many_start_states() -> Error {
+        Error { kind: ErrorKind::TooManyStartStates }
+    }
+
+    pub(crate) fn too_many_match_pattern_ids() -> Error {
+        Error { kind: ErrorKind::TooManyMatchPatternIDs }
+    }
+
+    pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error {
+        Error { kind: ErrorKind::DFAExceededSizeLimit { limit } }
+    }
+
+    pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error {
+        Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self.kind() {
+            ErrorKind::NFA(ref err) => Some(err),
+            ErrorKind::Unsupported(_) => None,
+            ErrorKind::TooManyStates => None,
+            ErrorKind::TooManyStartStates => None,
+            ErrorKind::TooManyMatchPatternIDs => None,
+            ErrorKind::DFAExceededSizeLimit { .. } => None,
+            ErrorKind::DeterminizeExceededSizeLimit { .. } => None,
+        }
+    }
+}
+
+impl core::fmt::Display for Error {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.kind() {
+            ErrorKind::NFA(_) => write!(f, "error building NFA"),
+            ErrorKind::Unsupported(ref msg) => {
+                write!(f, "unsupported regex feature for DFAs: {}", msg)
+            }
+            ErrorKind::TooManyStates => write!(
+                f,
+                "number of DFA states exceeds limit of {}",
+                StateID::LIMIT,
+            ),
+            ErrorKind::TooManyStartStates => {
+                let stride = Start::count();
+                // The start table has `stride` entries for starting states for
+                // the entire DFA, and then `stride` entries for each pattern
+                // if start states for each pattern are enabled (which is the
+                // only way this error can occur). Thus, the total number of
+                // patterns that can fit in the table is `stride` less than
+                // what we can allocate.
+                let limit = ((core::isize::MAX as usize) - stride) / stride;
+                write!(
+                    f,
+                    "compiling DFA with start states exceeds pattern \
+                     pattern limit of {}",
+                    limit,
+                )
+            }
+            ErrorKind::TooManyMatchPatternIDs => write!(
+                f,
+                "compiling DFA with total patterns in all match states \
+                 exceeds limit of {}",
+                PatternID::LIMIT,
+            ),
+            ErrorKind::DFAExceededSizeLimit { limit } => write!(
+                f,
+                "DFA exceeded size limit of {:?} during determinization",
+                limit,
+            ),
+            ErrorKind::DeterminizeExceededSizeLimit { limit } => {
+                write!(f, "determinization exceeded size limit of {:?}", limit)
+            }
+        }
+    }
+}
diff --git a/src/dfa/minimize.rs b/src/dfa/minimize.rs

new file mode 100644 (file)

index 0000000..80e2f4e
--- /dev/null
+++ b/src/dfa/minimize.rs
@@ -0,0 +1,461 @@
+use core::{cell::RefCell, fmt, mem};
+
+use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
+
+use crate::{
+    dfa::{automaton::Automaton, dense, DEAD},
+    util::{
+        alphabet,
+        id::{PatternID, StateID},
+    },
+};
+
+/// An implementation of Hopcroft's algorithm for minimizing DFAs.
+///
+/// The algorithm implemented here is mostly taken from Wikipedia:
+/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
+///
+/// This code has had some light optimization attention paid to it,
+/// particularly in the form of reducing allocation as much as possible.
+/// However, it is still generally slow. Future optimization work should
+/// probably focus on the bigger picture rather than micro-optimizations. For
+/// example:
+///
+/// 1. Figure out how to more intelligently create initial partitions. That is,
+///    Hopcroft's algorithm starts by creating two partitions of DFA states
+///    that are known to NOT be equivalent: match states and non-match states.
+///    The algorithm proceeds by progressively refining these partitions into
+///    smaller partitions. If we could start with more partitions, then we
+///    could reduce the amount of work that Hopcroft's algorithm needs to do.
+/// 2. For every partition that we visit, we find all incoming transitions to
+///    every state in the partition for *every* element in the alphabet. (This
+///    is why using byte classes can significantly decrease minimization times,
+///    since byte classes shrink the alphabet.) This is quite costly and there
+///    is perhaps some redundant work being performed depending on the specific
+///    states in the set. For example, we might be able to only visit some
+///    elements of the alphabet based on the transitions.
+/// 3. Move parts of minimization into determinization. If minimization has
+///    fewer states to deal with, then it should run faster. A prime example
+///    of this might be large Unicode classes, which are generated in a way
+///    that can create a lot of redundant states. (Some work has been done on
+///    this point during NFA compilation via the algorithm described in the
+///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
+///    paper.)
+pub(crate) struct Minimizer<'a> {
+    /// The DFA being minimized. `run` rewrites it in place.
+    dfa: &'a mut dense::OwnedDFA,
+    /// Reverse transition table built by `incoming_transitions`:
+    /// `in_transitions[s][u]` lists the IDs of every state that transitions
+    /// to the state at index `s` on alphabet unit `u`.
+    in_transitions: Vec<Vec<Vec<StateID>>>,
+    /// The current partitioning of DFA states. Seeded by
+    /// `initial_partitions` and refined by `run`.
+    partitions: Vec<StateSet>,
+    /// The work list of state sets that `run` has yet to process. It starts
+    /// out as a copy of the initial partitions.
+    waiting: Vec<StateSet>,
+}
+
+impl<'a> fmt::Debug for Minimizer<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Minimizer")
+            .field("dfa", &self.dfa)
+            .field("in_transitions", &self.in_transitions)
+            .field("partitions", &self.partitions)
+            .field("waiting", &self.waiting)
+            .finish()
+    }
+}
+
+/// A set of states. A state set makes up a single partition in Hopcroft's
+/// algorithm.
+///
+/// It is represented by an ordered set of state identifiers. We use shared
+/// ownership so that a single state set can be in both the set of partitions
+/// and in the set of waiting sets simultaneously without an additional
+/// allocation. Generally, once a state set is built, it becomes immutable.
+///
+/// We use this representation because it avoids the overhead of more
+/// traditional set data structures (HashSet/BTreeSet), and also because
+/// computing intersection/subtraction on this representation is especially
+/// fast.
+///
+/// Note that the derived `Clone` only copies the shared handle, so a clone
+/// observes later mutations of the original. Use `deep_clone` to get an
+/// independent copy of the IDs.
+#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
+struct StateSet {
+    /// The state IDs in this set, behind a shared, interiorly mutable
+    /// handle.
+    ids: Rc<RefCell<Vec<StateID>>>,
+}
+
+impl<'a> Minimizer<'a> {
+    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
+        let in_transitions = Minimizer::incoming_transitions(dfa);
+        let partitions = Minimizer::initial_partitions(dfa);
+        let waiting = partitions.clone();
+        Minimizer { dfa, in_transitions, partitions, waiting }
+    }
+
+    /// Minimizes the DFA in place, consuming this minimizer.
+    ///
+    /// The partitions are first refined until the work list of waiting sets
+    /// is empty. The DFA is then rewritten so that it keeps one
+    /// representative state per partition, after which its start states,
+    /// match pattern map and special state ranges are remapped to the new
+    /// state IDs.
+    pub fn run(mut self) {
+        // A state's ID is its index shifted left by `stride2`. The two
+        // closures below convert between the two forms.
+        let stride2 = self.dfa.stride2();
+        let as_state_id = |index: usize| -> StateID {
+            StateID::new(index << stride2).unwrap()
+        };
+        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
+
+        // Scratch sets that are reused across iterations to avoid
+        // re-allocating them.
+        let mut incoming = StateSet::empty();
+        let mut scratch1 = StateSet::empty();
+        let mut scratch2 = StateSet::empty();
+        let mut newparts = vec![];
+
+        // This loop is basically Hopcroft's algorithm. Everything else is just
+        // shuffling data around to fit our representation.
+        while let Some(set) = self.waiting.pop() {
+            for b in self.dfa.byte_classes().iter() {
+                self.find_incoming_to(b, &set, &mut incoming);
+                // If incoming is empty, then the intersection with any other
+                // set must also be empty. So 'newparts' just ends up being
+                // 'self.partitions'. So there's no need to go through the loop
+                // below.
+                //
+                // This actually turns out to be a rather large optimization.
+                // On the order of making minimization 4-5x faster. It's likely
+                // that the vast majority of all states have very few incoming
+                // transitions.
+                if incoming.is_empty() {
+                    continue;
+                }
+
+                for p in 0..self.partitions.len() {
+                    // scratch1 = partition ∩ incoming. If it's empty, then
+                    // this partition is not split.
+                    self.partitions[p].intersection(&incoming, &mut scratch1);
+                    if scratch1.is_empty() {
+                        newparts.push(self.partitions[p].clone());
+                        continue;
+                    }
+
+                    // scratch2 = partition \ incoming. If it's empty, then
+                    // this partition is not split either.
+                    self.partitions[p].subtract(&incoming, &mut scratch2);
+                    if scratch2.is_empty() {
+                        newparts.push(self.partitions[p].clone());
+                        continue;
+                    }
+
+                    // The partition is split into two non-empty halves. The
+                    // scratch sets are reused, so take independent copies.
+                    let (x, y) =
+                        (scratch1.deep_clone(), scratch2.deep_clone());
+                    newparts.push(x.clone());
+                    newparts.push(y.clone());
+                    // If the partition being split is itself waiting, then
+                    // replace it with both halves. Otherwise, only the
+                    // smaller half is queued.
+                    match self.find_waiting(&self.partitions[p]) {
+                        Some(i) => {
+                            self.waiting[i] = x;
+                            self.waiting.push(y);
+                        }
+                        None => {
+                            if x.len() <= y.len() {
+                                self.waiting.push(x);
+                            } else {
+                                self.waiting.push(y);
+                            }
+                        }
+                    }
+                }
+                // Install the refined partitions and recycle the old vector's
+                // allocation for the next round.
+                newparts = mem::replace(&mut self.partitions, newparts);
+                newparts.clear();
+            }
+        }
+
+        // At this point, we now have a minimal partitioning of states, where
+        // each partition is an equivalence class of DFA states. Now we need to
+        // use this partitioning to update the DFA to only contain one state
+        // for each partition.
+
+        // Create a map from DFA state ID to the representative ID of the
+        // equivalence class to which it belongs. The representative ID of an
+        // equivalence class of states is the minimum ID in that class.
+        let mut state_to_part = vec![DEAD; self.dfa.state_count()];
+        for p in &self.partitions {
+            p.iter(|id| state_to_part[as_index(id)] = p.min());
+        }
+
+        // Generate a new contiguous sequence of IDs for minimal states, and
+        // create a map from equivalence IDs to the new IDs. Thus, the new
+        // minimal ID of *any* state in the unminimized DFA can be obtained
+        // with minimal_ids[state_to_part[old_id]].
+        let mut minimal_ids = vec![DEAD; self.dfa.state_count()];
+        let mut new_index = 0;
+        for state in self.dfa.states() {
+            if state_to_part[as_index(state.id())] == state.id() {
+                minimal_ids[as_index(state.id())] = as_state_id(new_index);
+                new_index += 1;
+            }
+        }
+        // The total number of states in the minimal DFA.
+        let minimal_count = new_index;
+        // Convenience function for remapping state IDs. This takes an old ID,
+        // looks up its Hopcroft partition and then maps that to the new ID
+        // range.
+        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];
+
+        // Re-map this DFA in place such that the only states remaining
+        // correspond to the representative states of every equivalence class.
+        for id in (0..self.dfa.state_count()).map(as_state_id) {
+            // If this state isn't a representative for an equivalence class,
+            // then we skip it since it won't appear in the minimal DFA.
+            if state_to_part[as_index(id)] != id {
+                continue;
+            }
+            for (_, next) in self.dfa.state_mut(id).iter_mut() {
+                *next = remap(*next);
+            }
+            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
+        }
+        // Trim off all unused states from the pre-minimized DFA. This
+        // represents all states that were merged into a non-singleton
+        // equivalence class of states, and appeared after the first state
+        // in each such class. (Because the state with the smallest ID in each
+        // equivalence class is its representative ID.)
+        self.dfa.truncate_states(minimal_count);
+
+        // Update the new start states, which is now just the minimal ID of
+        // whatever state the old start state was collapsed into. Also, we
+        // collect everything before-hand to work around the borrow checker.
+        // We're already allocating so much that this is probably fine. If this
+        // turns out to be costly, then I guess add a `starts_mut` iterator.
+        let starts: Vec<_> = self.dfa.starts().collect();
+        for (old_start_id, start_type, pid) in starts {
+            self.dfa.set_start_state(start_type, pid, remap(old_start_id));
+        }
+
+        // Update the match state pattern ID list for multi-regexes. All we
+        // need to do is remap the match state IDs. The pattern ID lists are
+        // always the same as they were since match states with distinct
+        // pattern ID lists are always considered distinct states.
+        let mut pmap = BTreeMap::new();
+        for (match_id, pattern_ids) in self.dfa.pattern_map() {
+            let new_id = remap(match_id);
+            pmap.insert(new_id, pattern_ids);
+        }
+        // This unwrap is OK because minimization never increases the number of
+        // match states or patterns in those match states. Since minimization
+        // runs after the pattern map has already been set at least once, we
+        // know that our match states cannot error.
+        self.dfa.set_pattern_map(&pmap).unwrap();
+
+        // In order to update the ID of the maximum match state, we need to
+        // find the maximum ID among all of the match states in the minimized
+        // DFA. This is not necessarily the new ID of the unminimized maximum
+        // match state, since that could have been collapsed with a much
+        // earlier match state. Therefore, to find the new max match state,
+        // we iterate over all previous match states, find their corresponding
+        // new minimal ID, and take the maximum of those.
+        let old = self.dfa.special().clone();
+        let new = self.dfa.special_mut();
+        // ... but only remap if we had match states.
+        if old.matches() {
+            new.min_match = StateID::MAX;
+            new.max_match = StateID::ZERO;
+            for i in as_index(old.min_match)..=as_index(old.max_match) {
+                let new_id = remap(as_state_id(i));
+                if new_id < new.min_match {
+                    new.min_match = new_id;
+                }
+                if new_id > new.max_match {
+                    new.max_match = new_id;
+                }
+            }
+        }
+        // ... same, but for start states.
+        if old.starts() {
+            new.min_start = StateID::MAX;
+            new.max_start = StateID::ZERO;
+            for i in as_index(old.min_start)..=as_index(old.max_start) {
+                let new_id = remap(as_state_id(i));
+                // Old start states that were remapped to the dead state do
+                // not contribute to the new start state range.
+                if new_id == DEAD {
+                    continue;
+                }
+                if new_id < new.min_start {
+                    new.min_start = new_id;
+                }
+                if new_id > new.max_start {
+                    new.max_start = new_id;
+                }
+            }
+            // If no start state survived the loop above, then max_start is
+            // still zero (DEAD), so reset min_start to match.
+            if new.max_start == DEAD {
+                new.min_start = DEAD;
+            }
+        }
+        new.quit_id = remap(new.quit_id);
+        new.set_max();
+    }
+
+    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
+        self.waiting.iter().position(|s| s == set)
+    }
+
+    fn find_incoming_to(
+        &self,
+        b: alphabet::Unit,
+        set: &StateSet,
+        incoming: &mut StateSet,
+    ) {
+        incoming.clear();
+        set.iter(|id| {
+            for &inid in
+                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
+            {
+                incoming.add(inid);
+            }
+        });
+        incoming.canonicalize();
+    }
+
+    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
+        // For match states, we know that two match states with different
+        // pattern ID lists will *always* be distinct, so we can partition them
+        // initially based on that.
+        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
+        let mut is_quit = StateSet::empty();
+        let mut no_match = StateSet::empty();
+        for state in dfa.states() {
+            if dfa.is_match_state(state.id()) {
+                let mut pids = vec![];
+                for i in 0..dfa.match_count(state.id()) {
+                    pids.push(dfa.match_pattern(state.id(), i));
+                }
+                matching
+                    .entry(pids)
+                    .or_insert(StateSet::empty())
+                    .add(state.id());
+            } else if dfa.is_quit_state(state.id()) {
+                is_quit.add(state.id());
+            } else {
+                no_match.add(state.id());
+            }
+        }
+
+        let mut sets: Vec<StateSet> =
+            matching.into_iter().map(|(_, set)| set).collect();
+        sets.push(no_match);
+        sets.push(is_quit);
+        sets
+    }
+
+    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
+        let mut incoming = vec![];
+        for _ in dfa.states() {
+            incoming.push(vec![vec![]; dfa.alphabet_len()]);
+        }
+        for state in dfa.states() {
+            for (b, next) in state.transitions() {
+                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
+            }
+        }
+        incoming
+    }
+}
+
+impl StateSet {
+    fn empty() -> StateSet {
+        StateSet { ids: Rc::new(RefCell::new(vec![])) }
+    }
+
+    fn add(&mut self, id: StateID) {
+        self.ids.borrow_mut().push(id);
+    }
+
+    fn min(&self) -> StateID {
+        self.ids.borrow()[0]
+    }
+
+    fn canonicalize(&mut self) {
+        self.ids.borrow_mut().sort();
+        self.ids.borrow_mut().dedup();
+    }
+
+    fn clear(&mut self) {
+        self.ids.borrow_mut().clear();
+    }
+
+    fn len(&self) -> usize {
+        self.ids.borrow().len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    fn deep_clone(&self) -> StateSet {
+        let ids = self.ids.borrow().iter().cloned().collect();
+        StateSet { ids: Rc::new(RefCell::new(ids)) }
+    }
+
+    fn iter<F: FnMut(StateID)>(&self, mut f: F) {
+        for &id in self.ids.borrow().iter() {
+            f(id);
+        }
+    }
+
+    fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
+        dest.clear();
+        if self.is_empty() || other.is_empty() {
+            return;
+        }
+
+        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
+        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
+        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
+        loop {
+            if a == b {
+                dest.add(a);
+                a = match ita.next() {
+                    None => break,
+                    Some(a) => a,
+                };
+                b = match itb.next() {
+                    None => break,
+                    Some(b) => b,
+                };
+            } else if a < b {
+                a = match ita.next() {
+                    None => break,
+                    Some(a) => a,
+                };
+            } else {
+                b = match itb.next() {
+                    None => break,
+                    Some(b) => b,
+                };
+            }
+        }
+    }
+
+    fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
+        dest.clear();
+        if self.is_empty() || other.is_empty() {
+            self.iter(|s| dest.add(s));
+            return;
+        }
+
+        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
+        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
+        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
+        loop {
+            if a == b {
+                a = match ita.next() {
+                    None => break,
+                    Some(a) => a,
+                };
+                b = match itb.next() {
+                    None => {
+                        dest.add(a);
+                        break;
+                    }
+                    Some(b) => b,
+                };
+            } else if a < b {
+                dest.add(a);
+                a = match ita.next() {
+                    None => break,
+                    Some(a) => a,
+                };
+            } else {
+                b = match itb.next() {
+                    None => {
+                        dest.add(a);
+                        break;
+                    }
+                    Some(b) => b,
+                };
+            }
+        }
+        for a in ita {
+            dest.add(a);
+        }
+    }
+}
diff --git a/src/dfa/mod.rs b/src/dfa/mod.rs

new file mode 100644 (file)

index 0000000..6f9fe60
--- /dev/null
+++ b/src/dfa/mod.rs
@@ -0,0 +1,363 @@
+/*!
+A module for building and searching with deterministic finite automata (DFAs).
+
+Like other modules in this crate, DFAs support a rich regex syntax with Unicode
+features. DFAs also have extensive options for configuring the best space vs
+time trade off for your use case and provide support for cheap deserialization
+of automata for use in `no_std` environments.
+
+If you're looking for lazy DFAs that build themselves incrementally during
+search, then please see the top-level [`hybrid` module](crate::hybrid).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using DFAs. This includes iterating over matches with both the start
+and end positions of each match.
+* A [`dense::DFA`] provides low level access to a DFA that uses a dense
+representation (uses lots of space, but fast searching).
+* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
+representation (uses less space, but slower searching).
+* An [`Automaton`] trait that defines an interface that both dense and sparse
+DFAs implement. (A `regex::Regex` is generic over this trait.)
+* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
+[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
+[`dense::DFA::from_bytes`]).
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(1, 0, 4),
+    MultiMatch::must(0, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or use overlapping style searches to find all possible occurrences:
+
+```
+use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}};
+
+// N.B. For overlapping searches, we need the underlying DFA to report all
+// possible matches.
+let re = Regex::builder()
+    .dense(dense::Config::new().match_kind(MatchKind::All))
+    .build_many(&[r"\w{3}", r"\S{3}"])?;
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> = re.find_overlapping_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(1, 0, 3),
+    MultiMatch::must(0, 1, 4),
+    MultiMatch::must(1, 1, 4),
+    MultiMatch::must(0, 5, 8),
+    MultiMatch::must(1, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: use sparse DFAs
+
+By default, compiling a regex will use dense DFAs internally. This uses more
+memory, but executes searches more quickly. If you can abide slower searches
+(somewhere around 3-5x), then sparse DFAs might make more sense since they can
+use significantly less space.
+
+Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
+`Regex::new`:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+If you already have dense DFAs for some reason, they can be converted to sparse
+DFAs and used to build a new `Regex`. For example:
+
+```
+use regex_automata::{MultiMatch, dfa::regex::Regex};
+
+let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let sparse_re = Regex::builder().build_from_dfas(
+    dense_re.forward().to_sparse()?,
+    dense_re.reverse().to_sparse()?,
+);
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = sparse_re.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: deserialize a DFA
+
+This shows how to first serialize a DFA into raw bytes, and then deserialize
+those raw bytes back into a DFA. While this particular example is a
+bit contrived, this same technique can be used in your program to
+deserialize a DFA at start up time or by memory mapping a file.
+
+```
+use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}};
+
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both the forward and reverse DFAs, see note below
+let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
+let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
+// now deserialize both---we need to specify the correct type!
+let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
+let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
+// finally, reconstruct our regex
+let re2 = Regex::builder().build_from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+There are a few points worth noting here:
+
+* We need to extract the raw DFAs used by the regex and serialize those. You
+can build the DFAs manually yourself using [`dense::Builder`], but using
+the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
+particular, a `Regex` constructs a reverse DFA for finding the starting
+location of matches.)
+* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
+In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
+or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
+deserializing your DFA from. If you intend to deserialize on either platform,
+then you'll need to serialize both and deserialize the right one depending on
+your target's endianness.
+* Safely deserializing a DFA requires verifying the raw bytes, particularly if
+they are untrusted, since an invalid DFA could cause logical errors, panics
+or even undefined behavior. This verification step requires visiting all of
+the transitions in the DFA, which can be costly. If cheaper verification is
+desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does
+verification that can be performed in constant time. However, one can only use
+this routine if the caller can guarantee that the bytes provided encode a
+valid DFA.
+
+The same process can be achieved with sparse DFAs as well:
+
+```
+use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}};
+
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both
+let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
+let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
+// now deserialize both---we need to specify the correct type!
+let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
+let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
+// finally, reconstruct our regex
+let re2 = Regex::builder().build_from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
+Conversely, dense DFAs must be aligned to the same alignment as a
+[`StateID`](crate::util::id::StateID).
+
+# Support for `no_std` and `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+When the `alloc` or `std` features are enabled, the API of this module will
+include the facilities necessary for compiling, serializing, deserializing
+and searching with DFAs. When only the `alloc` feature is enabled, then
+implementations of the `std::error::Error` trait are dropped, but everything
+else generally remains the same. When both the `alloc` and `std` features are
+disabled, the API of this module will shrink such that it only includes the
+facilities necessary for deserializing and searching with DFAs.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `alloc` or `std` features that compiles and
+serializes a regular expression. You may need to serialize both little and big
+endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+your previously serialized DFAs into regexes. You can then search with them as
+you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
+TODO: Include link to `regex-cli` here pointing out how to generate Rust code
+for deserializing DFAs.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the DFAs in this module:
+
+* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
+of them) can only find the offsets of an entire match, but cannot resolve
+the offsets of each capturing group. This is because DFAs do not have the
+expressive power necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dense::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on lazy DFAs.
+
+# Differences with general purpose regexes
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this module provides a lower level
+regular expression interface based exclusively on DFAs that is a bit less
+convenient while providing more explicit control over memory usage and search
+times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+of the regex pattern. While most patterns do not exhibit worst case exponential
+time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
+with approximately `2^(N+2)` states. For this reason, untrusted patterns should
+not be compiled with this module. (In the future, the API may expose an option
+to return an error if the DFA gets too big.)
+* This module does not support sub-match extraction via capturing groups, which
+can be achieved with the regex crate's "captures" API.
+* While the regex crate doesn't necessarily sport fast compilation times,
+the regexes in this module are almost universally slow to compile, especially
+when they contain large Unicode character classes. For example, on my system,
+compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
+a sparse regex takes about the same time but only uses about 1.2MB of
+memory.) Conversely, compiling the same regex without Unicode support, e.g.,
+`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
+reason, you should only use Unicode character classes if you absolutely need
+them! (They are enabled by default though.)
+* This module does not support Unicode word boundaries. ASCII word boundaries
+may be used though by disabling Unicode or selectively doing so in the syntax,
+e.g., `(?-u:\b)`. There is also an option to
+[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
+where the corresponding DFA will give up if any non-ASCII byte is seen.
+* As a lower level API, this module does not do literal optimizations
+automatically. Although it does provide hooks in its API to make use of the
+[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
+optimizations means that searches may run much slower than what you're
+accustomed to, although, it does provide more predictable and consistent
+performance.
+* There is no `&str` API like in the regex crate. In this module, all APIs
+operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8
+boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8),
+[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or
+[`regex::Config::utf8`] are disabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+deserialized. Deserialization can be done in constant time with the unchecked
+APIs, since searching can be performed directly on the raw serialized bytes of
+a DFA.
+* This module was specifically designed so that the searching phase of a
+DFA has minimal runtime requirements, and can therefore be used in `no_std`
+environments. While `no_std` environments cannot compile regexes, they can
+deserialize pre-compiled regexes.
+* Since this module builds DFAs ahead of time, it will generally out-perform
+the `regex` crate on equivalent tasks. The performance difference is likely
+not large. However, because of a complex set of optimizations in the regex
+crate (like literal optimizations), an accurate performance comparison may be
+difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+performance a small amount, but uses much less storage space. Potentially even
+less than what the regex crate uses.
+* This module exposes DFAs directly, such as [`dense::DFA`] and
+[`sparse::DFA`], which enables one to do less work in some cases. For example,
+if you only need the end of a match and not the start of a match, then you can
+use a DFA directly without building a `Regex`, which always requires a second
+DFA to find the start of a match.
+* This module provides more control over memory usage. Aside from choosing
+between dense and sparse DFAs, one can also choose a smaller state identifier
+representation to use less space. Also, one can enable DFA minimization
+via [`dense::Config::minimize`], but it can increase compilation times
+dramatically.
+*/
+
+pub use crate::dfa::automaton::{Automaton, OverlappingState};
+#[cfg(feature = "alloc")]
+pub use crate::dfa::error::Error;
+
+/// This is an alias for a state ID of zero. It has special significance
+/// because it always corresponds to the first state in a DFA, and the first
+/// state in a DFA is always "dead." That is, the dead state always has all
+/// of its transitions set to itself. Moreover, the dead state is used as a
+/// sentinel for various things. e.g., In search, reaching a dead state means
+/// that the search must stop.
+const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO;
+
+mod accel;
+mod automaton;
+pub mod dense;
+#[cfg(feature = "alloc")]
+mod determinize;
+#[cfg(feature = "alloc")]
+pub(crate) mod error;
+#[cfg(feature = "alloc")]
+mod minimize;
+pub mod regex;
+mod search;
+pub mod sparse;
+mod special;
+#[cfg(feature = "transducer")]
+mod transducer;
diff --git a/src/dfa/regex.rs b/src/dfa/regex.rs

new file mode 100644 (file)

index 0000000..d0917e1
--- /dev/null
+++ b/src/dfa/regex.rs
@@ -0,0 +1,2146 @@
+/*!
+A DFA-backed `Regex`.
+
+This module provides [`Regex`], which is defined generically over the
+[`Automaton`] trait. A `Regex` implements convenience routines you might have
+come to expect, such as finding the start/end of a match and iterating over
+all non-overlapping matches. This `Regex` type is limited in its capabilities
+to what a DFA can provide. Therefore, APIs involving capturing groups, for
+example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::dfa) for examples.
+*/
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+use crate::{
+    dfa::automaton::{Automaton, OverlappingState},
+    util::prefilter::{self, Prefilter},
+    MatchError, MultiMatch,
+};
+#[cfg(feature = "alloc")]
+use crate::{
+    dfa::{dense, error::Error, sparse},
+    nfa::thompson,
+    util::matchtypes::MatchKind,
+};
+
+// When the alloc feature is enabled, the regex type sets its A type parameter
+// to default to an owned dense DFA. But without alloc, we set no default. This
+// makes things a lot more convenient in the common case, since writing out the
+// DFA types is pretty annoying.
+//
+// Since we have two different definitions but only want to write one doc
+// string, we use a macro to capture the doc and other attributes once and then
+// repeat them for each definition.
+macro_rules! define_regex_type {
+    ($(#[$doc:meta])*) => {
+        #[cfg(feature = "alloc")]
+        $(#[$doc])*
+        pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> {
+            prefilter: Option<P>,
+            forward: A,
+            reverse: A,
+            utf8: bool,
+        }
+
+        #[cfg(not(feature = "alloc"))]
+        $(#[$doc])*
+        pub struct Regex<A, P = prefilter::None> {
+            prefilter: Option<P>,
+            forward: A,
+            reverse: A,
+            utf8: bool,
+        }
+    };
+}
+
+define_regex_type!(
+    /// A regular expression that uses deterministic finite automata for fast
+    /// searching.
+    ///
+    /// A regular expression is comprised of two DFAs, a "forward" DFA and a
+    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
+    /// a match while the reverse DFA is responsible for detecting the start
+    /// of a match. Thus, in order to find the bounds of any given match, a
+    /// forward search must first be run followed by a reverse search. A match
+    /// found by the forward DFA guarantees that the reverse DFA will also find
+    /// a match.
+    ///
+    /// The type of the DFA used by a `Regex` corresponds to the `A` type
+    /// parameter, which must satisfy the [`Automaton`] trait. Typically,
+    /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
+    /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
+    /// memory but search faster, while sparse DFAs use less memory but search
+    /// more slowly.
+    ///
+    /// By default, a regex's automaton type parameter is set to
+    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
+    /// in-memory workloads, this is the most convenient type that gives the
+    /// best search performance. When the `alloc` feature is disabled, no
+    /// default type is used.
+    ///
+    /// A `Regex` also has a `P` type parameter, which is used to select the
+    /// prefilter used during search. By default, no prefilter is enabled by
+    /// setting the type to default to [`prefilter::None`]. A prefilter can be
+    /// enabled by using the [`Regex::prefilter`] method.
+    ///
+    /// # When should I use this?
+    ///
+    /// Generally speaking, if you can afford the overhead of building a full
+    /// DFA for your regex, and you don't need things like capturing groups,
+    /// then this is a good choice if you're looking to optimize for matching
+    /// speed. Note however that its speed may be worse than a general purpose
+    /// regex engine if you don't select a good [prefilter].
+    ///
+    /// # Earliest vs Leftmost vs Overlapping
+    ///
+    /// The search routines exposed on a `Regex` reflect three different ways
+    /// of searching:
+    ///
+    /// * "earliest" means to stop as soon as a match has been detected.
+    /// * "leftmost" means to continue matching until the underlying
+    ///   automaton cannot advance. This reflects "standard" searching you
+    ///   might be used to in other regex engines. e.g., This permits
+    ///   non-greedy and greedy searching to work as you would expect.
+    /// * "overlapping" means to find all possible matches, even if they
+    ///   overlap.
+    ///
+    /// Generally speaking, when doing an overlapping search, you'll want to
+    /// build your regex DFAs with [`MatchKind::All`] semantics. Using
+    /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
+    /// likely to lead to odd behavior since `LeftmostFirst` specifically omits
+    /// some matches that can never be reported due to its semantics.
+    ///
+    /// The following example shows the differences between how these different
+    /// types of searches impact looking for matches of `[a-z]+` in the
+    /// haystack `abc`.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch};
+    ///
+    /// let pattern = r"[a-z]+";
+    /// let haystack = "abc".as_bytes();
+    ///
+    /// // With leftmost-first semantics, we test "earliest" and "leftmost".
+    /// let re = dfa::regex::Builder::new()
+    ///     .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst))
+    ///     .build(pattern)?;
+    ///
+    /// // "earliest" searching isn't impacted by greediness
+    /// let mut it = re.find_earliest_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// // "leftmost" searching supports greediness (and non-greediness)
+    /// let mut it = re.find_leftmost_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// // For overlapping, we want "all" match kind semantics.
+    /// let re = dfa::regex::Builder::new()
+    ///     .dense(dense::Config::new().match_kind(MatchKind::All))
+    ///     .build(pattern)?;
+    ///
+    /// // In the overlapping search, we find all three possible matches
+    /// // starting at the beginning of the haystack.
+    /// let mut it = re.find_overlapping_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Sparse DFAs
+    ///
+    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
+    /// used with any kind of DFA. While this crate constructs dense DFAs by
+    /// default, it is easy enough to build corresponding sparse DFAs, and then
+    /// build a regex from them:
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// // First, build a regex that uses dense DFAs.
+    /// let dense_re = Regex::new("foo[0-9]+")?;
+    ///
+    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+    /// let fwd = dense_re.forward().to_sparse()?;
+    /// let rev = dense_re.reverse().to_sparse()?;
+    ///
+    /// // Third, build a new regex from the constituent sparse DFAs.
+    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
+    ///
+    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
+    /// more succinctly. (Note though that dense DFAs are still constructed
+    /// first internally, and then converted to sparse DFAs, as in the example
+    /// above.)
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
+    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+    /// assert!(sparse_re.is_match(b"foo123"));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Fallibility
+    ///
+    /// In non-default configurations, the DFAs generated in this module may
+    /// return an error during a search. (Currently, the only way this happens
+    /// is if quit bytes are added or Unicode word boundaries are heuristically
+    /// enabled, both of which are turned off by default.) For convenience, the
+    /// main search routines, like [`find_leftmost`](Regex::find_leftmost),
+    /// will panic if an error occurs. However, if you need to use DFAs
+    /// which may produce an error at search time, then there are fallible
+    /// equivalents of all search routines. For example, for `find_leftmost`,
+    /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost).
+    /// The routines prefixed with `try_` return `Result<Option<MultiMatch>,
+    /// MatchError>`, whereas the infallible routines simply return
+    /// `Option<MultiMatch>`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to cause a search to terminate if it sees a
+    /// `\n` byte, and handle the error returned. This could be useful if, for
+    /// example, you wanted to prevent a user supplied pattern from matching
+    /// across a line boundary.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{self, regex::Regex}, MatchError};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
+    ///     .build(r"foo\p{any}+bar")?;
+    ///
+    /// let haystack = "foo\nbar".as_bytes();
+    /// // Normally this would produce a match, since \p{any} contains '\n'.
+    /// // But since we instructed the automaton to enter a quit state if a
+    /// // '\n' is observed, this produces a match error instead.
+    /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+    /// let got = re.try_find_leftmost(haystack).unwrap_err();
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[derive(Clone, Debug)]
+);
+
+#[cfg(feature = "alloc")]
+impl Regex {
+    /// Parse the given regular expression using the default configuration and
+    /// return the corresponding regex.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 14)),
+    ///     re.find_leftmost(b"zzzfoo12345barzzz"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<Regex, Error> {
+        Builder::new().build(pattern)
+    }
+
+    /// Like `new`, but parses multiple patterns into a single "regex set."
+    /// This similarly uses the default regex configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+    ///
+    /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+    /// assert_eq!(None, it.next());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> {
+        Builder::new().build_many(patterns)
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl Regex<sparse::DFA<Vec<u8>>> {
+    /// Parse the given regular expression using the default configuration,
+    /// except using sparse DFAs, and return the corresponding regex.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 14)),
+    ///     re.find_leftmost(b"zzzfoo12345barzzz"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_sparse(
+        pattern: &str,
+    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+        Builder::new().build_sparse(pattern)
+    }
+
+    /// Like `new`, but parses multiple patterns into a single "regex set"
+    /// using sparse DFAs. This otherwise similarly uses the default regex
+    /// configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
+    ///
+    /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux");
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+    /// assert_eq!(None, it.next());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many_sparse<P: AsRef<str>>(
+        patterns: &[P],
+    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+        Builder::new().build_many_sparse(patterns)
+    }
+}
+
+/// Convenience routines for regex construction.
+#[cfg(feature = "alloc")]
+impl Regex {
+    /// Return a default configuration for a `Regex`.
+    ///
+    /// This is a convenience routine to avoid needing to import the `Config`
+    /// type when customizing the construction of a regex.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to disable UTF-8 mode for `Regex` iteration.
+    /// When UTF-8 mode is disabled, the position immediately following an
+    /// empty match is where the next search begins, instead of the next
+    /// position of a UTF-8 encoded codepoint.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .build(r"")?;
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    /// Return a builder for configuring the construction of a `Regex`.
+    ///
+    /// This is a convenience routine to avoid needing to import the
+    /// [`Builder`] type in common cases.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use the builder to disable UTF-8 mode
+    /// everywhere.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::regex::Regex,
+    ///     nfa::thompson,
+    ///     MultiMatch, SyntaxConfig,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .syntax(SyntaxConfig::new().utf8(false))
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build(r"foo(?-u:[^b])ar.*")?;
+    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+    /// let expected = Some(MultiMatch::must(0, 1, 9));
+    /// let got = re.find_leftmost(haystack);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+}
+
+/// Standard search routines for finding and iterating over matches.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match`](Regex::try_is_match).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// assert_eq!(true, re.is_match(b"foo12345bar"));
+    /// assert_eq!(false, re.is_match(b"foobar"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn is_match(&self, haystack: &[u8]) -> bool {
+        self.is_match_at(haystack, 0, haystack.len())
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest`](Regex::try_find_earliest).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// // Normally, the leftmost first match would greedily consume as many
+    /// // decimal digits as it could. But a match is detected as soon as one
+    /// // digit is seen.
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 4)),
+    ///     re.find_earliest(b"foo12345"),
+    /// );
+    ///
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the "earliest" match semantics detect a match earlier.
+    /// let re = Regex::new("abc|a")?;
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> {
+        self.find_earliest_at(haystack, 0, haystack.len())
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost`](Regex::try_find_leftmost).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// // Greediness is applied appropriately when compared to find_earliest.
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 11)),
+    ///     re.find_leftmost(b"zzzfoo12345zzz"),
+    /// );
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the default leftmost-first match semantics demand that we find the
+    /// // earliest match that prefers earlier parts of the pattern over later
+    /// // parts.
+    /// let re = Regex::new("abc|a")?;
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> {
+        self.find_leftmost_at(haystack, 0, haystack.len())
+    }
+
+    /// Search for the first overlapping match in `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping`](Regex::try_find_overlapping).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an overlapping search with multiple
+    /// regexes.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = dfa::OverlappingState::start();
+    ///
+    /// let expected = Some(MultiMatch::must(1, 0, 4));
+    /// let got = re.find_overlapping(haystack, &mut state);
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(MultiMatch::must(0, 1, 4));
+    /// let got = re.find_overlapping(haystack, &mut state);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_overlapping(
+        &self,
+        haystack: &[u8],
+        state: &mut OverlappingState,
+    ) -> Option<MultiMatch> {
+        self.find_overlapping_at(haystack, 0, haystack.len(), state)
+    }
+
+    /// Returns an iterator over all non-overlapping "earliest" matches.
+    ///
+    /// Match positions are reported as soon as a match is known to occur, even
+    /// if the standard leftmost match would be longer.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error during iteration, then iteration
+    /// panics. This only occurs in non-default configurations where quit bytes
+    /// are used or Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an "earliest" iterator.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::new("[0-9]+")?;
+    /// let haystack = "123".as_bytes();
+    ///
+    /// // Normally, a standard leftmost iterator would return a single
+    /// // match, but since "earliest" detects matches earlier, we get
+    /// // three matches.
+    /// let mut it = re.find_earliest_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_earliest_iter<'r, 't>(
+        &'r self,
+        haystack: &'t [u8],
+    ) -> FindEarliestMatches<'r, 't, A, P> {
+        FindEarliestMatches::new(self, haystack)
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost matches in the
+    /// given bytes. If no match exists, then the iterator yields no elements.
+    ///
+    /// This corresponds to the "standard" regex search iterator.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error during iteration, then iteration
+    /// panics. This only occurs in non-default configurations where quit bytes
+    /// are used or Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let text = b"foo1 foo12 foo123";
+    /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect();
+    /// assert_eq!(matches, vec![
+    ///     MultiMatch::must(0, 0, 4),
+    ///     MultiMatch::must(0, 5, 10),
+    ///     MultiMatch::must(0, 11, 17),
+    /// ]);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_leftmost_iter<'r, 't>(
+        &'r self,
+        haystack: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 't, A, P> {
+        FindLeftmostMatches::new(self, haystack)
+    }
+
+    /// Returns an iterator over all overlapping matches in the given haystack.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// The iterator takes care of handling the overlapping state that must be
+    /// threaded through every search.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error during iteration, then iteration
+    /// panics. This only occurs in non-default configurations where quit bytes
+    /// are used or Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an overlapping search with multiple
+    /// regexes.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .dense(dfa::dense::Config::new().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let haystack = "@foo".as_bytes();
+    ///
+    /// let mut it = re.find_overlapping_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_overlapping_iter<'r, 't>(
+        &'r self,
+        haystack: &'t [u8],
+    ) -> FindOverlappingMatches<'r, 't, A, P> {
+        FindOverlappingMatches::new(self, haystack)
+    }
+}
+
+/// Lower level infallible search routines that permit controlling where
+/// the search starts and ends in a particular sequence. This is useful for
+/// executing searches that need to take surrounding context into account. This
+/// is required for correctly implementing iteration because of look-around
+/// operators (`^`, `$`, `\b`).
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match_at`](Regex::try_is_match_at).
+    pub fn is_match_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> bool {
+        self.try_is_match_at(haystack, start, end).unwrap()
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
+    pub fn find_earliest_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Option<MultiMatch> {
+        self.try_find_earliest_at(haystack, start, end).unwrap()
+    }
+
+    /// Returns the same as `find_leftmost`, but starts the search at the given
+    /// offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, if the DFA is anchored, then
+    /// a match can only occur when `start == 0`.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches within the
+    /// same haystack, which cannot be done correctly by simply providing a
+    /// subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
+    pub fn find_leftmost_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Option<MultiMatch> {
+        self.try_find_leftmost_at(haystack, start, end).unwrap()
+    }
+
+    /// Search for the first overlapping match within a given range of
+    /// `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying DFAs return an error, then this routine panics. This
+    /// only occurs in non-default configurations where quit bytes are used or
+    /// Unicode word boundaries are heuristically enabled.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
+    pub fn find_overlapping_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Option<MultiMatch> {
+        self.try_find_overlapping_at(haystack, start, end, state).unwrap()
+    }
+}
+
+/// Fallible search routines. These may return an error when the underlying
+/// DFAs have been configured in a way that permits them to fail during a
+/// search.
+///
+/// Errors during search only occur when the DFA has been explicitly
+/// configured to do so, usually by specifying one or more "quit" bytes or by
+/// heuristically enabling Unicode word boundaries.
+///
+/// Errors will never be returned using the default configuration. So these
+/// fallible routines are only needed for particular configurations.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`is_match`](Regex::is_match).
+    pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> {
+        self.try_is_match_at(haystack, 0, haystack.len())
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest`](Regex::find_earliest).
+    pub fn try_find_earliest(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_earliest_at(haystack, 0, haystack.len())
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost`](Regex::find_leftmost).
+    pub fn try_find_leftmost(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_leftmost_at(haystack, 0, haystack.len())
+    }
+
+    /// Search for the first overlapping match in `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping`](Regex::find_overlapping).
+    pub fn try_find_overlapping(
+        &self,
+        haystack: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_overlapping_at(haystack, 0, haystack.len(), state)
+    }
+
+    /// Returns an iterator over all non-overlapping "earliest" matches.
+    ///
+    /// Match positions are reported as soon as a match is known to occur, even
+    /// if the standard leftmost match would be longer.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest_iter`](Regex::find_earliest_iter).
+    pub fn try_find_earliest_iter<'r, 't>(
+        &'r self,
+        haystack: &'t [u8],
+    ) -> TryFindEarliestMatches<'r, 't, A, P> {
+        // The returned iterator borrows this regex for `'r` and the haystack
+        // for `'t`; it is built by the iterator type's private constructor.
+        TryFindEarliestMatches::new(self, haystack)
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost matches in the
+    /// given bytes. If no match exists, then the iterator yields no elements.
+    ///
+    /// This corresponds to the "standard" regex search iterator.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
+    pub fn try_find_leftmost_iter<'r, 't>(
+        &'r self,
+        haystack: &'t [u8],
+    ) -> TryFindLeftmostMatches<'r, 't, A, P> {
+        // The returned iterator borrows this regex for `'r` and the haystack
+        // for `'t`; it is built by the iterator type's private constructor.
+        TryFindLeftmostMatches::new(self, haystack)
+    }
+
+    /// Returns an iterator over all overlapping matches in the given haystack.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// The iterator takes care of handling the overlapping state that must be
+    /// threaded through every search.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
+    pub fn try_find_overlapping_iter<'r, 't>(
+        &'r self,
+        haystack: &'t [u8],
+    ) -> TryFindOverlappingMatches<'r, 't, A, P> {
+        // The returned iterator borrows this regex for `'r` and the haystack
+        // for `'t`; it is built by the iterator type's private constructor.
+        TryFindOverlappingMatches::new(self, haystack)
+    }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`is_match_at`](Regex::is_match_at).
+    pub fn try_is_match_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<bool, MatchError> {
+        self.forward()
+            .find_earliest_fwd_at(
+                self.scanner().as_mut(),
+                None,
+                haystack,
+                start,
+                end,
+            )
+            .map(|x| x.is_some())
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest_at`](Regex::find_earliest_at).
+    pub fn try_find_earliest_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_earliest_at_imp(
+            self.scanner().as_mut(),
+            haystack,
+            start,
+            end,
+        )
+    }
+
+    /// The implementation of "earliest" searching, where a prefilter scanner
+    /// may be given.
+    fn try_find_earliest_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        // N.B. We use `&&A` here to call `Automaton` methods, which ensures
+        // that we always use the `impl Automaton for &A` for calling methods.
+        // Since this is the usual way that automata are used, this helps
+        // reduce the number of monomorphized copies of the search code.
+        let (fwd, rev) = (self.forward(), self.reverse());
+        let end = match (&fwd)
+            .find_earliest_fwd_at(pre, None, haystack, start, end)?
+        {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // N.B. The only time we need to tell the reverse searcher the pattern
+        // to match is in the overlapping case, since it's ambiguous. In the
+        // leftmost case, I have tentatively convinced myself that it isn't
+        // necessary and the reverse search will always find the same pattern
+        // to match as the forward search. But I lack a rigorous proof.
+        let start = (&rev)
+            .find_earliest_rev_at(None, haystack, start, end.offset())?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern"
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost_at`](Regex::find_leftmost_at).
+    pub fn try_find_leftmost_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_leftmost_at_imp(
+            self.scanner().as_mut(),
+            haystack,
+            start,
+            end,
+        )
+    }
+
+    /// The implementation of leftmost searching, where a prefilter scanner
+    /// may be given.
+    fn try_find_leftmost_at_imp(
+        &self,
+        scanner: Option<&mut prefilter::Scanner>,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        // N.B. We use `&&A` here to call `Automaton` methods, which ensures
+        // that we always use the `impl Automaton for &A` for calling methods.
+        // Since this is the usual way that automata are used, this helps
+        // reduce the number of monomorphized copies of the search code.
+        let (fwd, rev) = (self.forward(), self.reverse());
+        let end = match (&fwd)
+            .find_leftmost_fwd_at(scanner, None, haystack, start, end)?
+        {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // N.B. The only time we need to tell the reverse searcher the pattern
+        // to match is in the overlapping case, since it's ambiguous. In the
+        // leftmost case, I have tentatively convinced myself that it isn't
+        // necessary and the reverse search will always find the same pattern
+        // to match as the forward search. But I lack a rigorous proof. Why not
+        // just provide the pattern anyway? Well, if it is needed, then leaving
+        // it out gives us a chance to find a witness.
+        let start = (&rev)
+            .find_leftmost_rev_at(None, haystack, start, end.offset())?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+
+    /// Search for the first overlapping match within a given range of
+    /// `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used or Unicode word boundaries are heuristically
+    /// enabled.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping_at`](Regex::find_overlapping_at).
+    pub fn try_find_overlapping_at(
+        &self,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_overlapping_at_imp(
+            self.scanner().as_mut(),
+            haystack,
+            start,
+            end,
+            state,
+        )
+    }
+
+    /// The implementation of overlapping search at a given range in
+    /// `haystack`, where `scanner` is a prefilter (if active) and `state` is
+    /// the current state of the search.
+    fn try_find_overlapping_at_imp(
+        &self,
+        scanner: Option<&mut prefilter::Scanner>,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        // N.B. We use `&&A` here to call `Automaton` methods, which ensures
+        // that we always use the `impl Automaton for &A` for calling methods.
+        // Since this is the usual way that automata are used, this helps
+        // reduce the number of monomorphized copies of the search code.
+        let (fwd, rev) = (self.forward(), self.reverse());
+        // TODO: Decide whether it's worth making this assert work. It doesn't
+        // work currently because 'has_starts_for_each_pattern' isn't on the
+        // Automaton trait. Without this assert, we still get a panic, but it's
+        // a bit more inscrutable.
+        // assert!(
+        // rev.has_starts_for_each_pattern(),
+        // "overlapping searches require that the reverse DFA is \
+        // compiled with the 'starts_for_each_pattern' option",
+        // );
+        let end = match (&fwd).find_overlapping_fwd_at(
+            scanner, None, haystack, start, end, state,
+        )? {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // Unlike the leftmost cases, the reverse overlapping search may match
+        // a different pattern than the forward search. See test failures when
+        // using `None` instead of `Some(end.pattern())` below. Thus, we must
+        // run our reverse search using the pattern that matched in the forward
+        // direction.
+        let start = (&rev)
+            .find_leftmost_rev_at(
+                Some(end.pattern()),
+                haystack,
+                0,
+                end.offset(),
+            )?
+            .expect("reverse search must match if forward search does");
+        assert!(start.offset() <= end.offset());
+        assert_eq!(start.pattern(), end.pattern());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+}
+
+/// Non-search APIs for querying information about the regex and setting a
+/// prefilter.
+impl<A: Automaton, P: Prefilter> Regex<A, P> {
+    /// Attach the given prefilter to this regex.
+    pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> {
+        Regex {
+            prefilter: Some(prefilter),
+            forward: self.forward,
+            reverse: self.reverse,
+            utf8: self.utf8,
+        }
+    }
+
+    /// Remove any prefilter from this regex.
+    pub fn without_prefilter(self) -> Regex<A> {
+        Regex {
+            prefilter: None,
+            forward: self.forward,
+            reverse: self.reverse,
+            utf8: self.utf8,
+        }
+    }
+
+    /// Return the underlying DFA responsible for forward matching.
+    ///
+    /// This is useful for accessing the underlying DFA and converting it to
+    /// some other format or size. See the [`Builder::build_from_dfas`] docs
+    /// for an example of where this might be useful.
+    pub fn forward(&self) -> &A {
+        // Plain borrow of the stored forward DFA; nothing is cloned.
+        &self.forward
+    }
+
+    /// Return the underlying DFA responsible for reverse matching.
+    ///
+    /// This is useful for accessing the underlying DFA and converting it to
+    /// some other format or size. See the [`Builder::build_from_dfas`] docs
+    /// for an example of where this might be useful.
+    pub fn reverse(&self) -> &A {
+        // Plain borrow of the stored reverse DFA; nothing is cloned.
+        &self.reverse
+    }
+
+    /// Returns the total number of patterns matched by this regex.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, dfa::regex::Regex};
+    ///
+    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+    /// assert_eq!(3, re.pattern_count());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn pattern_count(&self) -> usize {
+        assert_eq!(
+            self.forward().pattern_count(),
+            self.reverse().pattern_count()
+        );
+        self.forward().pattern_count()
+    }
+
+    /// Convenience function for returning this regex's prefilter as a trait
+    /// object.
+    ///
+    /// If this regex doesn't have a prefilter, then `None` is returned.
+    pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+        match self.prefilter {
+            None => None,
+            Some(ref x) => Some(&*x),
+        }
+    }
+
+    /// Convenience function for returning a prefilter scanner.
+    fn scanner(&self) -> Option<prefilter::Scanner> {
+        self.prefilter().map(prefilter::Scanner::new)
+    }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// This is a thin wrapper around [`TryFindEarliestMatches`], the fallible
+/// form of the same iterator.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct FindEarliestMatches<'r, 't, A, P>(
+    // The fallible iterator that performs the actual searching.
+    TryFindEarliestMatches<'r, 't, A, P>,
+);
+
+impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> {
+    fn new(
+        re: &'r Regex<A, P>,
+        text: &'t [u8],
+    ) -> FindEarliestMatches<'r, 't, A, P> {
+        FindEarliestMatches(TryFindEarliestMatches::new(re, text))
+    }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+    for FindEarliestMatches<'r, 't, A, P>
+{
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// This is a thin wrapper around [`TryFindLeftmostMatches`], the fallible
+/// form of the same iterator.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct FindLeftmostMatches<'r, 't, A, P>(
+    // The fallible iterator that performs the actual searching.
+    TryFindLeftmostMatches<'r, 't, A, P>,
+);
+
+impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> {
+    fn new(
+        re: &'r Regex<A, P>,
+        text: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 't, A, P> {
+        FindLeftmostMatches(TryFindLeftmostMatches::new(re, text))
+    }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+    for FindLeftmostMatches<'r, 't, A, P>
+{
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all overlapping matches for a particular infallible
+/// search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// This is a thin wrapper around [`TryFindOverlappingMatches`], the fallible
+/// form of the same iterator.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>(
+    // The fallible iterator that performs the actual searching.
+    TryFindOverlappingMatches<'r, 't, A, P>,
+);
+
+impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> {
+    fn new(
+        re: &'r Regex<A, P>,
+        text: &'t [u8],
+    ) -> FindOverlappingMatches<'r, 't, A, P> {
+        FindOverlappingMatches(TryFindOverlappingMatches::new(re, text))
+    }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+    for FindOverlappingMatches<'r, 't, A, P>
+{
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. Each item is wrapped in a `Result`; an `Err` item reports that a
+/// search could not complete.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct TryFindEarliestMatches<'r, 't, A, P> {
+    // The regex whose searches drive this iterator.
+    re: &'r Regex<A, P>,
+    // Prefilter scanner obtained from the regex at construction and reused
+    // across searches; `None` when the regex has no prefilter.
+    scanner: Option<prefilter::Scanner<'r>>,
+    // The haystack being searched.
+    text: &'t [u8],
+    // Offset at which the next search starts.
+    last_end: usize,
+    // End offset of the most recently yielded match, if any. Used to skip an
+    // empty match that sits immediately after a previous match.
+    last_match: Option<usize>,
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> {
+    fn new(
+        re: &'r Regex<A, P>,
+        text: &'t [u8],
+    ) -> TryFindEarliestMatches<'r, 't, A, P> {
+        let scanner = re.scanner();
+        TryFindEarliestMatches {
+            re,
+            scanner,
+            text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+    for TryFindEarliestMatches<'r, 't, A, P>
+{
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_earliest_at_imp(
+            self.scanner.as_mut(),
+            self.text,
+            self.last_end,
+            self.text.len(),
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.re.utf8 {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(Ok(m))
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// fallible search.
+///
+/// The iterator yields a `Result` holding either a [`MultiMatch`] or the
+/// [`MatchError`] reported by the underlying search, until no more matches
+/// could be found.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct TryFindLeftmostMatches<'r, 't, A, P> {
+    // The regex every search is executed with.
+    re: &'r Regex<A, P>,
+    // Prefilter scanner obtained from `re.scanner()`; handed to each search.
+    scanner: Option<prefilter::Scanner<'r>>,
+    // The haystack being searched.
+    text: &'t [u8],
+    // Offset at which the next search starts.
+    last_end: usize,
+    // End offset of the most recently yielded match. Used to drop an empty
+    // match that ends exactly where the previous match ended.
+    last_match: Option<usize>,
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> {
+    fn new(
+        re: &'r Regex<A, P>,
+        text: &'t [u8],
+    ) -> TryFindLeftmostMatches<'r, 't, A, P> {
+        let scanner = re.scanner();
+        TryFindLeftmostMatches {
+            re,
+            scanner,
+            text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+    for TryFindLeftmostMatches<'r, 't, A, P>
+{
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_leftmost_at_imp(
+            self.scanner.as_mut(),
+            self.text,
+            self.last_end,
+            self.text.len(),
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.re.utf8 {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(Ok(m))
+    }
+}
+
+/// An iterator over all overlapping matches for a particular fallible search.
+///
+/// The iterator yields a `Result` holding either a [`MultiMatch`] or the
+/// [`MatchError`] reported by the underlying search, until no more matches
+/// could be found.
+///
+/// `A` is the type used to represent the underlying DFAs used by the regex,
+/// while `P` is the type of prefilter used, if any. The lifetime variables are
+/// as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Clone, Debug)]
+pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> {
+    // The regex every search is executed with.
+    re: &'r Regex<A, P>,
+    // Prefilter scanner obtained from `re.scanner()`; handed to each search.
+    scanner: Option<prefilter::Scanner<'r>>,
+    // The haystack being searched.
+    text: &'t [u8],
+    // Offset at which the next search starts; set to the end of each match.
+    last_end: usize,
+    // Overlapping-search state, created with `OverlappingState::start()` and
+    // passed by mutable reference to every search call.
+    state: OverlappingState,
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter>
+    TryFindOverlappingMatches<'r, 't, A, P>
+{
+    fn new(
+        re: &'r Regex<A, P>,
+        text: &'t [u8],
+    ) -> TryFindOverlappingMatches<'r, 't, A, P> {
+        let scanner = re.scanner();
+        TryFindOverlappingMatches {
+            re,
+            scanner,
+            text,
+            last_end: 0,
+            state: OverlappingState::start(),
+        }
+    }
+}
+
+impl<'r, 't, A: Automaton, P: Prefilter> Iterator
+    for TryFindOverlappingMatches<'r, 't, A, P>
+{
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_overlapping_at_imp(
+            self.scanner.as_mut(),
+            self.text,
+            self.last_end,
+            self.text.len(),
+            &mut self.state,
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        // Unlike the non-overlapping case, we're OK with empty matches at this
+        // level. In particular, the overlapping search algorithm is itself
+        // responsible for ensuring that progress is always made.
+        self.last_end = m.end();
+        Some(Ok(m))
+    }
+}
+
+/// The configuration used for compiling a DFA-backed regex.
+///
+/// A regex configuration is a simple data object that is typically used with
+/// [`Builder::configure`].
+#[cfg(feature = "alloc")]
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // `None` means the option was never set explicitly; `get_utf8` then
+    // reports `true`, and `overwrite` lets the other configuration's value
+    // win.
+    utf8: Option<bool>,
+}
+
+#[cfg(feature = "alloc")]
+impl Config {
+    /// Return a new default regex compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Whether to enable UTF-8 mode or not.
+    ///
+    /// When UTF-8 mode is enabled (the default) and an empty match is seen,
+    /// the iterators on [`Regex`] will always start the next search at the
+    /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
+    /// mode is disabled, such searches are begun at the next byte offset.
+    ///
+    /// If this mode is enabled and invalid UTF-8 is given to search, then
+    /// behavior is unspecified.
+    ///
+    /// Generally speaking, one should enable this when
+    /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
+    /// and
+    /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+    /// are enabled, and disable it otherwise.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates the differences between when this option is
+    /// enabled and disabled. The differences only arise when the regex can
+    /// return matches of length zero.
+    ///
+    /// In this first snippet, we show the results when UTF-8 mode is disabled.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .build(r"")?;
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And in this snippet, we execute the same search on the same haystack,
+    /// but with UTF-8 mode enabled. Notice that byte offsets that would
+    /// otherwise split the encoding of `☃` are not returned.
+    ///
+    /// ```
+    /// use regex_automata::{dfa::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(true))
+    ///     .build(r"")?;
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    /// Returns true if and only if this configuration has UTF-8 mode enabled.
+    ///
+    /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
+    /// [`Regex`] will always start the next search at the next UTF-8 encoded
+    /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
+    /// next byte offset.
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    /// Overwrite the default configuration such that the options in `o` are
+    /// always used. If an option in `o` is not set, then the corresponding
+    /// option in `self` is used. If it's not set in `self` either, then it
+    /// remains not set.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config { utf8: o.utf8.or(self.utf8) }
+    }
+}
+
+/// A builder for a regex based on deterministic finite automatons.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the DFA construction and finally the regex searching
+/// itself. This builder is different from a general purpose regex builder in
+/// that it permits fine grain configuration of the construction process. The
+/// trade off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are three
+/// different UTF-8 modes:
+///
+/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
+/// pattern itself can contain sub-expressions that match invalid UTF-8.
+/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+/// controls whether the implicit unanchored prefix added to the NFA can
+/// match through invalid UTF-8 or not.
+/// * [`Config::utf8`] controls how the regex iterators themselves advance
+/// the starting position of the next search when a match with zero length is
+/// found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two DFAs, where one is
+/// responsible for finding the end of a match and the other is responsible
+/// for finding the start of a match. If you only need to detect whether
+/// something matched, or only the end of a match, then you should use a
+/// [`dense::Builder`] to construct a single DFA, which is cheaper than
+/// building two DFAs.
+///
+/// # Build methods
+///
+/// This builder has a few "build" methods. In general, it's the result of
+/// combining the following parameters:
+///
+/// * Building one or many regexes.
+/// * Building a regex with dense or sparse DFAs.
+///
+/// The simplest "build" method is [`Builder::build`]. It accepts a single
+/// pattern and builds a dense DFA using `usize` for the state identifier
+/// representation.
+///
+/// The most general "build" method is [`Builder::build_many`], which permits
+/// building a regex that searches for multiple patterns simultaneously while
+/// using a specific state identifier representation.
+///
+/// The most flexible "build" method, but hardest to use, is
+/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
+/// just a pair of DFAs, and this method allows you to specify those DFAs
+/// exactly.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
+/// the regex itself. This is generally what you want for matching on
+/// arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+///     dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// };
+///
+/// let re = Regex::builder()
+///     .configure(Regex::config().utf8(false))
+///     .syntax(SyntaxConfig::new().utf8(false))
+///     .thompson(thompson::Config::new().utf8(false))
+///     .build(r"foo(?-u:[^b])ar.*")?;
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(MultiMatch::must(0, 1, 9));
+/// let got = re.find_leftmost(haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this. Notice also that the
+/// // search was unanchored and skipped over invalid UTF-8.
+/// // Disabling UTF-8 on the Thompson NFA permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub struct Builder {
+    // Regex-level options; replaced by the merged result in `configure` and
+    // read (via `get_utf8`) in `build_from_dfas`.
+    config: Config,
+    // Dense DFA builder that `syntax`, `thompson` and `dense` forward to. It
+    // builds the forward DFA and, through a reconfigured clone, the reverse
+    // DFA in `build_many`.
+    dfa: dense::Builder,
+}
+
+#[cfg(feature = "alloc")]
+impl Builder {
+    /// Create a new regex builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), dfa: dense::Builder::new() }
+    }
+
+    /// Build a regex from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<Regex, Error> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a regex from the given pattern using sparse DFAs.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build_sparse(
+        &self,
+        pattern: &str,
+    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+        self.build_many_sparse(&[pattern])
+    }
+
+    /// Build a regex from the given patterns.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<Regex, Error> {
+        let forward = self.dfa.build_many(patterns)?;
+        let reverse = self
+            .dfa
+            .clone()
+            .configure(
+                dense::Config::new()
+                    .anchored(true)
+                    .match_kind(MatchKind::All)
+                    .starts_for_each_pattern(true),
+            )
+            .thompson(thompson::Config::new().reverse(true))
+            .build_many(patterns)?;
+        Ok(self.build_from_dfas(forward, reverse))
+    }
+
+    /// Build a sparse regex from the given patterns.
+    pub fn build_many_sparse<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> {
+        let re = self.build_many(patterns)?;
+        let forward = re.forward().to_sparse()?;
+        let reverse = re.reverse().to_sparse()?;
+        Ok(self.build_from_dfas(forward, reverse))
+    }
+
+    /// Build a regex from its component forward and reverse DFAs.
+    ///
+    /// This is useful when deserializing a regex from some arbitrary
+    /// memory region. This is also useful for building regexes from other
+    /// types of DFAs.
+    ///
+    /// If you're building the DFAs from scratch instead of building new DFAs
+    /// from other DFAs, then you'll need to make sure that the reverse DFA is
+    /// configured correctly to match the intended semantics. Namely:
+    ///
+    /// * It should be anchored.
+    /// * It should use [`MatchKind::All`] semantics.
+    /// * It should match in reverse.
+    /// * It should have anchored start states compiled for each pattern.
+    /// * Otherwise, its configuration should match the forward DFA.
+    ///
+    /// If these conditions are not satisfied, then behavior of searches is
+    /// unspecified.
+    ///
+    /// Note that when using this constructor, only the configuration from
+    /// [`Config`] is applied. The other configuration settings on this
+    /// builder only apply when the builder owns the construction of the DFAs
+    /// themselves.
+    ///
+    /// # Example
+    ///
+    /// This example is a bit contrived. The usual use of these methods
+    /// would involve serializing `initial_re` somewhere and then deserializing
+    /// it later to build a regex. But in this case, we do everything in
+    /// memory.
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// let initial_re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(true, initial_re.is_match(b"foo123"));
+    ///
+    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
+    /// let re = Regex::builder().build_from_dfas(fwd, rev);
+    /// assert_eq!(true, re.is_match(b"foo123"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// This example shows how to build a `Regex` that uses sparse DFAs instead
+    /// of dense DFAs without using one of the convenience `build_sparse`
+    /// routines:
+    ///
+    /// ```
+    /// use regex_automata::dfa::regex::Regex;
+    ///
+    /// let initial_re = Regex::new("foo[0-9]+")?;
+    /// assert_eq!(true, initial_re.is_match(b"foo123"));
+    ///
+    /// let fwd = initial_re.forward().to_sparse()?;
+    /// let rev = initial_re.reverse().to_sparse()?;
+    /// let re = Regex::builder().build_from_dfas(fwd, rev);
+    /// assert_eq!(true, re.is_match(b"foo123"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn build_from_dfas<A: Automaton>(
+        &self,
+        forward: A,
+        reverse: A,
+    ) -> Regex<A> {
+        // Only the UTF-8 option is taken from this builder; the resulting
+        // regex is created without a prefilter.
+        let utf8 = self.config.get_utf8();
+        Regex { prefilter: None, forward, reverse, utf8 }
+    }
+
+    /// Apply the given regex configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    ///
+    /// The configuration is handed to the inner dense DFA builder, so it has
+    /// no effect on regexes assembled with [`Builder::build_from_dfas`].
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.dfa.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](thompson::Config).
+    ///
+    /// This permits setting things like whether additional time should be
+    /// spent shrinking the size of the NFA.
+    ///
+    /// The configuration is handed to the inner dense DFA builder, so it has
+    /// no effect on regexes assembled with [`Builder::build_from_dfas`]. When
+    /// building the reverse DFA, [`Builder::build_many`] additionally applies
+    /// a Thompson configuration with `reverse(true)` to a copy of that
+    /// builder.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.dfa.thompson(config);
+        self
+    }
+
+    /// Set the dense DFA compilation configuration for this builder using
+    /// [`dense::Config`](dense::Config).
+    ///
+    /// This permits setting things like whether the underlying DFAs should
+    /// be minimized.
+    ///
+    /// The configuration is handed to the inner dense DFA builder, so it has
+    /// no effect on regexes assembled with [`Builder::build_from_dfas`]. When
+    /// building the reverse DFA, [`Builder::build_many`] additionally applies
+    /// its own dense configuration (anchored, [`MatchKind::All`], start
+    /// states for each pattern) to a copy of that builder.
+    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
+        self.dfa.configure(config);
+        self
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl Default for Builder {
+    fn default() -> Builder {
+        Builder::new()
+    }
+}
+
+#[inline(always)]
+fn next_unwrap(
+    item: Option<Result<MultiMatch, MatchError>>,
+) -> Option<MultiMatch> {
+    match item {
+        None => None,
+        Some(Ok(m)) => Some(m),
+        Some(Err(err)) => panic!(
+            "unexpected regex search error: {}\n\
+             to handle search errors, use try_ methods",
+            err,
+        ),
+    }
+}
diff --git a/src/dfa/search.rs b/src/dfa/search.rs

new file mode 100644 (file)

index 0000000..4924149
--- /dev/null
+++ b/src/dfa/search.rs
@@ -0,0 +1,493 @@
+use crate::{
+    dfa::{
+        accel,
+        automaton::{Automaton, OverlappingState, StateMatch},
+    },
+    util::{
+        id::{PatternID, StateID},
+        matchtypes::HalfMatch,
+        prefilter, MATCH_OFFSET,
+    },
+    MatchError,
+};
+
+/// Forward search that returns as soon as a match state is entered.
+///
+/// Calls `find_fwd` with `earliest` set to `true`. The prefilter `pre` is
+/// only forwarded when no `pattern_id` is given; otherwise `None` is passed
+/// in its place.
+///
+/// The two separate `find_fwd` calls are deliberate: `find_fwd` is
+/// `inline(always)` and, per its own documentation, relies on the 'pre'
+/// argument being inlined to eliminate branches.
+#[inline(never)]
+pub fn find_earliest_fwd<A: Automaton + ?Sized>(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, so we should never use
+    // a prefilter.
+    if pre.is_some() && pattern_id.is_none() {
+        find_fwd(pre, true, dfa, pattern_id, bytes, start, end)
+    } else {
+        find_fwd(None, true, dfa, pattern_id, bytes, start, end)
+    }
+}
+
+/// Forward search that keeps scanning after a match state is entered and
+/// returns the last match recorded before the search stops.
+///
+/// Calls `find_fwd` with `earliest` set to `false`. The prefilter `pre` is
+/// only forwarded when no `pattern_id` is given; otherwise `None` is passed
+/// in its place.
+///
+/// The two separate `find_fwd` calls are deliberate: `find_fwd` is
+/// `inline(always)` and, per its own documentation, relies on the 'pre'
+/// argument being inlined to eliminate branches.
+#[inline(never)]
+pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, so we should never use
+    // a prefilter.
+    if pre.is_some() && pattern_id.is_none() {
+        find_fwd(pre, false, dfa, pattern_id, bytes, start, end)
+    } else {
+        find_fwd(None, false, dfa, pattern_id, bytes, start, end)
+    }
+}
+
+/// Shared implementation of the forward "earliest" and "leftmost" searches
+/// over `haystack[start..end]`.
+///
+/// When `earliest` is true, the search returns as soon as a match state is
+/// entered. Otherwise it keeps scanning and returns the last match recorded
+/// once a dead state, a quit state or the end of the search range is reached.
+///
+/// Returns `Err(MatchError::Quit { .. })` when a quit state is entered before
+/// any match was recorded. Errors from `init_fwd` and `eoi_fwd` are
+/// propagated.
+///
+/// Panics unless `start <= end <= haystack.len()`.
+///
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
+/// getting inlined eliminate some critical branches. To avoid bloating binary
+/// size, we only call this function in a fixed number of places.
+#[inline(always)]
+fn find_fwd<A: Automaton + ?Sized>(
+    mut pre: Option<&mut prefilter::Scanner>,
+    earliest: bool,
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    haystack: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= haystack.len());
+    assert!(end <= haystack.len());
+
+    // Why do this? This lets 'bytes[at]' work without bounds checks below.
+    // It seems the assert on 'end <= haystack.len()' above is otherwise
+    // not enough. Why not just make 'bytes' scoped this way anyway? Well,
+    // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
+    // for resolving look-ahead.
+    let bytes = &haystack[..end];
+
+    let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
+    let mut last_match = None;
+    let mut at = start;
+    if let Some(ref mut pre) = pre {
+        // If a prefilter doesn't report false positives, then we don't need to
+        // touch the DFA at all. However, since all matches include the pattern
+        // ID, and the prefilter infrastructure doesn't report pattern IDs, we
+        // limit this optimization to cases where there is exactly one pattern.
+        // In that case, any match must be the 0th pattern.
+        //
+        // NOTE(review): the offset returned by `next_candidate` is used
+        // directly as the `HalfMatch` offset here, while the loop below
+        // reports the offset at which a match state was entered. Confirm
+        // that the candidate offset denotes the end of the match and not the
+        // start of the candidate.
+        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
+            return Ok(pre.next_candidate(bytes, at).into_option().map(
+                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
+            ));
+        } else if pre.is_effective(at) {
+            // No candidate at all means no match; otherwise skip straight to
+            // the candidate position before running the DFA.
+            match pre.next_candidate(bytes, at).into_option() {
+                None => return Ok(None),
+                Some(i) => {
+                    at = i;
+                }
+            }
+        }
+    }
+    while at < end {
+        let byte = bytes[at];
+        state = dfa.next_state(state, byte);
+        at += 1;
+        if dfa.is_special_state(state) {
+            if dfa.is_start_state(state) {
+                // Back in a start state: let the prefilter (if any) jump to
+                // the next candidate, or else use the state's accelerator.
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                } else if dfa.is_accel_state(state) {
+                    // 'bytes.len()' equals 'end', so a failed accelerator
+                    // search terminates the loop.
+                    let needles = dfa.accelerator(state);
+                    at = accel::find_fwd(needles, bytes, at)
+                        .unwrap_or(bytes.len());
+                }
+            } else if dfa.is_match_state(state) {
+                // 'at' was already advanced past the byte just consumed; the
+                // reported offset is 'MATCH_OFFSET' bytes before it.
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(state, 0),
+                    offset: at - MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+                if dfa.is_accel_state(state) {
+                    let needles = dfa.accelerator(state);
+                    at = accel::find_fwd(needles, bytes, at)
+                        .unwrap_or(bytes.len());
+                }
+            } else if dfa.is_accel_state(state) {
+                let needs = dfa.accelerator(state);
+                at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
+            } else if dfa.is_dead_state(state) {
+                return Ok(last_match);
+            } else {
+                // A match recorded earlier takes precedence over reporting
+                // the quit. 'at - 1' is the position of the byte that led to
+                // the quit state, since 'at' was incremented above.
+                debug_assert!(dfa.is_quit_state(state));
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            }
+        }
+        // Skip over bytes that leave the DFA in the same state.
+        while at < end && dfa.next_state(state, bytes[at]) == state {
+            at += 1;
+        }
+    }
+    // End of the search range: a match produced by 'eoi_fwd' wins over any
+    // match recorded during the scan.
+    Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
+}
+
+/// Reverse search that returns as soon as a match state is entered.
+///
+/// Calls `find_rev` with `earliest` set to `true`; all other arguments are
+/// passed through unchanged.
+#[inline(never)]
+pub fn find_earliest_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    find_rev(true, dfa, pattern_id, bytes, start, end)
+}
+
+/// Reverse search that keeps scanning after a match state is entered and
+/// returns the last match recorded before the search stops.
+///
+/// Calls `find_rev` with `earliest` set to `false`; all other arguments are
+/// passed through unchanged.
+#[inline(never)]
+pub fn find_leftmost_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    find_rev(false, dfa, pattern_id, bytes, start, end)
+}
+
+/// Shared implementation of the reverse "earliest" and "leftmost" searches.
+/// The scan starts at `end` and walks backwards towards `start`.
+///
+/// When `earliest` is true, the search returns as soon as a match state is
+/// entered. Otherwise it keeps scanning and returns the last match recorded
+/// once a dead state, a quit state or `start` is reached.
+///
+/// Returns `Err(MatchError::Quit { .. })` when a quit state is entered before
+/// any match was recorded. Errors from `init_rev` and `eoi_rev` are
+/// propagated.
+///
+/// Panics unless `start <= end <= bytes.len()`.
+///
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
+/// permits eliminating a few crucial branches.
+#[inline(always)]
+fn find_rev<A: Automaton + ?Sized>(
+    earliest: bool,
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
+    let mut last_match = None;
+    let mut at = end;
+    while at > start {
+        at -= 1;
+        // Skip over bytes that leave the DFA in the same state.
+        while at > start && dfa.next_state(state, bytes[at]) == state {
+            at -= 1;
+        }
+
+        let byte = bytes[at];
+        state = dfa.next_state(state, byte);
+        if dfa.is_special_state(state) {
+            if dfa.is_start_state(state) {
+                if dfa.is_accel_state(state) {
+                    // A failed accelerator search sets 'at' to 0, which ends
+                    // the outer loop because 'at > start' is then false.
+                    let needles = dfa.accelerator(state);
+                    at = accel::find_rev(needles, bytes, at)
+                        .map(|i| i + 1)
+                        .unwrap_or(0);
+                }
+            } else if dfa.is_match_state(state) {
+                // 'at' is the position of the byte just consumed; the
+                // reported offset is 'MATCH_OFFSET' bytes after it.
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(state, 0),
+                    offset: at + MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+                if dfa.is_accel_state(state) {
+                    let needles = dfa.accelerator(state);
+                    at = accel::find_rev(needles, bytes, at)
+                        .map(|i| i + 1)
+                        .unwrap_or(0);
+                }
+            } else if dfa.is_accel_state(state) {
+                let needles = dfa.accelerator(state);
+                at = accel::find_rev(needles, bytes, at)
+                    .map(|i| i + 1)
+                    .unwrap_or(0);
+            } else if dfa.is_dead_state(state) {
+                return Ok(last_match);
+            } else {
+                // A match recorded earlier takes precedence over reporting
+                // the quit. 'at' is the position of the byte that led to the
+                // quit state.
+                debug_assert!(dfa.is_quit_state(state));
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                return Err(MatchError::Quit { byte, offset: at });
+            }
+        }
+    }
+    // Start of the search range: a match produced by 'eoi_rev' wins over any
+    // match recorded during the scan.
+    Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
+}
+
+/// Forward overlapping search entry point.
+///
+/// Calls `find_overlapping_fwd_imp`, forwarding the prefilter `pre` only when
+/// no `pattern_id` is given; otherwise `None` is passed in its place.
+/// `caller_state` is handed through by mutable reference so the search can be
+/// resumed by a later call.
+///
+/// The two separate calls are deliberate: the implementation function is
+/// `inline(always)` and, per its own documentation, relies on the 'pre'
+/// argument being inlined to eliminate branches.
+#[inline(never)]
+pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+    caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, so we should only ever
+    // use a prefilter when no pattern ID is given.
+    if pre.is_some() && pattern_id.is_none() {
+        find_overlapping_fwd_imp(
+            pre,
+            dfa,
+            pattern_id,
+            bytes,
+            start,
+            end,
+            caller_state,
+        )
+    } else {
+        find_overlapping_fwd_imp(
+            None,
+            dfa,
+            pattern_id,
+            bytes,
+            start,
+            end,
+            caller_state,
+        )
+    }
+}
+
+/// Runs a resumable overlapping forward search over `bytes[start..end]`.
+///
+/// `caller_state` carries the search position between calls. When it holds no
+/// state ID, the search begins at the start state computed by `init_fwd`.
+/// When it holds a state ID and a previous match whose `match_index` is
+/// still below `dfa.match_count(id)`, the next pattern for that same offset
+/// is returned without reading any input. Otherwise scanning resumes from
+/// the stored state at `start + MATCH_OFFSET`.
+///
+/// Returns `Ok(Some(_))` for a match, `Ok(None)` when a dead state is
+/// entered, when the prefilter yields no candidate or when the input is
+/// exhausted without a match, and `Err(MatchError::Quit { .. })` when a quit
+/// state is entered.
+///
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
+/// permits eliminating a few crucial branches and reduces code size when it is
+/// not used.
+#[inline(always)]
+fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
+    mut pre: Option<&mut prefilter::Scanner>,
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    mut start: usize,
+    end: usize,
+    caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    // Either begin a fresh search or pick up where the previous call stopped.
+    let mut state = match caller_state.id() {
+        None => init_fwd(dfa, pattern_id, bytes, start, end)?,
+        Some(id) => {
+            // Before consuming more input, hand out any patterns that are
+            // still pending at the offset of the previously reported match.
+            if let Some(last) = caller_state.last_match() {
+                let match_count = dfa.match_count(id);
+                if last.match_index < match_count {
+                    let m = HalfMatch {
+                        pattern: dfa.match_pattern(id, last.match_index),
+                        offset: last.offset,
+                    };
+                    last.match_index += 1;
+                    return Ok(Some(m));
+                }
+            }
+
+            // This is a subtle but critical detail. If the caller provides a
+            // non-None state ID, then it must be the case that the state ID
+            // corresponds to one set by this function. The state ID therefore
+            // corresponds to a match state, a dead state or some other state.
+            // However, "some other" state _only_ occurs when the input has
+            // been exhausted because the only way to stop before then is to
+            // see a match or a dead/quit state.
+            //
+            // If the input is exhausted or if it's a dead state, then
+            // incrementing the starting position has no relevance on
+            // correctness, since the loop below will either not execute
+            // at all or will immediately stop due to being in a dead state.
+            // (Once in a dead state it is impossible to leave it.)
+            //
+            // Therefore, the only case we need to consider is when
+            // caller_state is a match state. In this case, since our machines
+            // support the ability to delay a match by a certain number of
+            // bytes (to support look-around), it follows that we actually
+            // consumed that many additional bytes on our previous search. When
+            // the caller resumes their search to find subsequent matches, they
+            // will use the ending location from the previous match as the next
+            // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
+            // we scanned to on the previous search. Therefore, we need to
+            // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
+            //
+            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
+            // dealing with empty matches convenient. Namely, callers needn't
+            // special case them when implementing an iterator. Instead, this
+            // ensures that forward progress is always made.
+            start += MATCH_OFFSET;
+            id
+        }
+    };
+
+    let mut at = start;
+    while at < end {
+        let byte = bytes[at];
+        state = dfa.next_state(state, byte);
+        at += 1;
+        if dfa.is_special_state(state) {
+            // Every special state is written back so that the state ID held
+            // by the caller always reflects where this call stopped.
+            caller_state.set_id(state);
+            if dfa.is_start_state(state) {
+                // Back in a start state: skip ahead using the prefilter when
+                // one was given, otherwise use the state's accelerator.
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                } else if dfa.is_accel_state(state) {
+                    let needles = dfa.accelerator(state);
+                    at = accel::find_fwd(needles, bytes, at)
+                        .unwrap_or(bytes.len());
+                }
+            } else if dfa.is_match_state(state) {
+                // Pattern index 0 is reported here; storing `match_index: 1`
+                // makes the next call continue with the following pattern.
+                let offset = at - MATCH_OFFSET;
+                caller_state
+                    .set_last_match(StateMatch { match_index: 1, offset });
+                return Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(state, 0),
+                    offset,
+                }));
+            } else if dfa.is_accel_state(state) {
+                let needs = dfa.accelerator(state);
+                at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
+            } else if dfa.is_dead_state(state) {
+                return Ok(None);
+            } else {
+                debug_assert!(dfa.is_quit_state(state));
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            }
+        }
+    }
+
+    // The loop ran out of input. Take the one trailing transition handled by
+    // `eoi_fwd`, then record the resulting state (and match, if any) so that
+    // a subsequent call resumes consistently.
+    let result = eoi_fwd(dfa, bytes, end, &mut state);
+    caller_state.set_id(state);
+    if let Ok(Some(ref last_match)) = result {
+        caller_state.set_last_match(StateMatch {
+            match_index: 1,
+            offset: last_match.offset(),
+        });
+    }
+    result
+}
+
+/// Returns the state from which a forward search starts.
+///
+/// The state is whatever `dfa.start_state_forward` yields for the given
+/// pattern ID, haystack and `start..end` bounds. This currently always
+/// returns `Ok`; it panics if the start state is a match state.
+fn init_fwd<A: Automaton + ?Sized>(
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<StateID, MatchError> {
+    let state = dfa.start_state_forward(pattern_id, bytes, start, end);
+    // Start states can never be match states, since all matches are delayed
+    // by 1 byte.
+    assert!(!dfa.is_match_state(state));
+    Ok(state)
+}
+
+/// Returns the state from which a reverse search starts.
+///
+/// The state is whatever `dfa.start_state_reverse` yields for the given
+/// pattern ID, haystack and `start..end` bounds. This currently always
+/// returns `Ok`; it panics if the start state is a match state.
+fn init_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<StateID, MatchError> {
+    let state = dfa.start_state_reverse(pattern_id, bytes, start, end);
+    // Start states can never be match states, since all matches are delayed
+    // by 1 byte.
+    assert!(!dfa.is_match_state(state));
+    Ok(state)
+}
+
+fn eoi_fwd<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    end: usize,
+    state: &mut StateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    match bytes.get(end) {
+        Some(&b) => {
+            *state = dfa.next_state(*state, b);
+            if dfa.is_match_state(*state) {
+                Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(*state, 0),
+                    offset: end,
+                }))
+            } else {
+                Ok(None)
+            }
+        }
+        None => {
+            *state = dfa.next_eoi_state(*state);
+            if dfa.is_match_state(*state) {
+                Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(*state, 0),
+                    offset: bytes.len(),
+                }))
+            } else {
+                Ok(None)
+            }
+        }
+    }
+}
+
+fn eoi_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    start: usize,
+    state: StateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    if start > 0 {
+        let state = dfa.next_state(state, bytes[start - 1]);
+        if dfa.is_match_state(state) {
+            Ok(Some(HalfMatch {
+                pattern: dfa.match_pattern(state, 0),
+                offset: start,
+            }))
+        } else {
+            Ok(None)
+        }
+    } else {
+        let state = dfa.next_eoi_state(state);
+        if dfa.is_match_state(state) {
+            Ok(Some(HalfMatch {
+                pattern: dfa.match_pattern(state, 0),
+                offset: 0,
+            }))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+// Currently unused, but is useful to keep around. This was originally used
+// when the code above used raw pointers for its main loop.
+// /// Returns the distance between the given pointer and the start of `bytes`.
+// /// This assumes that the given pointer points to somewhere in the `bytes`
+// /// slice given.
+// fn offset(bytes: &[u8], p: *const u8) -> usize {
+// debug_assert!(bytes.as_ptr() <= p);
+// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
+// ((p as isize) - (bytes.as_ptr() as isize)) as usize
+// }
diff --git a/src/dfa/search_unsafe.rs b/src/dfa/search_unsafe.rs

new file mode 100644 (file)

index 0000000..ea1c29f
--- /dev/null
+++ b/src/dfa/search_unsafe.rs
@@ -0,0 +1,321 @@
+use crate::dfa::automaton::{Automaton, State};
+use crate::MatchError;
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
+/// permits eliminating a few crucial branches.
+#[inline(always)]
+pub fn find_fwd<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+    earliest: bool,
+) -> Result<Option<usize>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?;
+    if earliest && last_match.is_some() {
+        return Ok(last_match);
+    }
+
+    let mut at = start;
+    while at < end {
+        let byte = bytes[at];
+        state = dfa.next_state(state, byte);
+        at += 1;
+        if dfa.is_special_state(state) {
+            if dfa.is_dead_state(state) {
+                return Ok(last_match);
+            } else if dfa.is_quit_state(state) {
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            }
+            last_match = Some(at - dfa.match_offset());
+            if earliest {
+                return Ok(last_match);
+            }
+        }
+    }
+    /*
+    unsafe {
+        let mut p = bytes.as_ptr().add(start);
+        while p < bytes[end..].as_ptr() {
+            let byte = *p;
+            state = dfa.next_state_unchecked(state, byte);
+            p = p.add(1);
+            if dfa.is_special_state(state) {
+                if dfa.is_dead_state(state) {
+                    return Ok(last_match);
+                } else if dfa.is_quit_state(state) {
+                    return Err(MatchError::Quit {
+                        byte,
+                        offset: offset(bytes, p) - 1,
+                    });
+                }
+                last_match = Some(offset(bytes, p) - dfa.match_offset());
+                if earliest {
+                    return Ok(last_match);
+                }
+            }
+        }
+    }
+    */
+    Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match))
+}
+
+/// This is marked as `inline(always)` specifically because it supports
+/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
+/// permits eliminating a few crucial branches.
+#[inline(always)]
+pub fn find_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+    earliest: bool,
+) -> Result<Option<usize>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?;
+    if earliest && last_match.is_some() {
+        return Ok(last_match);
+    }
+
+    let mut at = end;
+    while at > start {
+        at -= 1;
+        let byte = bytes[at];
+        state = dfa.next_state(state, byte);
+        if dfa.is_special_state(state) {
+            if dfa.is_dead_state(state) {
+                return Ok(last_match);
+            } else if dfa.is_quit_state(state) {
+                return Err(MatchError::Quit { byte, offset: at });
+            }
+            last_match = Some(at + dfa.match_offset());
+            if earliest {
+                return Ok(last_match);
+            }
+        }
+    }
+    /*
+    unsafe {
+        let mut p = bytes.as_ptr().add(end);
+        while p > bytes[start..].as_ptr() {
+            p = p.sub(1);
+            let byte = *p;
+            state = dfa.next_state_unchecked(state, byte);
+            if dfa.is_special_state(state) {
+                if dfa.is_dead_state(state) {
+                    return Ok(last_match);
+                } else if dfa.is_quit_state(state) {
+                    return Err(MatchError::Quit {
+                        byte,
+                        offset: offset(bytes, p),
+                    });
+                }
+                last_match = Some(offset(bytes, p) + dfa.match_offset());
+                if earliest {
+                    return Ok(last_match);
+                }
+            }
+        }
+    }
+    */
+    Ok(eof_rev(dfa, state, bytes, start)?.or(last_match))
+}
+
+/// Runs a resumable overlapping forward search over `bytes[start..end]`.
+///
+/// `caller_state` carries the DFA state between calls. When it is empty, the
+/// search begins at the start state computed by `init_fwd`; otherwise it
+/// resumes from the stored state after advancing `start` by
+/// `dfa.match_offset()`. The state in which the search stops is written back
+/// to `caller_state` before returning, except when the adjusted `start`
+/// already lies beyond `end`.
+///
+/// Returns `Ok(Some(offset))` for a match, `Ok(None)` when a dead state is
+/// entered or no match is found, and `Err(MatchError::Quit { .. })` when a
+/// quit state is entered.
+pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    mut start: usize,
+    end: usize,
+    caller_state: &mut State<A::ID>,
+) -> Result<Option<usize>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    // Either begin a fresh search or pick up where the previous call stopped.
+    let (mut state, mut last_match) = match caller_state.as_option() {
+        None => init_fwd(dfa, bytes, start, end)?,
+        Some(id) => {
+            // This is a subtle but critical detail. If the caller provides a
+            // non-None state ID, then it must be the case that the state ID
+            // corresponds to one set by this function. The state ID therefore
+            // corresponds to a match state, a dead state or some other state.
+            // However, "some other" state _only_ occurs when the input has
+            // been exhausted because the only way to stop before then is to
+            // see a match or a dead/quit state.
+            //
+            // If the input is exhausted or if it's a dead state, then
+            // incrementing the starting position has no relevance on
+            // correctness, since the loop below will either not execute
+            // at all or will immediately stop due to being in a dead state.
+            // (Once in a dead state it is impossible to leave it.)
+            //
+            // Therefore, the only case we need to consider is when
+            // caller_state is a match state. In this case, since our machines
+            // support the ability to delay a match by a certain number of
+            // bytes (to support look-around), it follows that we actually
+            // consumed that many additional bytes on our previous search. When
+            // the caller resumes their search to find subsequent matches, they
+            // will use the ending location from the previous match as the next
+            // starting point, which is `match_offset` bytes PRIOR to where
+            // we scanned to on the previous search. Therefore, we need to
+            // compensate by bumping `start` up by `match_offset` bytes.
+            start += dfa.match_offset();
+            // Since match_offset could be any arbitrary value and we use
+            // `start` in pointer arithmetic below, we check that we are still
+            // in bounds. Otherwise, we could materialize a pointer that is
+            // more than one past the end point of `bytes`, which is UB.
+            if start > end {
+                return Ok(None);
+            }
+            (id, None)
+        }
+    };
+    // Only a fresh search can produce a match here: `init_fwd` reports one
+    // when the start state is itself a match state.
+    if last_match.is_some() {
+        caller_state.set(state);
+        return Ok(last_match);
+    }
+
+    let mut at = start;
+    while at < end {
+        let byte = bytes[at];
+        state = dfa.next_state(state, byte);
+        at += 1;
+        if dfa.is_special_state(state) {
+            // Unlike the non-overlapping searches, stop at the very first
+            // special state and remember it for the next call.
+            caller_state.set(state);
+            if dfa.is_dead_state(state) {
+                return Ok(None);
+            } else if dfa.is_quit_state(state) {
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            } else {
+                return Ok(Some(at - dfa.match_offset()));
+            }
+        }
+    }
+    /*
+    // SAFETY: Other than the normal pointer arithmetic happening here, a
+    // unique aspect of safety for this function is the fact that the caller
+    // can provide the state that the search routine will start with. If this
+    // state were invalid, it would be possible to incorrectly index the
+    // transition table. We however prevent this from happening by guaranteeing
+    // that State is valid. Namely, callers cannot mutate a State. All they can
+    // do is create a "start" state or otherwise reuse a previously set state.
+    // Since callers can't mutate a state, it follows that a previously set
+    // state can only be retrieved by crate internal functions. Therefore, our
+    // use of it is safe since this code will only ever set the provided state
+    // to a valid state.
+    unsafe {
+        let mut p = bytes.as_ptr().add(start);
+        while p < bytes[end..].as_ptr() {
+            let byte = *p;
+            state = dfa.next_state_unchecked(state, byte);
+            p = p.add(1);
+            if dfa.is_special_state(state) {
+                caller_state.set(state);
+                return if dfa.is_dead_state(state) {
+                    Ok(None)
+                } else if dfa.is_quit_state(state) {
+                    Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 })
+                } else {
+                    Ok(Some(offset(bytes, p) - dfa.match_offset()))
+                };
+            }
+        }
+    }
+    */
+
+    // The loop ran out of input. Take the one trailing transition handled by
+    // `eof_fwd` and store the state it reached for the next call.
+    let result = eof_fwd(dfa, bytes, end, &mut state);
+    caller_state.set(state);
+    result
+}
+
+fn init_fwd<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<(A::ID, Option<usize>), MatchError> {
+    let state = dfa.start_state_forward(bytes, start, end);
+    if dfa.is_match_state(state) {
+        Ok((state, Some(start - dfa.match_offset())))
+    } else {
+        Ok((state, None))
+    }
+}
+
+fn init_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<(A::ID, Option<usize>), MatchError> {
+    let state = dfa.start_state_reverse(bytes, start, end);
+    if dfa.is_match_state(state) {
+        Ok((state, Some(end + dfa.match_offset())))
+    } else {
+        Ok((state, None))
+    }
+}
+
+fn eof_fwd<A: Automaton + ?Sized>(
+    dfa: &A,
+    bytes: &[u8],
+    end: usize,
+    state: &mut A::ID,
+) -> Result<Option<usize>, MatchError> {
+    match bytes.get(end) {
+        Some(&b) => {
+            *state = dfa.next_state(*state, b);
+            if dfa.is_match_state(*state) {
+                Ok(Some(end))
+            } else {
+                Ok(None)
+            }
+        }
+        None => {
+            *state = dfa.next_eof_state(*state);
+            if dfa.is_match_state(*state) {
+                Ok(Some(bytes.len()))
+            } else {
+                Ok(None)
+            }
+        }
+    }
+}
+
+fn eof_rev<A: Automaton + ?Sized>(
+    dfa: &A,
+    state: A::ID,
+    bytes: &[u8],
+    start: usize,
+) -> Result<Option<usize>, MatchError> {
+    if start > 0 {
+        if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) {
+            Ok(Some(start))
+        } else {
+            Ok(None)
+        }
+    } else {
+        if dfa.is_match_state(dfa.next_eof_state(state)) {
+            Ok(Some(0))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
+/// Returns the distance between the given pointer and the start of `bytes`.
+/// This assumes that the given pointer points to somewhere in the `bytes`
+/// slice given.
+fn offset(bytes: &[u8], p: *const u8) -> usize {
+    debug_assert!(bytes.as_ptr() <= p);
+    debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
+    ((p as isize) - (bytes.as_ptr() as isize)) as usize
+}
diff --git a/src/dfa/sparse.rs b/src/dfa/sparse.rs

new file mode 100644 (file)

index 0000000..3466069
--- /dev/null
+++ b/src/dfa/sparse.rs
@@ -0,0 +1,2283 @@
+/*!
+Types and routines specific to sparse DFAs.
+
+This module is the home of [`sparse::DFA`](DFA).
+
+Unlike the [`dense`](super::dense) module, this module does not contain a
+builder or configuration specific for sparse DFAs. Instead, the intended
+way to build a sparse DFA is either by using a default configuration with
+its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
+construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
+and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
+example, this configures a sparse DFA to do an overlapping search:
+
+```
+use regex_automata::{
+    dfa::{Automaton, OverlappingState, dense},
+    HalfMatch, MatchKind,
+};
+
+let dense_re = dense::Builder::new()
+    .configure(dense::Config::new().match_kind(MatchKind::All))
+    .build(r"Samwise|Sam")?;
+let sparse_re = dense_re.to_sparse()?;
+
+// Setup our haystack and initial start state.
+let haystack = b"Samwise";
+let mut state = OverlappingState::start();
+
+// First, 'Sam' will match.
+let end1 = sparse_re.find_overlapping_fwd_at(
+    None, None, haystack, 0, haystack.len(), &mut state,
+)?;
+assert_eq!(end1, Some(HalfMatch::must(0, 3)));
+
+// And now 'Samwise' will match.
+let end2 = sparse_re.find_overlapping_fwd_at(
+    None, None, haystack, 3, haystack.len(), &mut state,
+)?;
+assert_eq!(end2, Some(HalfMatch::must(0, 7)));
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+*/
+
+#[cfg(feature = "alloc")]
+use core::iter;
+use core::{
+    convert::{TryFrom, TryInto},
+    fmt,
+    mem::size_of,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::{collections::BTreeSet, vec, vec::Vec};
+
+#[cfg(feature = "alloc")]
+use crate::dfa::{dense, error::Error};
+use crate::{
+    dfa::{
+        automaton::{fmt_state_indicator, Automaton},
+        special::Special,
+        DEAD,
+    },
+    util::{
+        alphabet::ByteClasses,
+        bytes::{self, DeserializeError, Endian, SerializeError},
+        id::{PatternID, StateID},
+        start::Start,
+        DebugByte,
+    },
+};
+
+const LABEL: &str = "rust-regex-automata-dfa-sparse";
+const VERSION: u32 = 2;
+
+/// A sparse deterministic finite automaton (DFA) with variable sized states.
+///
+/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
+/// a more space efficient representation for its transitions. Consequently,
+/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
+/// price. In particular, reading the more space efficient transitions takes
+/// more work, and consequently, searching using a sparse DFA is typically
+/// slower than a dense DFA.
+///
+/// A sparse DFA can be built using the default configuration via the
+/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
+/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
+/// and then convert a dense DFA to a sparse DFA using
+/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
+///
+/// In general, a sparse DFA supports all the same search operations as a dense
+/// DFA.
+///
+/// Making the choice between a dense and sparse DFA depends on your specific
+/// work load. If you can sacrifice a bit of search time performance, then a
+/// sparse DFA might be the best choice. In particular, while sparse DFAs are
+/// probably always slower than dense DFAs, you may find that they are easily
+/// fast enough for your purposes!
+///
+/// # Type parameters
+///
+/// A `DFA` has one type parameter, `T`, which is used to represent the parts
+/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
+///
+/// # The `Automaton` trait
+///
+/// This type implements the [`Automaton`] trait, which means it can be used
+/// for searching. For example:
+///
+/// ```
+/// use regex_automata::{
+///     dfa::{Automaton, sparse::DFA},
+///     HalfMatch,
+/// };
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let expected = HalfMatch::must(0, 8);
+/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA<T> {
+    // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
+    // representation-wise. In reality, it is perhaps more complicated. Namely,
+    // in a dense DFA, all information needs to be very cheaply accessible
+    // using only state IDs. In a sparse DFA however, each state uses a
+    // variable amount of space because each state encodes more information
+    // than just its transitions. Each state also includes an accelerator if
+    // one exists, along with the matching pattern IDs if the state is a match
+    // state.
+    //
+    // That is, a lot of the complexity is pushed down into how each state
+    // itself is represented.
+    //
+    // `trans` holds the encoded states. When built by `from_dense`, it is
+    // populated with the sparse state bytes and a clone of the dense DFA's
+    // byte classes.
+    trans: Transitions<T>,
+    // NOTE(review): presumably the table of start states; `StartTable` is
+    // defined outside this view, so confirm against its definition.
+    starts: StartTable<T>,
+    // NOTE(review): presumably describes which state IDs are "special"
+    // (match, dead, quit, ...); `Special` is defined in `dfa::special`, so
+    // confirm against its definition.
+    special: Special,
+}
+
+#[cfg(feature = "alloc")]
+impl DFA<Vec<u8>> {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding sparse DFA.
+    ///
+    /// If you want a non-default configuration, then use
+    /// the [`dense::Builder`](crate::dfa::dense::Builder)
+    /// to set your own configuration, and then call
+    /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
+    /// a sparse DFA.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
+    ///
+    /// let expected = HalfMatch::must(0, 11);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> {
+        dense::Builder::new()
+            .build(pattern)
+            .and_then(|dense| dense.to_sparse())
+    }
+
+    /// Parse the given regular expressions using a default configuration and
+    /// return the corresponding multi-DFA.
+    ///
+    /// If you want a non-default configuration, then use
+    /// the [`dense::Builder`](crate::dfa::dense::Builder)
+    /// to set your own configuration, and then call
+    /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
+    /// a sparse DFA.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+    /// let expected = HalfMatch::must(1, 3);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many<P: AsRef<str>>(
+        patterns: &[P],
+    ) -> Result<DFA<Vec<u8>>, Error> {
+        dense::Builder::new()
+            .build_many(patterns)
+            .and_then(|dense| dense.to_sparse())
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl DFA<Vec<u8>> {
+    /// Create a new DFA that matches every input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let dfa = sparse::DFA::always_match()?;
+    ///
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn always_match() -> Result<DFA<Vec<u8>>, Error> {
+        dense::DFA::always_match()?.to_sparse()
+    }
+
+    /// Create a new sparse DFA that never matches any input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::dfa::{Automaton, sparse};
+    ///
+    /// let dfa = sparse::DFA::never_match()?;
+    /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
+    /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn never_match() -> Result<DFA<Vec<u8>>, Error> {
+        dense::DFA::never_match()?.to_sparse()
+    }
+
+    /// The implementation for constructing a sparse DFA from a dense DFA.
+    pub(crate) fn from_dense<T: AsRef<[u32]>>(
+        dfa: &dense::DFA<T>,
+    ) -> Result<DFA<Vec<u8>>, Error> {
+        // In order to build the transition table, we need to be able to write
+        // state identifiers for each of the "next" transitions in each state.
+        // Our state identifiers correspond to the byte offset in the
+        // transition table at which the state is encoded. Therefore, we do not
+        // actually know what the state identifiers are until we've allocated
+        // exactly as much space as we need for each state. Thus, construction
+        // of the transition table happens in two passes.
+        //
+        // In the first pass, we fill out the shell of each state, which
+        // includes the transition count, the input byte ranges and zero-filled
+        // space for the transitions and accelerators, if present. In this
+        // first pass, we also build up a map from the state identifier index
+        // of the dense DFA to the state identifier in this sparse DFA.
+        //
+        // In the second pass, we fill in the transitions based on the map
+        // built in the first pass.
+
+        // The capacity given here reflects a minimum. (Well, the true minimum
+        // is likely even bigger, but hopefully this saves a few reallocs.)
+        let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count());
+        // This maps state indices from the dense DFA to StateIDs in the sparse
+        // DFA. We build out this map on the first pass, and then use it in the
+        // second pass to back-fill our transitions.
+        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()];
+        for state in dfa.states() {
+            let pos = sparse.len();
+
+            remap[dfa.to_index(state.id())] =
+                StateID::new(pos).map_err(|_| Error::too_many_states())?;
+            // zero-filled space for the transition count
+            sparse.push(0);
+            sparse.push(0);
+
+            let mut transition_count = 0;
+            for (unit1, unit2, _) in state.sparse_transitions() {
+                match (unit1.as_u8(), unit2.as_u8()) {
+                    (Some(b1), Some(b2)) => {
+                        transition_count += 1;
+                        sparse.push(b1);
+                        sparse.push(b2);
+                    }
+                    (None, None) => {}
+                    (Some(_), None) | (None, Some(_)) => {
+                        // can never occur because sparse_transitions never
+                        // groups EOI with any other transition.
+                        unreachable!()
+                    }
+                }
+            }
+            // Add dummy EOI transition. This is never actually read while
+            // searching, but having space equivalent to the total number
+            // of transitions is convenient. Otherwise, we'd need to track
+            // a different number of transitions for the byte ranges as for
+            // the 'next' states.
+            //
+            // N.B. The loop above is not guaranteed to yield the EOI
+            // transition, since it may point to a DEAD state. By putting
+            // it here, we always write the EOI transition, and thus
+            // guarantee that our transition count is >0. Why do we always
+            // need the EOI transition? Because in order to implement
+            // Automaton::next_eoi_state, this lets us just ask for the last
+            // transition. There are probably other/better ways to do this.
+            transition_count += 1;
+            sparse.push(0);
+            sparse.push(0);
+
+            // Check some assumptions about transition count.
+            assert_ne!(
+                transition_count, 0,
+                "transition count should be non-zero",
+            );
+            assert!(
+                transition_count <= 257,
+                "expected transition count {} to be <= 257",
+                transition_count,
+            );
+
+            // Fill in the transition count.
+            // Since transition count is always <= 257, we use the most
+            // significant bit to indicate whether this is a match state or
+            // not.
+            let ntrans = if dfa.is_match_state(state.id()) {
+                transition_count | (1 << 15)
+            } else {
+                transition_count
+            };
+            bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
+
+            // zero-fill the actual transitions.
+            // Unwraps are OK since transition_count <= 257 and our minimum
+            // support usize size is 16-bits.
+            let zeros = usize::try_from(transition_count)
+                .unwrap()
+                .checked_mul(StateID::SIZE)
+                .unwrap();
+            sparse.extend(iter::repeat(0).take(zeros));
+
+            // If this is a match state, write the pattern IDs matched by this
+            // state.
+            if dfa.is_match_state(state.id()) {
+                let plen = dfa.match_pattern_len(state.id());
+                // Write the actual pattern IDs with a u32 length prefix.
+                // First, zero-fill space.
+                let mut pos = sparse.len();
+                // Unwraps are OK since it's guaranteed that plen <=
+                // PatternID::LIMIT, which is in turn guaranteed to fit into a
+                // u32.
+                let zeros = size_of::<u32>()
+                    .checked_mul(plen)
+                    .unwrap()
+                    .checked_add(size_of::<u32>())
+                    .unwrap();
+                sparse.extend(iter::repeat(0).take(zeros));
+
+                // Now write the length prefix.
+                bytes::NE::write_u32(
+                    // Will never fail since u32::MAX is invalid pattern ID.
+                    // Thus, the number of pattern IDs is representable by a
+                    // u32.
+                    plen.try_into().expect("pattern ID count fits in u32"),
+                    &mut sparse[pos..],
+                );
+                pos += size_of::<u32>();
+
+                // Now write the pattern IDs.
+                for &pid in dfa.pattern_id_slice(state.id()) {
+                    pos += bytes::write_pattern_id::<bytes::NE>(
+                        pid,
+                        &mut sparse[pos..],
+                    );
+                }
+            }
+
+            // And now add the accelerator, if one exists. An accelerator is
+            // at most 4 bytes and at least 1 byte. The first byte is the
+            // length, N. N bytes follow the length. The set of bytes that
+            // follow correspond (exhaustively) to the bytes that must be seen
+            // to leave this state.
+            let accel = dfa.accelerator(state.id());
+            sparse.push(accel.len().try_into().unwrap());
+            sparse.extend_from_slice(accel);
+        }
+
+        let mut new = DFA {
+            trans: Transitions {
+                sparse,
+                classes: dfa.byte_classes().clone(),
+                count: dfa.state_count(),
+                patterns: dfa.pattern_count(),
+            },
+            starts: StartTable::from_dense_dfa(dfa, &remap)?,
+            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
+        };
+        // And here's our second pass. Iterate over all of the dense states
+        // again, and update the transitions in each of the states in the
+        // sparse DFA.
+        for old_state in dfa.states() {
+            let new_id = remap[dfa.to_index(old_state.id())];
+            let mut new_state = new.trans.state_mut(new_id);
+            let sparse = old_state.sparse_transitions();
+            for (i, (_, _, next)) in sparse.enumerate() {
+                let next = remap[dfa.to_index(next)];
+                new_state.set_next_at(i, next);
+            }
+        }
+        trace!(
+            "created sparse DFA, memory usage: {} (dense memory usage: {})",
+            new.memory_usage(),
+            dfa.memory_usage(),
+        );
+        Ok(new)
+    }
+}
+
+impl<T: AsRef<[u8]>> DFA<T> {
+    /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
+    /// DFA returned always uses `&[u8]` for its transitions.
+    pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
+        DFA {
+            trans: self.trans.as_ref(),
+            starts: self.starts.as_ref(),
+            special: self.special,
+        }
+    }
+
+    /// Return an owned version of this sparse DFA. Specifically, the DFA
+    /// returned always uses `Vec<u8>` for its transitions.
+    ///
+    /// Effectively, this returns a sparse DFA whose transitions live on the
+    /// heap.
+    #[cfg(feature = "alloc")]
+    pub fn to_owned(&self) -> DFA<Vec<u8>> {
+        DFA {
+            trans: self.trans.to_owned(),
+            starts: self.starts.to_owned(),
+            special: self.special,
+        }
+    }
+
+    /// Returns the memory usage, in bytes, of this DFA.
+    ///
+    /// The memory usage is computed based on the number of bytes used to
+    /// represent this DFA.
+    ///
+    /// This does **not** include the stack size used up by this DFA. To
+    /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
+    pub fn memory_usage(&self) -> usize {
+        self.trans.memory_usage() + self.starts.memory_usage()
+    }
+
+    /// Returns true only if this DFA has starting states for each pattern.
+    ///
+    /// When a DFA has starting states for each pattern, then a search with the
+    /// DFA can be configured to only look for anchored matches of a specific
+    /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
+    /// can accept a non-None `pattern_id` if and only if this method returns
+    /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
+    ///
+    /// Note that if the DFA is empty, this always returns false.
+    pub fn has_starts_for_each_pattern(&self) -> bool {
+        self.starts.patterns > 0
+    }
+}
+
+/// Routines for converting a sparse DFA to other representations, such as raw
+/// bytes suitable for persistent storage.
+impl<T: AsRef<[u8]>> DFA<T> {
+    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+    /// format.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+    /// serialization methods, this does not add any initial padding to the
+    /// returned bytes. Padding isn't required for sparse DFAs since they have
+    /// no alignment requirements.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using to_bytes_little_endian would work on a little endian target.
+    /// let buf = original_dfa.to_bytes_native_endian();
+    /// // Sparse DFAs have no alignment requirements, so buf has no initial
+    /// // padding and can be passed to DFA::from_bytes directly.
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_bytes_little_endian(&self) -> Vec<u8> {
+        self.to_bytes::<bytes::LE>()
+    }
+
+    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+    /// format.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+    /// serialization methods, this does not add any initial padding to the
+    /// returned bytes. Padding isn't required for sparse DFAs since they have
+    /// no alignment requirements.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using to_bytes_big_endian would work on a big endian target.
+    /// let buf = original_dfa.to_bytes_native_endian();
+    /// // Sparse DFAs have no alignment requirements, so buf has no initial
+    /// // padding and can be passed to DFA::from_bytes directly.
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_bytes_big_endian(&self) -> Vec<u8> {
+        self.to_bytes::<bytes::BE>()
+    }
+
+    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+    /// format.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+    /// serialization methods, this does not add any initial padding to the
+    /// returned bytes. Padding isn't required for sparse DFAs since they have
+    /// no alignment requirements.
+    ///
+    /// Generally speaking, native endian format should only be used when
+    /// you know that the target you're compiling the DFA for matches the
+    /// endianness of the target on which you're compiling DFA. For example,
+    /// if serialization and deserialization happen in the same process or on
+    /// the same machine. Otherwise, when serializing a DFA for use in a
+    /// portable environment, you'll almost certainly want to serialize _both_
+    /// a little endian and a big endian version and then load the correct one
+    /// based on the target's configuration.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// let buf = original_dfa.to_bytes_native_endian();
+    /// // Sparse DFAs have no alignment requirements, so buf has no initial
+    /// // padding and can be passed to DFA::from_bytes directly.
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "alloc")]
+    pub fn to_bytes_native_endian(&self) -> Vec<u8> {
+        self.to_bytes::<bytes::NE>()
+    }
+
+    /// The implementation of the public `to_bytes` serialization methods,
+    /// which is generic over endianness.
+    #[cfg(feature = "alloc")]
+    fn to_bytes<E: Endian>(&self) -> Vec<u8> {
+        let mut buf = vec![0; self.write_to_len()];
+        // This should always succeed since the only possible serialization
+        // error is providing a buffer that's too small, but we've ensured that
+        // `buf` is big enough here.
+        self.write_to::<E>(&mut buf).unwrap();
+        buf
+    }
+
+    /// Serialize this DFA as raw bytes to the given slice, in little endian
+    /// format. Upon success, the total number of bytes written to `dst` is
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// # Errors
+    ///
+    /// This returns an error if the given destination slice is not big enough
+    /// to contain the full serialized DFA. If an error occurs, then nothing
+    /// is written to `dst`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA without
+    /// dynamic memory allocation.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Create a 4KB buffer on the stack to store our serialized DFA.
+    /// let mut buf = [0u8; 4 * (1<<10)];
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using write_to_little_endian would work on a little endian target.
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_little_endian(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        self.write_to::<bytes::LE>(dst)
+    }
+
+    /// Serialize this DFA as raw bytes to the given slice, in big endian
+    /// format. Upon success, the total number of bytes written to `dst` is
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// # Errors
+    ///
+    /// This returns an error if the given destination slice is not big enough
+    /// to contain the full serialized DFA. If an error occurs, then nothing
+    /// is written to `dst`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA without
+    /// dynamic memory allocation.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Create a 4KB buffer on the stack to store our serialized DFA.
+    /// let mut buf = [0u8; 4 * (1<<10)];
+    /// // N.B. We use native endianness here to make the example work, but
+    /// // using write_to_big_endian would work on a big endian target.
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_big_endian(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        self.write_to::<bytes::BE>(dst)
+    }
+
+    /// Serialize this DFA as raw bytes to the given slice, in native endian
+    /// format. Upon success, the total number of bytes written to `dst` is
+    /// returned.
+    ///
+    /// The written bytes are guaranteed to be deserialized correctly and
+    /// without errors in a semver compatible release of this crate by a
+    /// `DFA`'s deserialization APIs (assuming all other criteria for the
+    /// deserialization APIs has been satisfied):
+    ///
+    /// * [`DFA::from_bytes`]
+    /// * [`DFA::from_bytes_unchecked`]
+    ///
+    /// Generally speaking, native endian format should only be used when
+    /// you know that the target you're compiling the DFA for matches the
+    /// endianness of the target on which you're compiling DFA. For example,
+    /// if serialization and deserialization happen in the same process or on
+    /// the same machine. Otherwise, when serializing a DFA for use in a
+    /// portable environment, you'll almost certainly want to serialize _both_
+    /// a little endian and a big endian version and then load the correct one
+    /// based on the target's configuration.
+    ///
+    /// # Errors
+    ///
+    /// This returns an error if the given destination slice is not big enough
+    /// to contain the full serialized DFA. If an error occurs, then nothing
+    /// is written to `dst`.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize and deserialize a DFA without
+    /// dynamic memory allocation.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Create a 4KB buffer on the stack to store our serialized DFA.
+    /// let mut buf = [0u8; 4 * (1<<10)];
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_native_endian(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        self.write_to::<bytes::NE>(dst)
+    }
+
+    /// The implementation of the public `write_to` serialization methods,
+    /// which is generic over endianness.
+    fn write_to<E: Endian>(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let mut nw = 0;
+        nw += bytes::write_label(LABEL, &mut dst[nw..])?;
+        nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
+        nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
+        nw += {
+            // Currently unused, intended for future flexibility
+            E::write_u32(0, &mut dst[nw..]);
+            size_of::<u32>()
+        };
+        nw += self.trans.write_to::<E>(&mut dst[nw..])?;
+        nw += self.starts.write_to::<E>(&mut dst[nw..])?;
+        nw += self.special.write_to::<E>(&mut dst[nw..])?;
+        Ok(nw)
+    }
+
+    /// Return the total number of bytes required to serialize this DFA.
+    ///
+    /// This is useful for determining the size of the buffer required to pass
+    /// to one of the serialization routines:
+    ///
+    /// * [`DFA::write_to_little_endian`]
+    /// * [`DFA::write_to_big_endian`]
+    /// * [`DFA::write_to_native_endian`]
+    ///
+    /// Passing a buffer smaller than the size returned by this method will
+    /// result in a serialization error.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to dynamically allocate enough room to serialize
+    /// a sparse DFA.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// // Compile our original DFA.
+    /// let original_dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// let mut buf = vec![0; original_dfa.write_to_len()];
+    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn write_to_len(&self) -> usize {
+        bytes::write_label_len(LABEL)
+        + bytes::write_endianness_check_len()
+        + bytes::write_version_len()
+        + size_of::<u32>() // unused, intended for future flexibility
+        + self.trans.write_to_len()
+        + self.starts.write_to_len()
+        + self.special.write_to_len()
+    }
+}
+
+impl<'a> DFA<&'a [u8]> {
+    /// Safely deserialize a sparse DFA with a specific state identifier
+    /// representation. Upon success, this returns both the deserialized DFA
+    /// and the number of bytes read from the given slice. Namely, the contents
+    /// of the slice beyond the DFA are not read.
+    ///
+    /// Deserializing a DFA using this routine will never allocate heap memory.
+    /// For safety purposes, the DFA's transitions will be verified such that
+    /// every transition points to a valid state. If this verification is too
+    /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
+    /// will always execute in constant time.
+    ///
+    /// The bytes given must be generated by one of the serialization APIs
+    /// of a `DFA` using a semver compatible release of this crate. Those
+    /// include:
+    ///
+    /// * [`DFA::to_bytes_little_endian`]
+    /// * [`DFA::to_bytes_big_endian`]
+    /// * [`DFA::to_bytes_native_endian`]
+    /// * [`DFA::write_to_little_endian`]
+    /// * [`DFA::write_to_big_endian`]
+    /// * [`DFA::write_to_native_endian`]
+    ///
+    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
+    /// `write_to` methods do not allocate and write to an existing slice
+    /// (which may be on the stack). Since deserialization always uses the
+    /// native endianness of the target platform, the serialization API you use
+    /// should match the endianness of the target platform. (It's often a good
+    /// idea to generate serialized DFAs for both forms of endianness and then
+    /// load the correct one based on endianness.)
+    ///
+    /// # Errors
+    ///
+    /// Generally speaking, it's easier to state the conditions in which an
+    /// error is _not_ returned. All of the following must be true:
+    ///
+    /// * The bytes given must be produced by one of the serialization APIs
+    ///   on this DFA, as mentioned above.
+    /// * The endianness of the target platform matches the endianness used to
+    ///   serialize the provided DFA.
+    ///
+    /// If any of the above are not true, then an error will be returned.
+    ///
+    /// Note that unlike deserializing a
+    /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
+    /// no alignment requirements. That is, an alignment of `1` is valid.
+    ///
+    /// # Panics
+    ///
+    /// This routine will never panic for any input.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to serialize a DFA to raw bytes, deserialize it
+    /// and then use it for searching.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let initial = DFA::new("foo[0-9]+")?;
+    /// let bytes = initial.to_bytes_native_endian();
+    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: loading a DFA from static memory
+    ///
+    /// One use case this library supports is the ability to serialize a
+    /// DFA to disk and then use `include_bytes!` to store it in a compiled
+    /// Rust program. Those bytes can then be cheaply deserialized into a
+    /// `DFA` structure at runtime and used for searching without having to
+    /// re-compile the DFA (which can be quite costly).
+    ///
+    /// We can show this in two parts. The first part is serializing the DFA to
+    /// a file:
+    ///
+    /// ```no_run
+    /// use regex_automata::dfa::{Automaton, sparse::DFA};
+    ///
+    /// let dfa = DFA::new("foo[0-9]+")?;
+    ///
+    /// // Write a big endian serialized version of this DFA to a file.
+    /// let bytes = dfa.to_bytes_big_endian();
+    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
+    ///
+    /// // Do it again, but this time for little endian.
+    /// let bytes = dfa.to_bytes_little_endian();
+    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And now the second part is embedding the DFA into the compiled program
+    /// and deserializing it at runtime on first use. We use conditional
+    /// compilation to choose the correct endianness. As mentioned above, we
+    /// do not need to employ any special tricks to ensure a proper alignment,
+    /// since a sparse DFA has no alignment requirements.
+    ///
+    /// ```no_run
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// type DFA = sparse::DFA<&'static [u8]>;
+    ///
+    /// fn get_foo() -> &'static DFA {
+    ///     use std::cell::Cell;
+    ///     use std::mem::MaybeUninit;
+    ///     use std::sync::Once;
+    ///
+    ///     # const _: &str = stringify! {
+    ///     #[cfg(target_endian = "big")]
+    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
+    ///     #[cfg(target_endian = "little")]
+    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
+    ///     # };
+    ///     # static BYTES: &[u8] = b"";
+    ///
+    ///     struct Lazy(Cell<MaybeUninit<DFA>>);
+    ///     // SAFETY: This is safe because DFA impls Sync.
+    ///     unsafe impl Sync for Lazy {}
+    ///
+    ///     static INIT: Once = Once::new();
+    ///     static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
+    ///
+    ///     INIT.call_once(|| {
+    ///         let (dfa, _) = DFA::from_bytes(BYTES)
+    ///             .expect("serialized DFA should be valid");
+    ///         // SAFETY: This is guaranteed to only execute once, and all
+    ///         // we do with the pointer is write the DFA to it.
+    ///         unsafe {
+    ///             (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
+    ///         }
+    ///     });
+    ///     // SAFETY: DFA is guaranteed to be initialized via INIT and is
+    ///     // stored in static memory.
+    ///     unsafe {
+    ///         let dfa = (*DFA.0.as_ptr()).as_ptr();
+    ///         std::mem::transmute::<*const DFA, &'static DFA>(dfa)
+    ///     }
+    /// }
+    ///
+    /// let dfa = get_foo();
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
+    /// ```
+    ///
+    /// Alternatively, consider using
+    /// [`lazy_static`](https://crates.io/crates/lazy_static)
+    /// or
+    /// [`once_cell`](https://crates.io/crates/once_cell),
+    /// which will guarantee safety for you.
+    pub fn from_bytes(
+        slice: &'a [u8],
+    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+        // SAFETY: This is safe because we validate both the sparse transitions
+        // (by trying to decode every state) and start state ID list below. If
+        // either validation fails, then we return an error.
+        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
+        dfa.trans.validate()?;
+        dfa.starts.validate(&dfa.trans)?;
+        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
+        // so it has already been validated.
+        Ok((dfa, nread))
+    }
+
+    /// Deserialize a DFA with a specific state identifier representation in
+    /// constant time by omitting the verification of the validity of the
+    /// sparse transitions.
+    ///
+    /// This is just like [`DFA::from_bytes`], except it can potentially return
+    /// a DFA that exhibits undefined behavior if its transitions contains
+    /// invalid state identifiers.
+    ///
+    /// This routine is useful if you need to deserialize a DFA cheaply and
+    /// cannot afford the transition validation performed by `from_bytes`.
+    ///
+    /// # Safety
+    ///
+    /// This routine is unsafe because it permits callers to provide
+    /// arbitrary transitions with possibly incorrect state identifiers. While
+    /// the various serialization routines will never return an incorrect
+    /// DFA, there is no guarantee that the bytes provided here
+    /// are correct. While `from_bytes_unchecked` will still do several forms
+    /// of basic validation, this routine does not check that the transitions
+    /// themselves are correct. Given an incorrect transition table, it is
+    /// possible for the search routines to access out-of-bounds memory because
+    /// of explicit bounds check elision.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     dfa::{Automaton, sparse::DFA},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// let initial = DFA::new("foo[0-9]+")?;
+    /// let bytes = initial.to_bytes_native_endian();
+    /// // SAFETY: This is guaranteed to be safe since the bytes given come
+    /// // directly from a compatible serialization routine.
+    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+    ///
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub unsafe fn from_bytes_unchecked(
+        slice: &'a [u8],
+    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+        let mut nr = 0;
+
+        nr += bytes::read_label(&slice[nr..], LABEL)?;
+        nr += bytes::read_endianness_check(&slice[nr..])?;
+        nr += bytes::read_version(&slice[nr..], VERSION)?;
+
+        let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
+        nr += size_of::<u32>();
+
+        let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
+        nr += nread;
+
+        let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
+        nr += nread;
+
+        let (special, nread) = Special::from_bytes(&slice[nr..])?;
+        nr += nread;
+        if special.max.as_usize() >= trans.sparse().len() {
+            return Err(DeserializeError::generic(
+                "max should not be greater than or equal to sparse bytes",
+            ));
+        }
+
+        Ok((DFA { trans, starts, special }, nr))
+    }
+}
+
+impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "sparse::DFA(")?;
+        for state in self.trans.states() {
+            fmt_state_indicator(f, self, state.id())?;
+            writeln!(f, "{:06?}: {:?}", state.id(), state)?;
+        }
+        writeln!(f, "")?;
+        for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() {
+            if i % self.starts.stride == 0 {
+                match pid {
+                    None => writeln!(f, "START-GROUP(ALL)")?,
+                    Some(pid) => {
+                        writeln!(f, "START_GROUP(pattern: {:?})", pid)?
+                    }
+                }
+            }
+            writeln!(f, "  {:?} => {:06?}", sty, start_id.as_usize())?;
+        }
+        writeln!(f, "state count: {:?}", self.trans.count)?;
+        writeln!(f, ")")?;
+        Ok(())
+    }
+}
+
+// Implementation of the `Automaton` trait for sparse DFAs. The state
+// classification predicates delegate to `self.special`, transitions are
+// answered by decoding states out of `self.trans`, and start states come
+// from `self.starts`.
+unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
+    /// Delegates to `self.special.is_special_state`.
+    #[inline]
+    fn is_special_state(&self, id: StateID) -> bool {
+        self.special.is_special_state(id)
+    }
+
+    /// Delegates to `self.special.is_dead_state`.
+    #[inline]
+    fn is_dead_state(&self, id: StateID) -> bool {
+        self.special.is_dead_state(id)
+    }
+
+    /// Delegates to `self.special.is_quit_state`.
+    #[inline]
+    fn is_quit_state(&self, id: StateID) -> bool {
+        self.special.is_quit_state(id)
+    }
+
+    /// Delegates to `self.special.is_match_state`.
+    #[inline]
+    fn is_match_state(&self, id: StateID) -> bool {
+        self.special.is_match_state(id)
+    }
+
+    /// Delegates to `self.special.is_start_state`.
+    #[inline]
+    fn is_start_state(&self, id: StateID) -> bool {
+        self.special.is_start_state(id)
+    }
+
+    /// Delegates to `self.special.is_accel_state`.
+    #[inline]
+    fn is_accel_state(&self, id: StateID) -> bool {
+        self.special.is_accel_state(id)
+    }
+
+    // This is marked as inline to help dramatically boost sparse searching,
+    // which decodes each state it enters to follow the next transition.
+    //
+    // The input byte is first mapped through the byte class map, and the
+    // result is then looked up in the decoded `current` state.
+    #[inline(always)]
+    fn next_state(&self, current: StateID, input: u8) -> StateID {
+        let input = self.trans.classes.get(input);
+        self.trans.state(current).next(input)
+    }
+
+    /// Identical to `next_state`: this implementation simply forwards to the
+    /// checked version.
+    #[inline]
+    unsafe fn next_state_unchecked(
+        &self,
+        current: StateID,
+        input: u8,
+    ) -> StateID {
+        self.next_state(current, input)
+    }
+
+    /// Decodes `current` and follows its end-of-input transition.
+    #[inline]
+    fn next_eoi_state(&self, current: StateID) -> StateID {
+        self.trans.state(current).next_eoi()
+    }
+
+    /// Returns the pattern count recorded in the transition table.
+    #[inline]
+    fn pattern_count(&self) -> usize {
+        self.trans.patterns
+    }
+
+    /// Decodes state `id` and returns how many pattern IDs it carries.
+    #[inline]
+    fn match_count(&self, id: StateID) -> usize {
+        self.trans.state(id).pattern_count()
+    }
+
+    /// Returns the pattern ID at position `match_index` of state `id`.
+    #[inline]
+    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
+        // This is an optimization for the very common case of a DFA with a
+        // single pattern. This conditional avoids a somewhat more costly path
+        // that finds the pattern ID from the state machine, which requires
+        // a bit of slicing/pointer-chasing. This optimization tends to only
+        // matter when matches are frequent.
+        if self.trans.patterns == 1 {
+            return PatternID::ZERO;
+        }
+        self.trans.state(id).pattern_id(match_index)
+    }
+
+    /// Computes the start kind for a forward search via
+    /// `Start::from_position_fwd` and looks up the corresponding start state
+    /// (for the whole DFA when `pattern_id` is `None`).
+    #[inline]
+    fn start_state_forward(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID {
+        let index = Start::from_position_fwd(bytes, start, end);
+        self.starts.start(index, pattern_id)
+    }
+
+    /// Computes the start kind for a reverse search via
+    /// `Start::from_position_rev` and looks up the corresponding start state
+    /// (for the whole DFA when `pattern_id` is `None`).
+    #[inline]
+    fn start_state_reverse(
+        &self,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> StateID {
+        let index = Start::from_position_rev(bytes, start, end);
+        self.starts.start(index, pattern_id)
+    }
+
+    /// Decodes state `id` and returns its accelerator bytes.
+    #[inline]
+    fn accelerator(&self, id: StateID) -> &[u8] {
+        self.trans.state(id).accelerator()
+    }
+}
+
+/// The transition table portion of a sparse DFA.
+///
+/// The transition table is the core part of the DFA in that it describes how
+/// to move from one state to another based on the input sequence observed.
+///
+/// Unlike a typical dense table based DFA, states in a sparse transition
+/// table have variable size. That is, states with more transitions use more
+/// space than states with fewer transitions. This means that finding the next
+/// transition takes more work than with a dense DFA, but also typically uses
+/// much less space.
+#[derive(Clone)]
+struct Transitions<T> {
+    /// The raw encoding of each state in this DFA.
+    ///
+    /// Each state has the following information:
+    ///
+    /// * A set of transitions to subsequent states. Transitions to the dead
+    ///   state are omitted.
+    /// * If the state can be accelerated, then any additional accelerator
+    ///   information.
+    /// * If the state is a match state, then the state contains all pattern
+    ///   IDs that match when in that state.
+    ///
+    /// To decode a state, use `Transitions::state`.
+    ///
+    /// In practice, `T` is either `Vec<u8>` or `&[u8]`.
+    sparse: T,
+    /// A set of equivalence classes, where a single equivalence class
+    /// represents a set of bytes that never discriminate between a match
+    /// and a non-match in the DFA. Each equivalence class corresponds to a
+    /// single character in this DFA's alphabet, where the maximum number of
+    /// characters is 257 (each possible value of a byte plus the special
+    /// EOI transition). Consequently, the number of equivalence classes
+    /// corresponds to the number of transitions for each DFA state. Note
+    /// though that the *space* used by each DFA state in the transition table
+    /// differs from state to state, since sparse states are variable sized
+    /// (see the documentation on this type).
+    ///
+    /// The only time the number of equivalence classes is fewer than 257 is
+    /// if the DFA's kind uses byte classes which is the default. Equivalence
+    /// classes should generally only be disabled when debugging, so that
+    /// the transitions themselves aren't obscured. Disabling them has no
+    /// other benefit, since the equivalence class map is always used while
+    /// searching. In the vast majority of cases, the number of equivalence
+    /// classes is substantially smaller than 257, particularly when large
+    /// Unicode classes aren't used.
+    ///
+    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
+    /// in the current implementation, since equivalence classes generally tend
+    /// to correspond to continuous ranges of bytes that map to the same
+    /// transition. So in a sparse DFA, equivalence classes don't really lead
+    /// to a space savings. In the future, it would be good to try and remove
+    /// them from sparse DFAs entirely, but requires a bit of work since sparse
+    /// DFAs are built from dense DFAs, which are in turn built on top of
+    /// equivalence classes.
+    classes: ByteClasses,
+    /// The total number of states in this DFA. Note that a DFA always has at
+    /// least one state---the dead state---even the empty DFA. In particular,
+    /// the dead state always has ID 0 and is correspondingly always the first
+    /// state. The dead state is never a match state.
+    count: usize,
+    /// The total number of unique patterns represented by these match states.
+    patterns: usize,
+}
+
+impl<'a> Transitions<&'a [u8]> {
+    unsafe fn from_bytes_unchecked(
+        mut slice: &'a [u8],
+    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
+        let slice_start = slice.as_ptr() as usize;
+
+        let (state_count, nr) =
+            bytes::try_read_u32_as_usize(&slice, "state count")?;
+        slice = &slice[nr..];
+
+        let (pattern_count, nr) =
+            bytes::try_read_u32_as_usize(&slice, "pattern count")?;
+        slice = &slice[nr..];
+
+        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
+        slice = &slice[nr..];
+
+        let (len, nr) =
+            bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
+        slice = &slice[nr..];
+
+        bytes::check_slice_len(slice, len, "sparse states byte length")?;
+        let sparse = &slice[..len];
+        slice = &slice[len..];
+
+        let trans = Transitions {
+            sparse,
+            classes,
+            count: state_count,
+            patterns: pattern_count,
+        };
+        Ok((trans, slice.as_ptr() as usize - slice_start))
+    }
+}
+
+impl<T: AsRef<[u8]>> Transitions<T> {
+    /// Writes a serialized form of this transition table to the buffer given.
+    /// If the buffer is too small, then an error is returned. To determine
+    /// how big the buffer must be, use `write_to_len`.
+    fn write_to<E: Endian>(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small(
+                "sparse transition table",
+            ));
+        }
+        dst = &mut dst[..nwrite];
+
+        // write state count
+        E::write_u32(u32::try_from(self.count).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write pattern count
+        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write byte class map
+        let n = self.classes.write_to(dst)?;
+        dst = &mut dst[n..];
+
+        // write number of bytes in sparse transitions
+        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+
+        // write actual transitions
+        dst.copy_from_slice(self.sparse());
+        Ok(nwrite)
+    }
+
+    /// Returns the number of bytes the serialized form of this transition
+    /// table will use.
+    fn write_to_len(&self) -> usize {
+        size_of::<u32>()   // state count
+        + size_of::<u32>() // pattern count
+        + self.classes.write_to_len()
+        + size_of::<u32>() // sparse transitions length
+        + self.sparse().len()
+    }
+
+    /// Validates that every state ID in this transition table is valid.
+    ///
+    /// That is, every state ID can be used to correctly index a state in this
+    /// table.
+    fn validate(&self) -> Result<(), DeserializeError> {
+        // In order to validate everything, we not only need to make sure we
+        // can decode every state, but that every transition in every state
+        // points to a valid state. There are many duplicative transitions, so
+        // we record state IDs that we've verified so that we don't redo the
+        // decoding work.
+        //
+        // Except, when in no_std mode, we don't have dynamic memory allocation
+        // available to us, so we skip this optimization. It's not clear
+        // whether doing something more clever is worth it just yet. If you're
+        // profiling this code and need it to run faster, please file an issue.
+        //
+        // ---AG
+        struct Seen {
+            #[cfg(feature = "alloc")]
+            set: BTreeSet<StateID>,
+            #[cfg(not(feature = "alloc"))]
+            set: core::marker::PhantomData<StateID>,
+        }
+
+        #[cfg(feature = "alloc")]
+        impl Seen {
+            fn new() -> Seen {
+                Seen { set: BTreeSet::new() }
+            }
+            fn insert(&mut self, id: StateID) {
+                self.set.insert(id);
+            }
+            fn contains(&self, id: &StateID) -> bool {
+                self.set.contains(id)
+            }
+        }
+
+        #[cfg(not(feature = "alloc"))]
+        impl Seen {
+            fn new() -> Seen {
+                Seen { set: core::marker::PhantomData }
+            }
+            fn insert(&mut self, _id: StateID) {}
+            fn contains(&self, _id: &StateID) -> bool {
+                false
+            }
+        }
+
+        let mut verified: Seen = Seen::new();
+        // We need to make sure that we decode the correct number of states.
+        // Otherwise, an empty set of transitions would validate even if the
+        // recorded state count is non-empty.
+        let mut count = 0;
+        // We can't use the self.states() iterator because it assumes the state
+        // encodings are valid. It could panic if they aren't.
+        let mut id = DEAD;
+        while id.as_usize() < self.sparse().len() {
+            let state = self.try_state(id)?;
+            verified.insert(id);
+            // The next ID should be the offset immediately following `state`.
+            id = StateID::new(bytes::add(
+                id.as_usize(),
+                state.bytes_len(),
+                "next state ID offset",
+            )?)
+            .map_err(|err| {
+                DeserializeError::state_id_error(err, "next state ID offset")
+            })?;
+            count += 1;
+
+            // Now check that all transitions in this state are correct.
+            for i in 0..state.ntrans {
+                let to = state.next_at(i);
+                if verified.contains(&to) {
+                    continue;
+                }
+                let _ = self.try_state(to)?;
+                verified.insert(id);
+            }
+        }
+        if count != self.count {
+            return Err(DeserializeError::generic(
+                "mismatching sparse state count",
+            ));
+        }
+        Ok(())
+    }
+
+    /// Converts these transitions to a borrowed value.
+    fn as_ref(&self) -> Transitions<&'_ [u8]> {
+        Transitions {
+            sparse: self.sparse(),
+            classes: self.classes.clone(),
+            count: self.count,
+            patterns: self.patterns,
+        }
+    }
+
+    /// Converts these transitions to an owned value.
+    #[cfg(feature = "alloc")]
+    fn to_owned(&self) -> Transitions<Vec<u8>> {
+        Transitions {
+            sparse: self.sparse().to_vec(),
+            classes: self.classes.clone(),
+            count: self.count,
+            patterns: self.patterns,
+        }
+    }
+
+    /// Return a convenient representation of the given state.
+    ///
+    /// This panics if the state is invalid.
+    ///
+    /// This is marked as inline to help dramatically boost sparse searching,
+    /// which decodes each state it enters to follow the next transition. Other
+    /// functions involved are also inlined, which should hopefully eliminate
+    /// a lot of the extraneous decoding that is never needed just to follow
+    /// the next transition.
+    #[inline(always)]
+    fn state(&self, id: StateID) -> State<'_> {
+        // A state ID is used directly as the byte offset of its encoding.
+        let mut state = &self.sparse()[id.as_usize()..];
+        // The leading u16 holds the transition count in its low 15 bits. The
+        // high bit flags the state as a match state.
+        let mut ntrans = bytes::read_u16(&state) as usize;
+        let is_match = (1 << 15) & ntrans != 0;
+        ntrans &= !(1 << 15);
+        state = &state[2..];
+
+        // Two bytes (an inclusive range) per transition, followed by one
+        // state ID per transition.
+        let (input_ranges, state) = state.split_at(ntrans * 2);
+        let (next, state) = state.split_at(ntrans * StateID::SIZE);
+        // Match states carry a u32 count followed by that many 4-byte
+        // pattern IDs.
+        let (pattern_ids, state) = if is_match {
+            let npats = bytes::read_u32(&state) as usize;
+            state[4..].split_at(npats * 4)
+        } else {
+            (&[][..], state)
+        };
+
+        // Finally, a single length byte followed by that many accelerator
+        // bytes.
+        let accel_len = state[0] as usize;
+        let accel = &state[1..accel_len + 1];
+        State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
+    }
+
+    /// Like `state`, but will return an error if the state encoding is
+    /// invalid. This is useful for verifying states after deserialization,
+    /// which is required for a safe deserialization API.
+    ///
+    /// Note that this only verifies that this state is decodable and that
+    /// all of its data is consistent. It does not verify that its state ID
+    /// transitions point to valid states themselves, nor does it verify that
+    /// every pattern ID is valid.
+    fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
+        if id.as_usize() > self.sparse().len() {
+            return Err(DeserializeError::generic("invalid sparse state ID"));
+        }
+        let mut state = &self.sparse()[id.as_usize()..];
+        // Encoding format starts with a u16 that stores the total number of
+        // transitions in this state.
+        let (mut ntrans, _) =
+            bytes::try_read_u16_as_usize(state, "state transition count")?;
+        let is_match = ((1 << 15) & ntrans) != 0;
+        ntrans &= !(1 << 15);
+        state = &state[2..];
+        if ntrans > 257 || ntrans == 0 {
+            return Err(DeserializeError::generic("invalid transition count"));
+        }
+
+        // Each transition has two pieces: an inclusive range of bytes on which
+        // it is defined, and the state ID that those bytes transition to. The
+        // pairs come first, followed by a corresponding sequence of state IDs.
+        let input_ranges_len = ntrans.checked_mul(2).unwrap();
+        bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
+        let (input_ranges, state) = state.split_at(input_ranges_len);
+        // Every range should be of the form A-B, where A<=B.
+        for pair in input_ranges.chunks(2) {
+            let (start, end) = (pair[0], pair[1]);
+            if start > end {
+                return Err(DeserializeError::generic("invalid input range"));
+            }
+        }
+
+        // And now extract the corresponding sequence of state IDs. We leave
+        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
+        // not have any alignment requirements.
+        let next_len = ntrans
+            .checked_mul(self.id_len())
+            .expect("state size * #trans should always fit in a usize");
+        bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
+        let (next, state) = state.split_at(next_len);
+        // We can at least verify that every state ID is in bounds.
+        for idbytes in next.chunks(self.id_len()) {
+            let (id, _) =
+                bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
+            bytes::check_slice_len(
+                self.sparse(),
+                id.as_usize(),
+                "invalid sparse state ID",
+            )?;
+        }
+
+        // If this is a match state, then read the pattern IDs for this state.
+        // Pattern IDs is a u32-length prefixed sequence of native endian
+        // encoded 32-bit integers.
+        let (pattern_ids, state) = if is_match {
+            let (npats, nr) =
+                bytes::try_read_u32_as_usize(state, "pattern ID count")?;
+            let state = &state[nr..];
+
+            let pattern_ids_len =
+                bytes::mul(npats, 4, "sparse pattern ID byte length")?;
+            bytes::check_slice_len(
+                state,
+                pattern_ids_len,
+                "sparse pattern IDs",
+            )?;
+            let (pattern_ids, state) = state.split_at(pattern_ids_len);
+            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
+                bytes::read_pattern_id(
+                    patbytes,
+                    "sparse pattern ID in try_state",
+                )?;
+            }
+            (pattern_ids, state)
+        } else {
+            (&[][..], state)
+        };
+
+        // Now read this state's accelerator info. The first byte is the length
+        // of the accelerator, which is typically 0 (for no acceleration) but
+        // is no bigger than 3. The length indicates the number of bytes that
+        // follow, where each byte corresponds to a transition out of this
+        // state.
+        if state.is_empty() {
+            return Err(DeserializeError::generic("no accelerator length"));
+        }
+        let (accel_len, state) = (state[0] as usize, &state[1..]);
+
+        if accel_len > 3 {
+            return Err(DeserializeError::generic(
+                "sparse invalid accelerator length",
+            ));
+        }
+        bytes::check_slice_len(
+            state,
+            accel_len,
+            "sparse corrupt accelerator length",
+        )?;
+        let (accel, _) = (&state[..accel_len], &state[accel_len..]);
+
+        Ok(State {
+            id,
+            is_match,
+            ntrans,
+            input_ranges,
+            next,
+            pattern_ids,
+            accel,
+        })
+    }
+
+    /// Return an iterator over all of the states in this DFA.
+    ///
+    /// The iterator returned yields each decoded `State` in turn, starting
+    /// with the dead state. A state's ID is available from the state itself.
+    fn states(&self) -> StateIter<'_, T> {
+        StateIter { trans: self, id: DEAD.as_usize() }
+    }
+
+    /// Returns the sparse transitions as raw bytes.
+    fn sparse(&self) -> &[u8] {
+        self.sparse.as_ref()
+    }
+
+    /// Returns the number of bytes used to encode a single state ID, which
+    /// is always `StateID::SIZE`.
+    fn id_len(&self) -> usize {
+        StateID::SIZE
+    }
+
+    /// Return the memory usage, in bytes, of these transitions.
+    ///
+    /// This does not include the size of a `Transitions` value itself.
+    fn memory_usage(&self) -> usize {
+        self.sparse().len()
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u8]>> Transitions<T> {
+    /// Return a convenient mutable representation of the given state.
+    /// This panics if the state is invalid.
+    ///
+    /// The decoding steps mirror those of `state`, except that every piece
+    /// is handed out as a mutable sub-slice of the sparse bytes.
+    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+        // A state ID is used directly as the byte offset of its encoding.
+        let mut state = &mut self.sparse_mut()[id.as_usize()..];
+        // Low 15 bits: transition count. High bit: match state flag.
+        let mut ntrans = bytes::read_u16(&state) as usize;
+        let is_match = (1 << 15) & ntrans != 0;
+        ntrans &= !(1 << 15);
+        state = &mut state[2..];
+
+        // Two range bytes per transition, then one state ID per transition.
+        let (input_ranges, state) = state.split_at_mut(ntrans * 2);
+        let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
+        // Match states carry a u32 count followed by that many 4-byte
+        // pattern IDs.
+        let (pattern_ids, state) = if is_match {
+            let npats = bytes::read_u32(&state) as usize;
+            state[4..].split_at_mut(npats * 4)
+        } else {
+            (&mut [][..], state)
+        };
+
+        // A single length byte followed by that many accelerator bytes.
+        let accel_len = state[0] as usize;
+        let accel = &mut state[1..accel_len + 1];
+        StateMut {
+            id,
+            is_match,
+            ntrans,
+            input_ranges,
+            next,
+            pattern_ids,
+            accel,
+        }
+    }
+
+    /// Returns the sparse transitions as raw mutable bytes.
+    fn sparse_mut(&mut self) -> &mut [u8] {
+        self.sparse.as_mut()
+    }
+}
+
+/// The set of all possible starting states in a DFA.
+///
+/// See the eponymous type in the `dense` module for more details. This type
+/// is very similar to `dense::StartTable`, except that its underlying
+/// representation is `&[u8]` instead of a slice of state IDs. (The latter
+/// would require sparse DFAs to be aligned, which is explicitly something we
+/// do not require because we don't really need it.)
+#[derive(Clone)]
+struct StartTable<T> {
+    /// The initial start state IDs as a contiguous table of native endian
+    /// encoded state IDs, each occupying `StateID::SIZE` bytes.
+    ///
+    /// In practice, `T` is either `Vec<u8>` or `&[u8]` and has no alignment
+    /// requirements.
+    ///
+    /// The first `stride` (always `Start::count()`) entries always correspond
+    /// to the start states for the entire DFA. After that, there are
+    /// `stride * patterns` state IDs, where `patterns` may be zero in the
+    /// case of a DFA with no patterns or in the case where the DFA was built
+    /// without enabling starting states for each pattern.
+    table: T,
+    /// The number of starting state IDs per pattern.
+    stride: usize,
+    /// The total number of patterns for which starting states are encoded.
+    /// This may be zero for non-empty DFAs when the DFA was built without
+    /// start states for each pattern.
+    patterns: usize,
+}
+
+#[cfg(feature = "alloc")]
+impl StartTable<Vec<u8>> {
+    fn new(patterns: usize) -> StartTable<Vec<u8>> {
+        let stride = Start::count();
+        // This is OK since the only way we're here is if a dense DFA could be
+        // constructed successfully, which uses the same space.
+        let len = stride
+            .checked_mul(patterns)
+            .unwrap()
+            .checked_add(stride)
+            .unwrap()
+            .checked_mul(StateID::SIZE)
+            .unwrap();
+        StartTable { table: vec![0; len], stride, patterns }
+    }
+
+    fn from_dense_dfa<T: AsRef<[u32]>>(
+        dfa: &dense::DFA<T>,
+        remap: &[StateID],
+    ) -> Result<StartTable<Vec<u8>>, Error> {
+        // Unless the DFA has start states compiled for each pattern, then
+        // as far as the starting state table is concerned, there are zero
+        // patterns to account for. It will instead only store starting states
+        // for the entire DFA.
+        let start_pattern_count = if dfa.has_starts_for_each_pattern() {
+            dfa.pattern_count()
+        } else {
+            0
+        };
+        let mut sl = StartTable::new(start_pattern_count);
+        for (old_start_id, sty, pid) in dfa.starts() {
+            let new_start_id = remap[dfa.to_index(old_start_id)];
+            sl.set_start(sty, pid, new_start_id);
+        }
+        Ok(sl)
+    }
+}
+
+impl<'a> StartTable<&'a [u8]> {
+    unsafe fn from_bytes_unchecked(
+        mut slice: &'a [u8],
+    ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
+        let slice_start = slice.as_ptr() as usize;
+
+        let (stride, nr) =
+            bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
+        slice = &slice[nr..];
+
+        let (patterns, nr) = bytes::try_read_u32_as_usize(
+            slice,
+            "sparse start table patterns",
+        )?;
+        slice = &slice[nr..];
+
+        if stride != Start::count() {
+            return Err(DeserializeError::generic(
+                "invalid sparse starting table stride",
+            ));
+        }
+        if patterns > PatternID::LIMIT {
+            return Err(DeserializeError::generic(
+                "sparse invalid number of patterns",
+            ));
+        }
+        let pattern_table_size =
+            bytes::mul(stride, patterns, "sparse invalid pattern count")?;
+        // Our start states always start with a single stride of start states
+        // for the entire automaton which permit it to match any pattern. What
+        // follows it are an optional set of start states for each pattern.
+        let start_state_count = bytes::add(
+            stride,
+            pattern_table_size,
+            "sparse invalid 'any' pattern starts size",
+        )?;
+        let table_bytes_len = bytes::mul(
+            start_state_count,
+            StateID::SIZE,
+            "sparse pattern table bytes length",
+        )?;
+        bytes::check_slice_len(
+            slice,
+            table_bytes_len,
+            "sparse start ID table",
+        )?;
+        let table_bytes = &slice[..table_bytes_len];
+        slice = &slice[table_bytes_len..];
+
+        let sl = StartTable { table: table_bytes, stride, patterns };
+        Ok((sl, slice.as_ptr() as usize - slice_start))
+    }
+}
+
+impl<T: AsRef<[u8]>> StartTable<T> {
+    fn write_to<E: Endian>(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small(
+                "sparse starting table ids",
+            ));
+        }
+        dst = &mut dst[..nwrite];
+
+        // write stride
+        E::write_u32(u32::try_from(self.stride).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+        // write pattern count
+        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
+        dst = &mut dst[size_of::<u32>()..];
+        // write start IDs
+        dst.copy_from_slice(self.table());
+        Ok(nwrite)
+    }
+
+    /// Returns the number of bytes the serialized form of this transition
+    /// table will use.
+    fn write_to_len(&self) -> usize {
+        size_of::<u32>() // stride
+        + size_of::<u32>() // # patterns
+        + self.table().len()
+    }
+
+    /// Validates that every starting state ID in this table is valid.
+    ///
+    /// That is, every starting state ID can be used to correctly decode a
+    /// state in the DFA's sparse transitions.
+    fn validate(
+        &self,
+        trans: &Transitions<T>,
+    ) -> Result<(), DeserializeError> {
+        for (id, _, _) in self.iter() {
+            let _ = trans.try_state(id)?;
+        }
+        Ok(())
+    }
+
+    /// Converts this start list to a borrowed value.
+    fn as_ref(&self) -> StartTable<&'_ [u8]> {
+        StartTable {
+            table: self.table(),
+            stride: self.stride,
+            patterns: self.patterns,
+        }
+    }
+
+    /// Converts this start list to an owned value.
+    #[cfg(feature = "alloc")]
+    fn to_owned(&self) -> StartTable<Vec<u8>> {
+        StartTable {
+            table: self.table().to_vec(),
+            stride: self.stride,
+            patterns: self.patterns,
+        }
+    }
+
+    /// Return the start state for the given index and pattern ID. If the
+    /// pattern ID is None, then the corresponding start state for the entire
+    /// DFA is returned. If the pattern ID is not None, then the corresponding
+    /// starting state for the given pattern is returned. If this start table
+    /// does not have individual starting states for each pattern, then this
+    /// panics.
+    fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
+        let start_index = index.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                let pid = pid.as_usize();
+                assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+                self.stride
+                    .checked_mul(pid)
+                    .unwrap()
+                    .checked_add(self.stride)
+                    .unwrap()
+                    .checked_add(start_index)
+                    .unwrap()
+            }
+        };
+        let start = index * StateID::SIZE;
+        // This OK since we're allowed to assume that the start table contains
+        // valid StateIDs.
+        bytes::read_state_id_unchecked(&self.table()[start..]).0
+    }
+
+    /// Return an iterator over all start IDs in this table.
+    fn iter(&self) -> StartStateIter<'_, T> {
+        StartStateIter { st: self, i: 0 }
+    }
+
+    /// Returns the total number of start state IDs in this table.
+    fn len(&self) -> usize {
+        self.table().len() / StateID::SIZE
+    }
+
+    /// Returns the table as a raw slice of bytes.
+    fn table(&self) -> &[u8] {
+        self.table.as_ref()
+    }
+
+    /// Return the memory usage, in bytes, of this start list.
+    ///
+    /// This does not include the size of a `StartTable` value itself.
+    fn memory_usage(&self) -> usize {
+        self.table().len()
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<T: AsMut<[u8]>> StartTable<T> {
+    /// Set the start state for the given index and pattern.
+    ///
+    /// If the pattern ID or state ID are not valid, then this will panic.
+    fn set_start(
+        &mut self,
+        index: Start,
+        pattern_id: Option<PatternID>,
+        id: StateID,
+    ) {
+        let start_index = index.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                let pid = pid.as_usize();
+                assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
+                self.stride
+                    .checked_mul(pid)
+                    .unwrap()
+                    .checked_add(self.stride)
+                    .unwrap()
+                    .checked_add(start_index)
+                    .unwrap()
+            }
+        };
+        let start = index * StateID::SIZE;
+        let end = start + StateID::SIZE;
+        bytes::write_state_id::<bytes::NE>(
+            id,
+            &mut self.table.as_mut()[start..end],
+        );
+    }
+}
+
+/// An iterator over all start state IDs in a sparse DFA.
+///
+/// Each item is a start state ID along with its start kind and the pattern
+/// it belongs to, if any.
+struct StartStateIter<'a, T> {
+    /// The start table being iterated over.
+    st: &'a StartTable<T>,
+    /// The index of the next table entry to yield.
+    i: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
+    type Item = (StateID, Start, Option<PatternID>);
+
+    /// Yields the next `(state ID, start kind, pattern ID)` triple, or `None`
+    /// once every slot of the start table has been visited.
+    fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
+        let slot = self.i;
+        if slot >= self.st.len() {
+            return None;
+        }
+        self.i += 1;
+
+        let stride = self.st.stride;
+        // Unwrapping is fine here: the stride of any DFA must always match
+        // the number of start state types.
+        let kind = Start::from_usize(slot % stride).unwrap();
+        // The first `stride` slots are not tied to any pattern. Each group
+        // of `stride` slots after that belongs to the next pattern ID.
+        let pattern = match slot.checked_sub(stride) {
+            None => None,
+            Some(offset) => {
+                // These unwraps are OK since the table and its stride are
+                // assumed to be correct.
+                let pid = offset.checked_div(stride).unwrap();
+                Some(PatternID::new(pid).unwrap())
+            }
+        };
+        let begin = slot * StateID::SIZE;
+        let raw = self.st.table()[begin..begin + StateID::SIZE]
+            .try_into()
+            .unwrap();
+        // Any ID stored in this start table is assumed to be correct and
+        // valid for this DFA, so it is decoded without validation.
+        let sid = StateID::from_ne_bytes_unchecked(raw);
+        Some((sid, kind, pattern))
+    }
+}
+
+impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
+    /// Prints only the current position; the start table itself is omitted.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("StartStateIter");
+        out.field("i", &self.i);
+        out.finish()
+    }
+}
+
+/// An iterator over all states in a sparse DFA.
+///
+/// This iterator yields each state as a `State` value. The identifier of a
+/// yielded state is available via `State::id`.
+struct StateIter<'a, T> {
+    /// The transitions from which each state is decoded.
+    trans: &'a Transitions<T>,
+    /// The identifier of the next state to yield. It is advanced by the
+    /// encoded length of every state that is yielded.
+    id: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
+    type Item = State<'a>;
+
+    /// Decodes the state at the current position and then moves the position
+    /// past that state's encoded bytes.
+    fn next(&mut self) -> Option<State<'a>> {
+        let total = self.trans.sparse().len();
+        if self.id < total {
+            let decoded = self.trans.state(StateID::new_unchecked(self.id));
+            self.id += decoded.bytes_len();
+            Some(decoded)
+        } else {
+            None
+        }
+    }
+}
+
+impl<'a, T> fmt::Debug for StateIter<'a, T> {
+    /// Prints only the current position; the transitions are omitted.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("StateIter");
+        out.field("id", &self.id);
+        out.finish()
+    }
+}
+
+/// A representation of a sparse DFA state that can be cheaply materialized
+/// from a state identifier.
+#[derive(Clone)]
+struct State<'a> {
+    /// The identifier of this state.
+    id: StateID,
+    /// Whether this is a match state or not.
+    is_match: bool,
+    /// The number of transitions in this state. The last transition is the
+    /// special EOI transition.
+    ntrans: usize,
+    /// Pairs of input ranges, where there is one pair for each transition.
+    /// Each pair specifies an inclusive start and end byte range for the
+    /// corresponding transition.
+    input_ranges: &'a [u8],
+    /// Transitions to the next state. This slice contains native endian
+    /// encoded state identifiers, each occupying `StateID::SIZE` bytes.
+    /// Thus, there are `ntrans * StateID::SIZE` bytes in this slice.
+    next: &'a [u8],
+    /// If this is a match state, then this contains the pattern IDs that match
+    /// when the DFA is in this state.
+    ///
+    /// This is a contiguous sequence of 32-bit native endian encoded integers.
+    pattern_ids: &'a [u8],
+    /// An accelerator for this state, if present. If this state has no
+    /// accelerator, then this is an empty slice. When non-empty, this slice
+    /// has length at most 3 and corresponds to the exhaustive set of bytes
+    /// that must be seen in order to transition out of this state.
+    accel: &'a [u8],
+}
+
+impl<'a> State<'a> {
+    /// Follows the transition for the given input byte. When no transition
+    /// covers the byte, the dead state is returned.
+    ///
+    /// This is always inlined to help dramatically boost sparse searching,
+    /// which decodes each state it enters to follow the next transition.
+    #[inline(always)]
+    fn next(&self, input: u8) -> StateID {
+        // A straight linear scan was observed to be much better than binary
+        // search on ASCII haystacks, likely because a binary search visits
+        // the ASCII case last but a linear search sees it first. A binary
+        // search does do a little better on non-ASCII haystacks, but not by
+        // much. There might be a better trade off lurking here.
+        //
+        // The final transition is the special EOI transition, so it is
+        // excluded from the scan.
+        let byte_transitions = self.ntrans - 1;
+        for i in 0..byte_transitions {
+            let (lo, hi) = self.range(i);
+            if lo <= input && input <= hi {
+                return self.next_at(i);
+            }
+            // We could bail early with an extra branch: if the input is less
+            // than the start of this range, then we know we'll never find a
+            // matching transition. Interestingly, this extra branch seems to
+            // not help performance, or will even hurt it. It's likely very
+            // dependent on the DFA itself and what is being searched.
+        }
+        DEAD
+    }
+
+    /// Returns the next state ID for the special EOI transition, which is
+    /// always the last transition of a state.
+    fn next_eoi(&self) -> StateID {
+        let last = self.ntrans - 1;
+        self.next_at(last)
+    }
+
+    /// Returns the identifier for this state.
+    fn id(&self) -> StateID {
+        self.id
+    }
+
+    /// Returns the inclusive `(start, end)` input byte range for the ith
+    /// transition in this state.
+    fn range(&self, i: usize) -> (u8, u8) {
+        let at = i * 2;
+        let lo = self.input_ranges[at];
+        let hi = self.input_ranges[at + 1];
+        (lo, hi)
+    }
+
+    /// Returns the next state for the ith transition in this state.
+    fn next_at(&self, i: usize) -> StateID {
+        let begin = i * StateID::SIZE;
+        let encoded = &self.next[begin..begin + StateID::SIZE];
+        StateID::from_ne_bytes_unchecked(encoded.try_into().unwrap())
+    }
+
+    /// Returns the pattern ID for the given match index. If the match index
+    /// is invalid, then this panics.
+    fn pattern_id(&self, match_index: usize) -> PatternID {
+        let offset = match_index * PatternID::SIZE;
+        let tail = &self.pattern_ids[offset..];
+        bytes::read_pattern_id_unchecked(tail).0
+    }
+
+    /// Returns the total number of pattern IDs for this state. This is always
+    /// zero when `is_match` is false.
+    fn pattern_count(&self) -> usize {
+        let nbytes = self.pattern_ids.len();
+        assert_eq!(0, nbytes % 4);
+        nbytes / 4
+    }
+
+    /// Return the total number of bytes that this state consumes in its
+    /// encoded form.
+    fn bytes_len(&self) -> usize {
+        // Only match states carry the extra pattern ID section.
+        let match_section = if self.is_match {
+            size_of::<u32>() + self.pattern_ids.len()
+        } else {
+            0
+        };
+        2 + (self.ntrans * 2)
+            + (self.ntrans * StateID::SIZE)
+            + (1 + self.accel.len())
+            + match_section
+    }
+
+    /// Return an accelerator for this state. The slice is empty when the
+    /// state has no accelerator.
+    fn accelerator(&self) -> &'a [u8] {
+        self.accel
+    }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+    /// Writes every transition that does not lead to the dead state as a
+    /// comma separated list, with the EOI transition (if any) printed last.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // The last transition is the EOI transition. It has no byte range,
+        // so it is handled separately after the loop.
+        let eoi_index = self.ntrans - 1;
+        let mut need_sep = false;
+        for i in 0..eoi_index {
+            let target = self.next_at(i);
+            if target == DEAD {
+                continue;
+            }
+
+            if need_sep {
+                f.write_str(", ")?;
+            }
+            let (lo, hi) = self.range(i);
+            if lo != hi {
+                write!(
+                    f,
+                    "{:?}-{:?} => {:?}",
+                    DebugByte(lo),
+                    DebugByte(hi),
+                    target,
+                )?;
+            } else {
+                write!(f, "{:?} => {:?}", DebugByte(lo), target)?;
+            }
+            need_sep = true;
+        }
+        let eoi_target = self.next_at(eoi_index);
+        if eoi_target != DEAD {
+            if need_sep {
+                f.write_str(", ")?;
+            }
+            write!(f, "EOI => {:?}", eoi_target)?;
+        }
+        Ok(())
+    }
+}
+
+/// A representation of a mutable sparse DFA state that can be cheaply
+/// materialized from a state identifier.
+#[cfg(feature = "alloc")]
+struct StateMut<'a> {
+    /// The identifier of this state.
+    id: StateID,
+    /// Whether this is a match state or not.
+    is_match: bool,
+    /// The number of transitions in this state.
+    ntrans: usize,
+    /// Pairs of input ranges, where there is one pair for each transition.
+    /// Each pair specifies an inclusive start and end byte range for the
+    /// corresponding transition.
+    input_ranges: &'a mut [u8],
+    /// Transitions to the next state. This slice contains native endian
+    /// encoded state identifiers, each occupying `StateID::SIZE` bytes.
+    /// Thus, there are `ntrans * StateID::SIZE` bytes in this slice.
+    next: &'a mut [u8],
+    /// If this is a match state, then this contains the pattern IDs that match
+    /// when the DFA is in this state.
+    ///
+    /// This is a contiguous sequence of 32-bit native endian encoded integers.
+    pattern_ids: &'a [u8],
+    /// An accelerator for this state, if present. If this state has no
+    /// accelerator, then this is an empty slice. When non-empty, this slice
+    /// has length at most 3 and corresponds to the exhaustive set of bytes
+    /// that must be seen in order to transition out of this state.
+    accel: &'a mut [u8],
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> StateMut<'a> {
+    /// Overwrites the target of the ith transition with the given state ID,
+    /// using native endian encoding.
+    fn set_next_at(&mut self, i: usize, next: StateID) {
+        let begin = i * StateID::SIZE;
+        let slot = &mut self.next[begin..begin + StateID::SIZE];
+        bytes::write_state_id::<bytes::NE>(next, slot);
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> fmt::Debug for StateMut<'a> {
+    /// Formats this state exactly like its read-only `State` counterpart.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Build a read-only view over the same data and defer to its
+        // `Debug` implementation.
+        fmt::Debug::fmt(
+            &State {
+                id: self.id,
+                is_match: self.is_match,
+                ntrans: self.ntrans,
+                input_ranges: self.input_ranges,
+                next: self.next,
+                pattern_ids: self.pattern_ids,
+                accel: self.accel,
+            },
+            f,
+        )
+    }
+}
+
+/// Binary search over the transitions of a sparse DFA state.
+///
+/// `ranges` is a flat sequence of byte pairs, where each pair delineates an
+/// inclusive range of input bytes for one transition. If `needle` falls in
+/// one of those ranges, the position of that pair (i.e., the index of the
+/// transition) is returned. Otherwise, `None` is returned.
+///
+/// Note that this routine is not currently used since it was observed to
+/// either decrease performance when searching ASCII, or did not provide enough
+/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
+/// for posterity in case we can find a way to use it.
+///
+/// In theory, we could use the standard library's search routine if we could
+/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
+/// guaranteed to be safe and is thus UB (since I don't think the in-memory
+/// representation of `(u8, u8)` has been nailed down). One could define a
+/// repr(C) type, but the casting doesn't seem justified.
+#[allow(dead_code)]
+#[inline(always)]
+fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
+    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
+    debug_assert!(ranges.len() <= 512, "ranges should be short");
+
+    // `lo..hi` is the half-open window of pair indices still in play.
+    let mut lo = 0;
+    let mut hi = ranges.len() / 2;
+    loop {
+        if lo >= hi {
+            return None;
+        }
+        let mid = (lo + hi) / 2;
+        let first = ranges[mid * 2];
+        let last = ranges[mid * 2 + 1];
+        if needle < first {
+            hi = mid;
+        } else if needle > last {
+            lo = mid + 1;
+        } else {
+            return Some(mid);
+        }
+    }
+}
diff --git a/src/dfa/special.rs b/src/dfa/special.rs

new file mode 100644 (file)

index 0000000..3db95a7
--- /dev/null
+++ b/src/dfa/special.rs
@@ -0,0 +1,477 @@
+use crate::{
+    dfa::DEAD,
+    util::{
+        bytes::{self, DeserializeError, Endian, SerializeError},
+        id::StateID,
+    },
+};
+
+// Returns early from the enclosing function with a generic deserialization
+// error carrying the given message. Since this expands to a `return`, it can
+// only be used inside functions whose error type is `DeserializeError`.
+macro_rules! err {
+    ($msg:expr) => {
+        return Err(DeserializeError::generic($msg));
+    };
+}
+
+// Special represents the identifiers in a DFA that correspond to "special"
+// states. If a state is one or more of the following, then it is considered
+// special:
+//
+// * dead - A non-matching state where all outgoing transitions lead back to
+//   itself. There is only one of these, regardless of whether minimization
+//   has run. The dead state always has an ID of 0. i.e., It is always the
+//   first state in a DFA.
+// * quit - A state that is entered whenever a byte is seen that should cause
+//   a DFA to give up and stop searching. This results in a MatchError::Quit
+//   error being returned at search time. The default configuration for a DFA
+//   has no quit bytes, which means this state is unreachable by default,
+//   although it is always present for reasons of implementation simplicity.
+//   This state is only reachable when the caller configures the DFA to quit
+//   on certain bytes. There is always exactly one of these states and it
+//   is always the second state. (Its actual ID depends on the size of the
+//   alphabet in dense DFAs, since state IDs are premultiplied in order to
+//   allow them to be used directly as indices into the transition table.)
+// * match - An accepting state, i.e., indicative of a match. There may be
+//   zero or more of these states.
+// * accelerated - A state where all of its outgoing transitions, except a
+//   few, loop back to itself. These states are candidates for acceleration
+//   via memchr during search. There may be zero or more of these states.
+// * start - A non-matching state that indicates where the automaton should
+//   start during a search. There is always at least one starting state and
+//   all are guaranteed to be non-match states. (A start state cannot be a
+//   match state because the DFAs in this crate delay all matches by one byte.
+//   So every search that finds a match must move through one transition to
+//   some other match state, even when searching an empty string.)
+//
+// These are not mutually exclusive categories. Namely, the following
+// overlappings can occur:
+//
+// * {dead, start} - If a DFA can never lead to a match and it is minimized,
+//   then it will typically compile to something where all starting IDs point
+//   to the DFA's dead state.
+// * {match, accelerated} - It is possible for a match state to have the
+//   majority of its transitions loop back to itself, which means it's
+//   possible for a match state to be accelerated.
+// * {start, accelerated} - Similarly, it is possible for a start state to be
+//   accelerated. Note that it is possible for an accelerated state to be
+//   neither a match nor a start state. Also note that just because both match
+//   and start states overlap with accelerated states does not mean that
+//   match and start states overlap with each other. In fact, they are
+//   guaranteed not to overlap.
+//
+// As a special mention, every DFA always has a dead and a quit state, even
+// though from the perspective of the DFA, they are equivalent. (Indeed,
+// minimization special cases them to ensure they don't get merged.) The
+// purpose of keeping them distinct is to use the quit state as a sentinel to
+// distinguish between whether a search finished successfully without finding
+// anything or whether it gave up before finishing.
+//
+// So the main problem we want to solve here is the *fast* detection of whether
+// a state is special or not. And we also want to do this while storing as
+// little extra data as possible. AND we want to be able to quickly determine
+// which categories a state falls into above if it is special.
+//
+// We achieve this by essentially shuffling all special states to the beginning
+// of a DFA. That is, all special states appear before every other non-special
+// state. By representing special states this way, we can determine whether a
+// state is special or not by a single comparison, where special.max is the
+// identifier of the last special state in the DFA:
+//
+//     if current_state <= special.max:
+//         ... do something with special state
+//
+// The only thing left to do is to determine what kind of special state
+// it is. Because what we do next depends on that. Since special states
+// are typically rare, we can afford to do a bit more extra work, but we'd
+// still like this to be as fast as possible. The trick we employ here is to
+// continue shuffling states even within the special state range. Such that
+// one contiguous region corresponds to match states, another for start states
+// and then an overlapping range for accelerated states. At a high level, our
+// special state detection might look like this (for leftmost searching, where
+// we continue searching even after seeing a match):
+//
+//     byte = input[offset]
+//     current_state = next_state(current_state, byte)
+//     offset += 1
+//     if current_state <= special.max:
+//         if current_state == 0:
+//             # We can never leave a dead state, so this always marks the
+//             # end of our search.
+//             return last_match
+//         if current_state == special.quit_id:
+//             # A quit state means we give up. If the DFA has no quit state,
+//             # then special.quit_id == 0 == dead, which is handled by the
+//             # conditional above.
+//             return Err(MatchError::Quit { byte, offset: offset - 1 })
+//         if special.min_match <= current_state <= special.max_match:
+//             last_match = Some(offset)
+//             if special.min_accel <= current_state <= special.max_accel:
+//                 offset = accelerate(input, offset)
+//                 last_match = Some(offset)
+//         elif special.min_start <= current_state <= special.max_start:
+//             offset = prefilter.find(input, offset)
+//             if special.min_accel <= current_state <= special.max_accel:
+//                 offset = accelerate(input, offset)
+//         elif special.min_accel <= current_state <= special.max_accel:
+//             offset = accelerate(input, offset)
+//
+// There are some small details left out of the logic above. For example,
+// in order to accelerate a state, we need to know which bytes to search for.
+// This in turn implies some extra data we need to store in the DFA. To keep
+// things compact, we would ideally only store
+//
+//     N = special.max_accel - special.min_accel + 1
+//
+// items. But state IDs are premultiplied, which means they are not contiguous.
+// So in order to take a state ID and index an array of accelerated structures,
+// we need to do:
+//
+//     i = (state_id - special.min_accel) / stride
+//
+// (N.B. 'stride' is always a power of 2, so the above can be implemented via
+// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
+// 2^x=stride.)
+//
+// Moreover, some of these specialty categories may be empty. For example,
+// DFAs are not required to have any match states or any accelerated states.
+// In that case, the lower and upper bounds are both set to 0 (the dead state
+// ID) and the first `current_state == 0` check subsumes cases where the
+// ranges are empty.
+//
+// Loop unrolling, if applicable, has also been left out of the logic above.
+//
+// Graphically, the ranges look like this, where asterisks indicate ranges
+// that can be empty. Each 'x' is a state.
+//
+//      quit
+//  dead|
+//     ||
+//     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+//     | |             |    | start |                       |
+//     | |-------------|    |-------|                       |
+//     |   match*   |          |    |                       |
+//     |            |          |    |                       |
+//     |            |----------|    |                       |
+//     |                accel*      |                       |
+//     |                            |                       |
+//     |                            |                       |
+//     |----------------------------|------------------------
+//              special                   non-special*
+/// The identifiers and identifier ranges of the special states in a DFA.
+///
+/// For the match, accel and start ranges, a range with no states in it has
+/// both of its bounds set to the dead state ID.
+#[derive(Clone, Copy, Debug)]
+pub struct Special {
+    /// The identifier of the last special state in a DFA. A state is special
+    /// if and only if its identifier is less than or equal to `max`.
+    pub max: StateID,
+    /// The identifier of the quit state in a DFA. (There is no analogous field
+    /// for the dead state since the dead state's ID is always zero, regardless
+    /// of state ID size.)
+    pub quit_id: StateID,
+    /// The identifier of the first match state, or the dead state ID when
+    /// there are no match states.
+    pub min_match: StateID,
+    /// The identifier of the last match state.
+    pub max_match: StateID,
+    /// The identifier of the first accelerated state, or the dead state ID
+    /// when there are no accelerated states.
+    pub min_accel: StateID,
+    /// The identifier of the last accelerated state.
+    pub max_accel: StateID,
+    /// The identifier of the first start state, or the dead state ID when
+    /// there are no start states.
+    pub min_start: StateID,
+    /// The identifier of the last start state.
+    pub max_start: StateID,
+}
+
+impl Special {
+    /// Creates a new set of special ranges for a DFA. All ranges are initially
+    /// set to only contain the dead state. This is interpreted as an empty
+    /// range.
+    #[cfg(feature = "alloc")]
+    pub fn new() -> Special {
+        Special {
+            max: DEAD,
+            quit_id: DEAD,
+            min_match: DEAD,
+            max_match: DEAD,
+            min_accel: DEAD,
+            max_accel: DEAD,
+            min_start: DEAD,
+            max_start: DEAD,
+        }
+    }
+
+    /// Remaps all of the special state identifiers using the function given.
+    ///
+    /// The function is applied to every field independently and the result
+    /// is returned as a new value; `self` is left unchanged.
+    #[cfg(feature = "alloc")]
+    pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
+        Special {
+            max: map(self.max),
+            quit_id: map(self.quit_id),
+            min_match: map(self.min_match),
+            max_match: map(self.max_match),
+            min_accel: map(self.min_accel),
+            max_accel: map(self.max_accel),
+            min_start: map(self.min_start),
+            max_start: map(self.max_start),
+        }
+    }
+
+    /// Deserialize the given bytes into special state ranges. If the slice
+    /// given is not big enough, then this returns an error. Similarly, if
+    /// any of the expected invariants around special state ranges aren't
+    /// upheld, an error is returned. Note that this does not guarantee that
+    /// the information returned is correct.
+    ///
+    /// Upon success, this returns the number of bytes read in addition to the
+    /// special state IDs themselves.
+    pub fn from_bytes(
+        mut slice: &[u8],
+    ) -> Result<(Special, usize), DeserializeError> {
+        bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
+
+        let mut nread = 0;
+        // Reads one state ID from the front of `slice`, then advances the
+        // slice and the running byte count past it.
+        let mut read_id = |what| -> Result<StateID, DeserializeError> {
+            let (id, nr) = bytes::try_read_state_id(slice, what)?;
+            nread += nr;
+            slice = &slice[StateID::SIZE..];
+            Ok(id)
+        };
+
+        // The order of these reads must match the order used by `write_to`.
+        let max = read_id("special max id")?;
+        let quit_id = read_id("special quit id")?;
+        let min_match = read_id("special min match id")?;
+        let max_match = read_id("special max match id")?;
+        let min_accel = read_id("special min accel id")?;
+        let max_accel = read_id("special max accel id")?;
+        let min_start = read_id("special min start id")?;
+        let max_start = read_id("special max start id")?;
+
+        let special = Special {
+            max,
+            quit_id,
+            min_match,
+            max_match,
+            min_accel,
+            max_accel,
+            min_start,
+            max_start,
+        };
+        special.validate()?;
+        assert_eq!(nread, special.write_to_len());
+        Ok((special, nread))
+    }
+
+    /// Validate that the information describing special states satisfies
+    /// all known invariants.
+    ///
+    /// An error describing the first violated invariant is returned.
+    pub fn validate(&self) -> Result<(), DeserializeError> {
+        // Check that both ends of the range are DEAD or neither are.
+        if self.min_match == DEAD && self.max_match != DEAD {
+            err!("min_match is DEAD, but max_match is not");
+        }
+        if self.min_match != DEAD && self.max_match == DEAD {
+            err!("max_match is DEAD, but min_match is not");
+        }
+        if self.min_accel == DEAD && self.max_accel != DEAD {
+            err!("min_accel is DEAD, but max_accel is not");
+        }
+        if self.min_accel != DEAD && self.max_accel == DEAD {
+            err!("max_accel is DEAD, but min_accel is not");
+        }
+        if self.min_start == DEAD && self.max_start != DEAD {
+            err!("min_start is DEAD, but max_start is not");
+        }
+        if self.min_start != DEAD && self.max_start == DEAD {
+            err!("max_start is DEAD, but min_start is not");
+        }
+
+        // Check that ranges are well formed.
+        if self.min_match > self.max_match {
+            err!("min_match should not be greater than max_match");
+        }
+        if self.min_accel > self.max_accel {
+            err!("min_accel should not be greater than max_accel");
+        }
+        if self.min_start > self.max_start {
+            err!("min_start should not be greater than max_start");
+        }
+
+        // Check that ranges are ordered with respect to one another. Note
+        // that the quit state must come strictly before every non-empty
+        // range, so equality is rejected as well.
+        if self.matches() && self.quit_id >= self.min_match {
+            err!("quit_id should not be greater than or equal to min_match");
+        }
+        if self.accels() && self.quit_id >= self.min_accel {
+            err!("quit_id should not be greater than or equal to min_accel");
+        }
+        if self.starts() && self.quit_id >= self.min_start {
+            err!("quit_id should not be greater than or equal to min_start");
+        }
+        if self.matches() && self.accels() && self.min_accel < self.min_match {
+            err!("min_match should not be greater than min_accel");
+        }
+        if self.matches() && self.starts() && self.min_start < self.min_match {
+            err!("min_match should not be greater than min_start");
+        }
+        if self.accels() && self.starts() && self.min_start < self.min_accel {
+            err!("min_accel should not be greater than min_start");
+        }
+
+        // Check that max is at least as big as everything else.
+        if self.max < self.quit_id {
+            err!("quit_id should not be greater than max");
+        }
+        if self.max < self.max_match {
+            err!("max_match should not be greater than max");
+        }
+        if self.max < self.max_accel {
+            err!("max_accel should not be greater than max");
+        }
+        if self.max < self.max_start {
+            err!("max_start should not be greater than max");
+        }
+
+        Ok(())
+    }
+
+    /// Validate that the special state information is compatible with the
+    /// given state count.
+    ///
+    /// `stride2` is the number of bits by which `max` is shifted right
+    /// before it is compared against `count`.
+    pub fn validate_state_count(
+        &self,
+        count: usize,
+        stride2: usize,
+    ) -> Result<(), DeserializeError> {
+        // We assume that 'validate' has already passed, so we know that 'max'
+        // is truly the max. So all we need to check is that the max state
+        // ID is less than the state ID count. The max legal value here is
+        // count-1, which occurs when there are no non-special states.
+        if (self.max.as_usize() >> stride2) >= count {
+            err!("max should not be greater than or equal to state count");
+        }
+        Ok(())
+    }
+
+    /// Write the IDs and ranges for special states to the given byte buffer.
+    /// The buffer given must have enough room to store all data, otherwise
+    /// this will return an error. The number of bytes written is returned
+    /// on success. The number of bytes written is guaranteed to be a multiple
+    /// of 8.
+    pub fn write_to<E: Endian>(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        use crate::util::bytes::write_state_id as write;
+
+        if dst.len() < self.write_to_len() {
+            return Err(SerializeError::buffer_too_small("special state ids"));
+        }
+
+        // The order of these writes must match the order used by
+        // `from_bytes`.
+        let mut nwrite = 0;
+        nwrite += write::<E>(self.max, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
+        nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
+
+        assert_eq!(
+            self.write_to_len(),
+            nwrite,
+            "expected to write certain number of bytes",
+        );
+        assert_eq!(
+            nwrite % 8,
+            0,
+            "expected to write multiple of 8 bytes for special states",
+        );
+        Ok(nwrite)
+    }
+
+    /// Returns the total number of bytes written by `write_to`. This is the
+    /// size of eight state IDs, one for each field.
+    pub fn write_to_len(&self) -> usize {
+        8 * StateID::SIZE
+    }
+
+    /// Sets the maximum special state ID based on the current values. This
+    /// should be used once all possible state IDs are set.
+    #[cfg(feature = "alloc")]
+    pub fn set_max(&mut self) {
+        use core::cmp::max;
+        self.max = max(
+            self.quit_id,
+            max(self.max_match, max(self.max_accel, self.max_start)),
+        );
+    }
+
+    /// Returns true if and only if the given state ID is a special state.
+    #[inline]
+    pub fn is_special_state(&self, id: StateID) -> bool {
+        id <= self.max
+    }
+
+    /// Returns true if and only if the given state ID is a dead state.
+    #[inline]
+    pub fn is_dead_state(&self, id: StateID) -> bool {
+        id == DEAD
+    }
+
+    /// Returns true if and only if the given state ID is a quit state.
+    ///
+    /// The dead state is never reported as a quit state, even when `quit_id`
+    /// is itself set to the dead state ID.
+    #[inline]
+    pub fn is_quit_state(&self, id: StateID) -> bool {
+        !self.is_dead_state(id) && self.quit_id == id
+    }
+
+    /// Returns true if and only if the given state ID is a match state.
+    ///
+    /// The dead state is excluded explicitly, since an empty range has both
+    /// of its bounds set to the dead state ID.
+    #[inline]
+    pub fn is_match_state(&self, id: StateID) -> bool {
+        !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
+    }
+
+    /// Returns true if and only if the given state ID is an accel state.
+    ///
+    /// The dead state is excluded explicitly, since an empty range has both
+    /// of its bounds set to the dead state ID.
+    #[inline]
+    pub fn is_accel_state(&self, id: StateID) -> bool {
+        !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
+    }
+
+    /// Returns true if and only if the given state ID is a start state.
+    ///
+    /// The dead state is excluded explicitly, since an empty range has both
+    /// of its bounds set to the dead state ID.
+    #[inline]
+    pub fn is_start_state(&self, id: StateID) -> bool {
+        !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
+    }
+
+    /// Returns the total number of match states for a dense table based DFA.
+    ///
+    /// The given stride is the distance between the IDs of two adjacent
+    /// states. Zero is returned when there are no match states.
+    #[inline]
+    pub fn match_len(&self, stride: usize) -> usize {
+        if self.matches() {
+            (self.max_match.as_usize() - self.min_match.as_usize() + stride)
+                / stride
+        } else {
+            0
+        }
+    }
+
+    /// Returns true if and only if there is at least one match state.
+    #[inline]
+    pub fn matches(&self) -> bool {
+        self.min_match != DEAD
+    }
+
+    /// Returns the total number of accel states.
+    ///
+    /// The given stride is the distance between the IDs of two adjacent
+    /// states. Zero is returned when there are no accel states.
+    #[cfg(feature = "alloc")]
+    pub fn accel_len(&self, stride: usize) -> usize {
+        if self.accels() {
+            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
+                / stride
+        } else {
+            0
+        }
+    }
+
+    /// Returns true if and only if there is at least one accel state.
+    #[inline]
+    pub fn accels(&self) -> bool {
+        self.min_accel != DEAD
+    }
+
+    /// Returns true if and only if there is at least one start state.
+    #[inline]
+    pub fn starts(&self) -> bool {
+        self.min_start != DEAD
+    }
+}
diff --git a/src/dfa/transducer.rs b/src/dfa/transducer.rs

new file mode 100644 (file)

index 0000000..58b34e0
--- /dev/null
+++ b/src/dfa/transducer.rs
@@ -0,0 +1,207 @@
+use crate::{
+    dfa::{automaton::Automaton, dense, sparse},
+    util::id::StateID,
+};
+
+// Lets a dense DFA be used as an `fst` automaton, with DFA state IDs serving
+// as the automaton state.
+impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> {
+    type State = StateID;
+
+    // Asks the DFA for its forward start state, passing `None`, an empty
+    // byte slice and `0` for both trailing arguments.
+    #[inline]
+    fn start(&self) -> StateID {
+        self.start_state_forward(None, &[], 0, 0)
+    }
+
+    #[inline]
+    fn is_match(&self, state: &StateID) -> bool {
+        let sid = *state;
+        self.is_match_state(sid)
+    }
+
+    // Match states are sticky: once `state` is a match state it is returned
+    // unchanged, whatever `byte` is.
+    #[inline]
+    fn accept(&self, state: &StateID, byte: u8) -> StateID {
+        let sid = *state;
+        if fst::Automaton::is_match(self, state) {
+            sid
+        } else {
+            self.next_state(sid, byte)
+        }
+    }
+
+    // Always yields `Some`: a match state is kept as-is, otherwise
+    // `next_eoi_state` supplies the resulting state.
+    #[inline]
+    fn accept_eof(&self, state: &StateID) -> Option<StateID> {
+        let sid = *state;
+        Some(if fst::Automaton::is_match(self, state) {
+            sid
+        } else {
+            self.next_eoi_state(sid)
+        })
+    }
+
+    // A state is reported as able to match unless `is_dead_state` holds.
+    #[inline]
+    fn can_match(&self, state: &StateID) -> bool {
+        let sid = *state;
+        !self.is_dead_state(sid)
+    }
+}
+
+// Lets a sparse DFA be used as an `fst` automaton, with DFA state IDs serving
+// as the automaton state.
+impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> {
+    type State = StateID;
+
+    // Asks the DFA for its forward start state, passing `None`, an empty
+    // byte slice and `0` for both trailing arguments.
+    #[inline]
+    fn start(&self) -> StateID {
+        self.start_state_forward(None, &[], 0, 0)
+    }
+
+    #[inline]
+    fn is_match(&self, state: &StateID) -> bool {
+        let sid = *state;
+        self.is_match_state(sid)
+    }
+
+    // Match states are sticky: once `state` is a match state it is returned
+    // unchanged, whatever `byte` is.
+    #[inline]
+    fn accept(&self, state: &StateID, byte: u8) -> StateID {
+        let sid = *state;
+        if fst::Automaton::is_match(self, state) {
+            sid
+        } else {
+            self.next_state(sid, byte)
+        }
+    }
+
+    // Always yields `Some`: a match state is kept as-is, otherwise
+    // `next_eoi_state` supplies the resulting state.
+    #[inline]
+    fn accept_eof(&self, state: &StateID) -> Option<StateID> {
+        let sid = *state;
+        Some(if fst::Automaton::is_match(self, state) {
+            sid
+        } else {
+            self.next_eoi_state(sid)
+        })
+    }
+
+    // A state is reported as able to match unless `is_dead_state` holds.
+    #[inline]
+    fn can_match(&self, state: &StateID) -> bool {
+        let sid = *state;
+        !self.is_dead_state(sid)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use bstr::BString;
+    use fst::{Automaton, IntoStreamer, Set, Streamer};
+
+    use crate::dfa::{dense, sparse};
+
+    /// Runs `aut` over `set` and collects every key the resulting stream
+    /// yields, in the order the stream yields them.
+    fn search<A: Automaton, D: AsRef<[u8]>>(
+        set: &Set<D>,
+        aut: A,
+    ) -> Vec<BString> {
+        let mut hits = vec![];
+        let mut keys = set.search(aut).into_stream();
+        while let Some(key) = keys.next() {
+            hits.push(BString::from(key));
+        }
+        hits
+    }
+
+    // Without anchoring, `ba.*` may begin anywhere inside a key, so the keys
+    // starting with `x` are found too.
+    #[test]
+    fn dense_anywhere() {
+        let keys =
+            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut = dense::DFA::new("ba.*").unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bar", "baz", "xba", "xbax"]);
+    }
+
+    // With `anchored(true)`, only keys that begin with `ba` are found.
+    #[test]
+    fn dense_anchored() {
+        let keys =
+            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut = dense::Builder::new()
+            .configure(dense::Config::new().anchored(true))
+            .build("ba.*")
+            .unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bar", "baz"]);
+    }
+
+    // A leading `^` gives the same results as the anchored configuration.
+    #[test]
+    fn dense_assertions_start() {
+        let keys =
+            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut = dense::Builder::new().build("^ba.*").unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bar", "baz"]);
+    }
+
+    // A trailing `$` restricts the results to keys that end in `x`.
+    #[test]
+    fn dense_assertions_end() {
+        let keys =
+            Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut = dense::Builder::new().build(".*x$").unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bax", "xbax"]);
+    }
+
+    // ASCII word boundaries reject `foox` and `xfoo` but accept `foo` when
+    // it stands alone or is surrounded by spaces.
+    #[test]
+    fn dense_assertions_word() {
+        let keys =
+            Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap();
+        let aut = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["foo", "zzz foo zzz"]);
+    }
+
+    // The sparse tests below mirror the dense ones with a sparse DFA.
+    #[test]
+    fn sparse_anywhere() {
+        let keys =
+            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut = sparse::DFA::new("ba.*").unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bar", "baz", "xba", "xbax"]);
+    }
+
+    #[test]
+    fn sparse_anchored() {
+        let keys =
+            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut = dense::Builder::new()
+            .configure(dense::Config::new().anchored(true))
+            .build("ba.*")
+            .unwrap()
+            .to_sparse()
+            .unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bar", "baz"]);
+    }
+
+    #[test]
+    fn sparse_assertions_start() {
+        let keys =
+            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut =
+            dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bar", "baz"]);
+    }
+
+    #[test]
+    fn sparse_assertions_end() {
+        let keys =
+            Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"])
+                .unwrap();
+        let aut =
+            dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["bax", "xbax"]);
+    }
+
+    #[test]
+    fn sparse_assertions_word() {
+        let keys =
+            Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap();
+        let aut = dense::Builder::new()
+            .build(r"(?-u)\bfoo\b")
+            .unwrap()
+            .to_sparse()
+            .unwrap();
+        let found = search(&keys, &aut);
+        assert_eq!(found, vec!["foo", "zzz foo zzz"]);
+    }
+}
diff --git a/src/hybrid/dfa.rs b/src/hybrid/dfa.rs

new file mode 100644 (file)

index 0000000..1fbce5f
--- /dev/null
+++ b/src/hybrid/dfa.rs
@@ -0,0 +1,3817 @@
+/*!
+Types and routines specific to lazy DFAs.
+
+This module is the home of [`hybrid::dfa::DFA`](DFA).
+
+This module also contains a [`hybrid::dfa::Builder`](Builder) and a
+[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
+*/
+
+use core::{borrow::Borrow, iter, mem::size_of};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+    hybrid::{
+        error::{BuildError, CacheError},
+        id::{LazyStateID, LazyStateIDError, OverlappingState},
+        search,
+    },
+    nfa::thompson,
+    util::{
+        alphabet::{self, ByteClasses, ByteSet},
+        determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
+        id::{PatternID, StateID as NFAStateID},
+        matchtypes::{HalfMatch, MatchError, MatchKind},
+        prefilter,
+        sparse_set::SparseSets,
+        start::Start,
+    },
+};
+
+/// The minimum number of states that a lazy DFA's cache size must support.
+///
+/// This is checked at time of construction to ensure that at least some small
+/// number of states can fit in the given capacity allotment. If we can't fit
+/// at least this number of states, then the thinking is that it's pretty
+/// senseless to use the lazy DFA. More to the point, parts of the code do
+/// assume that the cache can fit at least some small number of states.
+const MIN_STATES: usize = 5;
+
+/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
+///
+/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
+/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
+/// Indeed, both support precisely the same regex features with precisely the
+/// same semantics.
+///
+/// Whereas a `dense::DFA` must be completely built to handle any input before
+/// it may be used for search, a lazy DFA starts off effectively empty. During
+/// a search, a lazy DFA will build itself depending on whether it has already
+/// computed the next transition or not. If it has, then it looks a lot like
+/// a `dense::DFA` internally: it does a very fast table based access to find
+/// the next transition. Otherwise, if the state hasn't been computed, then it
+/// does determinization _for that specific transition_ to compute the next DFA
+/// state.
+///
+/// The main selling point of a lazy DFA is that, in practice, it has
+/// the performance profile of a `dense::DFA` without the weakness of it
+/// taking worst case exponential time to build. Indeed, for each byte of
+/// input, the lazy DFA will construct at most one new DFA state. Thus, a
+/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
+/// pattern.len()` and `n ~ haystack.len()`).
+///
+/// The main downsides of a lazy DFA are:
+///
+/// 1. It requires mutable "cache" space during search. This is where the
+/// transition table, among other things, is stored.
+/// 2. In pathological cases (e.g., if the cache is too small), it will run
+/// out of room and either require a bigger cache capacity or will repeatedly
+/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
+/// will tend to be slower than a typical NFA simulation.
+///
+/// # Capabilities
+///
+/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
+/// operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
+/// is reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* lazy DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::hybrid::regex::Regex).
+///
+/// # Example
+///
+/// This shows how to build a lazy DFA with the default configuration and
+/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
+/// a cache and pass it to our search routine.
+///
+/// ```
+/// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let expected = Some(HalfMatch::must(0, 8));
+/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct DFA {
+    // The underlying NFA, as handed out by `DFA::nfa`.
+    nfa: Arc<thompson::NFA>,
+    // The stride expressed as a base-2 exponent; `stride()` is computed as
+    // `1 << stride2`.
+    stride2: usize,
+    // The byte classes from which `alphabet_len()` is derived.
+    classes: ByteClasses,
+    // NOTE(review): presumably the set of "quit bytes" mentioned in the
+    // search docs; confirm against the builder.
+    quitset: ByteSet,
+    // NOTE(review): presumably mirrors `Config::anchored`; confirm.
+    anchored: bool,
+    // NOTE(review): presumably the match semantics chosen at build time;
+    // confirm against the builder.
+    match_kind: MatchKind,
+    // NOTE(review): presumably whether per-pattern start states are
+    // available; confirm against the builder.
+    starts_for_each_pattern: bool,
+    // NOTE(review): presumably the capacity allotted to the `Cache`; the unit
+    // is not visible here, confirm against the builder.
+    cache_capacity: usize,
+    // NOTE(review): presumably the number of cache clears after which a
+    // search gives up, `None` meaning no limit; confirm.
+    minimum_cache_clear_count: Option<usize>,
+}
+
+impl DFA {
+    /// Parse the given regular expression using a default configuration and
+    /// return the corresponding lazy DFA.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa = DFA::new("foo[0-9]+bar")?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let expected = HalfMatch::must(0, 11);
+    /// assert_eq!(
+    ///     Some(expected),
+    ///     dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<DFA, BuildError> {
+        // `DFA::builder()` is nothing more than `Builder::new()`.
+        let builder = Builder::new();
+        builder.build(pattern)
+    }
+
+    /// Parse the given regular expressions using a default configuration and
+    /// return the corresponding lazy multi-DFA.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let expected = HalfMatch::must(1, 3);
+    /// assert_eq!(
+    ///     Some(expected),
+    ///     dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
+        // `DFA::builder()` is nothing more than `Builder::new()`.
+        let builder = Builder::new();
+        builder.build_many(patterns)
+    }
+
+    /// Create a new lazy DFA that matches every input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa = DFA::always_match()?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn always_match() -> Result<DFA, BuildError> {
+        // Share the always-matching NFA through an `Arc` and hand it to a
+        // fresh builder.
+        let shared = Arc::new(thompson::NFA::always_match());
+        DFA::builder().build_from_nfa(shared)
+    }
+
+    /// Create a new lazy DFA that never matches any input.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::never_match()?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?);
+    /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn never_match() -> Result<DFA, BuildError> {
+        // Share the never-matching NFA through an `Arc` and hand it to a
+        // fresh builder.
+        let shared = Arc::new(thompson::NFA::never_match());
+        DFA::builder().build_from_nfa(shared)
+    }
+
+    /// Return a default configuration for a `DFA`.
+    ///
+    /// This is a convenience routine to avoid needing to import the `Config`
+    /// type when customizing the construction of a lazy DFA.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to build a lazy DFA that only executes searches
+    /// in anchored mode.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let re = DFA::builder()
+    ///     .configure(DFA::config().anchored(true))
+    ///     .build(r"[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "abc123xyz".as_bytes();
+    /// // The haystack begins with 'abc', so an anchored search for digits
+    /// // finds nothing.
+    /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?);
+    /// // Slicing the haystack down to '123' puts the digits at offset 0.
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 3)),
+    ///     re.find_leftmost_fwd(&mut cache, &haystack[3..6])?,
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    /// Return a builder for configuring the construction of a lazy `DFA`.
+    ///
+    /// This is a convenience routine to avoid needing to import the
+    /// [`Builder`] type in common cases.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use the builder to disable UTF-8 mode
+    /// everywhere for lazy DFAs. This includes disabling it for both the
+    /// concrete syntax (e.g., `.` matches any byte and Unicode character
+    /// classes like `\p{Letter}` are not allowed) and for the unanchored
+    /// search prefix. The latter enables the regex to match anywhere in a
+    /// sequence of arbitrary bytes. (Typically, the unanchored search prefix
+    /// will only permit matching valid UTF-8.)
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::dfa::DFA,
+    ///     nfa::thompson,
+    ///     HalfMatch, SyntaxConfig,
+    /// };
+    ///
+    /// let re = DFA::builder()
+    ///     .syntax(SyntaxConfig::new().utf8(false))
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build(r"foo(?-u:[^b])ar.*")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+    /// let expected = Some(HalfMatch::must(0, 9));
+    /// let got = re.find_leftmost_fwd(&mut cache, haystack)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+
+    /// Create a new cache for this lazy DFA.
+    ///
+    /// The cache returned should only be used for searches with this
+    /// lazy DFA. If you want to reuse the cache for another DFA, then
+    /// you must call [`Cache::reset`] with that DFA (or, equivalently,
+    /// [`DFA::reset_cache`]).
+    pub fn create_cache(&self) -> Cache {
+        Cache::new(self)
+    }
+
+    /// Reset the given cache such that it can be used for searching with
+    /// this lazy DFA (and only this DFA).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different lazy DFA.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// lazy DFA has been configured to "give up" after it has cleared the
+    /// cache a certain number of times.
+    ///
+    /// Any lazy state ID generated by the cache prior to resetting it is
+    /// invalid after the reset.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different DFA.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa1 = DFA::new(r"\w")?;
+    /// let dfa2 = DFA::new(r"\W")?;
+    ///
+    /// let mut cache = dfa1.create_cache();
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 2)),
+    ///     dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+    /// );
+    ///
+    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the DFA we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+    /// // allowed.
+    /// dfa2.reset_cache(&mut cache);
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 3)),
+    ///     dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset_cache(&self, cache: &mut Cache) {
+        Lazy::new(self, cache).reset_cache()
+    }
+
+    /// Returns the total number of patterns compiled into this lazy DFA.
+    ///
+    /// In the case of a DFA that contains no patterns, this returns `0`.
+    ///
+    /// # Example
+    ///
+    /// This example shows the pattern count for a DFA that never matches:
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::never_match()?;
+    /// assert_eq!(dfa.pattern_count(), 0);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And another example for a DFA that matches at every position:
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::always_match()?;
+    /// assert_eq!(dfa.pattern_count(), 1);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And finally, a DFA that was constructed from multiple patterns:
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+    /// assert_eq!(dfa.pattern_count(), 3);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn pattern_count(&self) -> usize {
+        // The pattern count is whatever the underlying NFA reports.
+        let nfa: &thompson::NFA = &self.nfa;
+        nfa.pattern_len()
+    }
+
+    /// Returns a reference to the underlying NFA.
+    ///
+    /// The NFA is held behind an `Arc`, so callers that need their own
+    /// handle can clone the returned reference without copying the NFA.
+    pub fn nfa(&self) -> &Arc<thompson::NFA> {
+        &self.nfa
+    }
+
+    /// Returns the stride, as a base-2 exponent, required for these
+    /// equivalence classes.
+    ///
+    /// The stride is always the smallest power of 2 that is greater than or
+    /// equal to the alphabet length. This is done so that converting between
+    /// state IDs and indices can be done with shifts alone, which is much
+    /// faster than integer division.
+    ///
+    /// In other words, `stride()` is computed as `1 << stride2()`.
+    fn stride2(&self) -> usize {
+        self.stride2
+    }
+
+    /// Returns the stride of this lazy DFA, i.e. the number of transition
+    /// slots each state occupies in the transition table. It is always the
+    /// power of two whose exponent is `stride2()`.
+    fn stride(&self) -> usize {
+        let exponent = self.stride2();
+        1 << exponent
+    }
+
+    /// Returns the total number of elements in the alphabet for this
+    /// transition table. This is always less than or equal to `self.stride()`.
+    /// It is only equal when the alphabet length is a power of 2. Otherwise,
+    /// it is always strictly less.
+    ///
+    /// The length is reported by this DFA's byte classes.
+    fn alphabet_len(&self) -> usize {
+        self.classes.alphabet_len()
+    }
+
+    /// Returns the memory usage, in bytes, of this lazy DFA.
+    ///
+    /// This does **not** include the stack size used up by this lazy DFA. To
+    /// compute that, use `std::mem::size_of::<DFA>()`. This also does
+    /// not include the size of the `Cache` used.
+    ///
+    /// The value returned is exactly what the underlying NFA reports for
+    /// itself.
+    pub fn memory_usage(&self) -> usize {
+        // Everything else is on the stack.
+        self.nfa.memory_usage()
+    }
+}
+
+impl DFA {
+    /// Executes a forward search and returns the end position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state. This is useful for implementing boolean `is_match`-like
+    /// routines, where as little work is done as possible.
+    ///
+    /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as
+    /// providing a prefilter, a specific pattern to match and the bounds of
+    /// the search within the haystack. This routine is meant as a convenience
+    /// for common cases where the additional functionality is not needed.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates how the position returned might differ from
+    /// what one might expect when executing a traditional leftmost search.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa = DFA::new("foo[0-9]+")?;
+    /// let mut cache = dfa.create_cache();
+    /// // Normally, the end of the leftmost first match here would be 8,
+    /// // corresponding to the end of the input. But the "earliest" semantics
+    /// // of this routine cause it to stop as soon as a match is known, which
+    /// // occurs once 'foo[0-9]' has matched.
+    /// let expected = HalfMatch::must(0, 4);
+    /// assert_eq!(
+    ///     Some(expected),
+    ///     dfa.find_earliest_fwd(&mut cache, b"foo12345")?,
+    /// );
+    ///
+    /// let dfa = DFA::new("abc|a")?;
+    /// let mut cache = dfa.create_cache();
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let expected = HalfMatch::must(0, 1);
+    /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn find_earliest_fwd(
+        &self,
+        cache: &mut Cache,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        // Both optional arguments of the `_at` variant are left as `None`
+        // and the search bounds cover all of `bytes`.
+        let (start, end) = (0, bytes.len());
+        self.find_earliest_fwd_at(cache, None, None, bytes, start, end)
+    }
+
+    /// Executes a reverse search and returns the start position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state.
+    ///
+    /// Note that while it is not technically necessary to build a reverse
+    /// automaton to use a reverse search, it is likely that you'll want to do
+    /// so. Namely, the typical use of a reverse search is to find the starting
+    /// location of a match once its end is discovered from a forward search. A
+    /// reverse DFA automaton can be built by configuring the intermediate NFA
+    /// to be reversed via
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates how the position returned might differ from
+    /// what one might expect when executing a traditional leftmost reverse
+    /// search.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("[a-z]+[0-9]+")?;
+    /// let mut cache = dfa.create_cache();
+    /// // Normally, the end of the leftmost first match here would be 0,
+    /// // corresponding to the beginning of the input. But the "earliest"
+    /// // semantics of this routine cause it to stop as soon as a match is
+    /// // known, which occurs once '[a-z][0-9]+' has matched.
+    /// let expected = HalfMatch::must(0, 2);
+    /// assert_eq!(
+    ///     Some(expected),
+    ///     dfa.find_earliest_rev(&mut cache, b"foo12345")?,
+    /// );
+    ///
+    /// let dfa = DFA::builder()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("abc|c")?;
+    /// let mut cache = dfa.create_cache();
+    /// // Normally, the end of the leftmost first match here would be 0,
+    /// // but the shortest match semantics detect a match earlier.
+    /// let expected = HalfMatch::must(0, 2);
+    /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn find_earliest_rev(
+        &self,
+        cache: &mut Cache,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        // The optional argument of the `_at` variant is left as `None` and
+        // the search bounds cover all of `bytes`.
+        let (start, end) = (0, bytes.len());
+        self.find_earliest_rev_at(cache, None, bytes, start, end)
+    }
+
+    /// Executes a forward search and returns the end position of the leftmost
+    /// match that is found. If no match exists, then `None` is returned.
+    ///
+    /// In particular, this method continues searching even after it enters
+    /// a match state. The search only terminates once it has reached the
+    /// end of the input or when it has entered a dead or quit state. Upon
+    /// termination, the position of the last byte seen while still in a match
+    /// state is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// Leftmost first match semantics corresponds to the match with the
+    /// smallest starting offset, but where the end offset is determined by
+    /// preferring earlier branches in the original regular expression. For
+    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+    /// will match `Samwise` in `Samwise`.
+    ///
+    /// Generally speaking, the "leftmost first" match is how most backtracking
+    /// regular expressions tend to work. This is in contrast to POSIX-style
+    /// regular expressions that yield "leftmost longest" matches. Namely,
+    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+    /// leftmost longest semantics. (This crate does not currently support
+    /// leftmost longest semantics.)
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa = DFA::new("foo[0-9]+")?;
+    /// let mut cache = dfa.create_cache();
+    /// let expected = HalfMatch::must(0, 8);
+    /// assert_eq!(
+    ///     Some(expected),
+    ///     dfa.find_leftmost_fwd(&mut cache, b"foo12345")?,
+    /// );
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over later parts.
+    /// let dfa = DFA::new("abc|a")?;
+    /// let mut cache = dfa.create_cache();
+    /// let expected = HalfMatch::must(0, 3);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn find_leftmost_fwd(
+        &self,
+        cache: &mut Cache,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        // Both optional arguments of the `_at` variant are left as `None`
+        // and the search bounds cover all of `bytes`.
+        let (start, end) = (0, bytes.len());
+        self.find_leftmost_fwd_at(cache, None, None, bytes, start, end)
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// In particular, this method continues searching even after it enters
+    /// a match state. The search only terminates once it has reached the
+    /// end of the input or when it has entered a dead or quit state. Upon
+    /// termination, the position of the last byte seen while still in a match
+    /// state is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// In particular, this routine is principally
+    /// useful when used in conjunction with the
+    /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+    /// configuration. In general, it's unlikely to be correct to use both
+    /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since
+    /// any particular DFA will only support searching in one direction with
+    /// respect to the pattern.
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("foo[0-9]+")?;
+    /// let mut cache = dfa.create_cache();
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(
+    ///     Some(expected),
+    ///     dfa.find_leftmost_rev(&mut cache, b"foo12345")?,
+    /// );
+    ///
+    /// // Even though a match is found after reading the last byte (`c`),
+    /// // the leftmost first match semantics demand that we find the earliest
+    /// // match that prefers earlier parts of the pattern over latter parts.
+    /// let dfa = DFA::builder()
+    ///     .thompson(thompson::Config::new().reverse(true))
+    ///     .build("abc|c")?;
+    /// let mut cache = dfa.create_cache();
+    /// let expected = HalfMatch::must(0, 0);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn find_leftmost_rev(
+        &self,
+        cache: &mut Cache,
+        bytes: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len())
+    }
+
+    /// Executes an overlapping forward search and returns the end position of
+    /// matches as they are found. If no match exists, then `None` is returned.
+    ///
+    /// This routine is principally only useful when searching for multiple
+    /// patterns on inputs where multiple patterns may match the same regions
+    /// of text. In particular, callers must preserve the automaton's search
+    /// state from prior calls so that the implementation knows where the last
+    /// match occurred.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run a basic overlapping search. Notice
+    /// that we build the automaton with a `MatchKind::All` configuration.
+    /// Overlapping searches are unlikely to work as one would expect when
+    /// using the default `MatchKind::LeftmostFirst` match semantics, since
+    /// leftmost-first matching is fundamentally incompatible with overlapping
+    /// searches. Namely, overlapping searches need to report matches as they
+    /// are seen, whereas leftmost-first searches will continue searching even
+    /// after a match has been observed in order to find the conventional end
+    /// position of the match. More concretely, leftmost-first searches use
+    /// dead states to terminate a search after a specific match can no longer
+    /// be extended. Overlapping searches instead do the opposite by continuing
+    /// the search to find totally new matches (potentially of other patterns).
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, OverlappingState},
+    ///     HalfMatch,
+    ///     MatchKind,
+    /// };
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(HalfMatch::must(1, 4));
+    /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(HalfMatch::must(0, 4));
+    /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn find_overlapping_fwd(
+        &self,
+        cache: &mut Cache,
+        bytes: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        self.find_overlapping_fwd_at(
+            cache,
+            None,
+            None,
+            bytes,
+            0,
+            bytes.len(),
+            state,
+        )
+    }
+
+    /// Executes a forward search and returns the end position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state. This is useful for implementing boolean `is_match`-like
+    /// routines, where as little work is done as possible.
+    ///
+    /// This is like [`DFA::find_earliest_fwd`], except it provides some
+    /// additional control over how the search is executed:
+    ///
+    /// * `pre` is a prefilter scanner that, when given, is used whenever the
+    /// DFA enters its starting state. This is meant to speed up searches where
+    /// one or a small number of literal prefixes are known.
+    /// * `pattern_id` specifies a specific pattern in the DFA to run an
+    /// anchored search for. If not given, then a search for any pattern is
+    /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be
+    /// enabled to use this functionality.
+    /// * `start` and `end` permit searching a specific region of the haystack
+    /// `bytes`. This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `bytes`. (Because the existence of look-around
+    /// operations such as `\b`, `^` and `$` need to take the surrounding
+    /// context into account. This cannot be done if the haystack doesn't
+    /// contain it.)
+    ///
+    /// The examples below demonstrate each of these additional parameters.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and this lazy DFA does
+    /// not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    ///
+    /// # Example: prefilter
+    ///
+    /// This example shows how to provide a prefilter for a pattern where all
+    /// matches start with a `z` byte.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::dfa::DFA,
+    ///     util::prefilter::{Candidate, Prefilter, Scanner, State},
+    ///     HalfMatch,
+    /// };
+    ///
+    /// #[derive(Debug)]
+    /// pub struct ZPrefilter;
+    ///
+    /// impl Prefilter for ZPrefilter {
+    ///     fn next_candidate(
+    ///         &self,
+    ///         _: &mut State,
+    ///         haystack: &[u8],
+    ///         at: usize,
+    ///     ) -> Candidate {
+    ///         // Try changing b'z' to b'q' and observe this test fail since
+    ///         // the prefilter will skip right over the match.
+    ///         match haystack[at..].iter().position(|&b| b == b'z') {
+    ///             None => Candidate::None,
+    ///             Some(i) => Candidate::PossibleStartOfMatch(at + i),
+    ///         }
+    ///     }
+    ///
+    ///     fn heap_bytes(&self) -> usize {
+    ///         0
+    ///     }
+    /// }
+    ///
+    /// let dfa = DFA::new("z[0-9]{3}")?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let haystack = "foobar z123 q123".as_bytes();
+    /// // A scanner executes a prefilter while tracking some state that helps
+    /// // determine whether a prefilter is still "effective" or not.
+    /// let mut scanner = Scanner::new(&ZPrefilter);
+    ///
+    /// let expected = Some(HalfMatch::must(0, 11));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     &mut cache,
+    ///     Some(&mut scanner),
+    ///     None,
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: specific pattern search
+    ///
+    /// This example shows how to build a lazy multi-DFA that permits searching
+    /// for specific patterns.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::dfa::DFA,
+    ///     HalfMatch,
+    ///     PatternID,
+    /// };
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().starts_for_each_pattern(true))
+    ///     .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "foo123".as_bytes();
+    ///
+    /// // Since we are using the default leftmost-first match and both
+    /// // patterns match at the same starting position, only the first pattern
+    /// // will be returned in this case when doing a search for any of the
+    /// // patterns.
+    /// let expected = Some(HalfMatch::must(0, 6));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     &mut cache,
+    ///     None,
+    ///     None,
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // But if we want to check whether some other pattern matches, then we
+    /// // can provide its pattern ID.
+    /// let expected = Some(HalfMatch::must(1, 6));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     &mut cache,
+    ///     None,
+    ///     Some(PatternID::must(1)),
+    ///     haystack,
+    ///     0,
+    ///     haystack.len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: specifying the bounds of a search
+    ///
+    /// This example shows how providing the bounds of a search can produce
+    /// different results than simply sub-slicing the haystack.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// // N.B. We disable Unicode here so that we use a simple ASCII word
+    /// // boundary. Alternatively, we could enable heuristic support for
+    /// // Unicode word boundaries since our haystack is pure ASCII.
+    /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "foo123bar".as_bytes();
+    ///
+    /// // Since we sub-slice the haystack, the search doesn't know about the
+    /// // larger context and assumes that `123` is surrounded by word
+    /// // boundaries. And of course, the match position is reported relative
+    /// // to the sub-slice as well, which means we get `3` instead of `6`.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     &mut cache,
+    ///     None,
+    ///     None,
+    ///     &haystack[3..6],
+    ///     0,
+    ///     haystack[3..6].len(),
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // But if we provide the bounds of the search within the context of the
+    /// // entire haystack, then the search can take the surrounding context
+    /// // into account. (And if we did find a match, it would be reported
+    /// // as a valid offset into `haystack` instead of its sub-slice.)
+    /// let expected = None;
+    /// let got = dfa.find_earliest_fwd_at(
+    ///     &mut cache,
+    ///     None,
+    ///     None,
+    ///     haystack,
+    ///     3,
+    ///     6,
+    /// )?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn find_earliest_fwd_at(
+        &self,
+        cache: &mut Cache,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_earliest_fwd(
+            pre, self, cache, pattern_id, bytes, start, end,
+        )
+    }
+
+    /// Executes a reverse search and returns the start position of the first
+    /// match that is found as early as possible. If no match exists, then
+    /// `None` is returned.
+    ///
+    /// This routine stops scanning input as soon as the search observes a
+    /// match state.
+    ///
+    /// This is like [`DFA::find_earliest_rev`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`DFA::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_earliest_rev_at(
+        &self,
+        cache: &mut Cache,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_earliest_rev(self, cache, pattern_id, bytes, start, end)
+    }
+
+    /// Executes a forward search and returns the end position of the leftmost
+    /// match that is found. If no match exists, then `None` is returned.
+    ///
+    /// This is like [`DFA::find_leftmost_fwd`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
+    /// additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_leftmost_fwd_at(
+        &self,
+        cache: &mut Cache,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_fwd(
+            pre, self, cache, pattern_id, bytes, start, end,
+        )
+    }
+
+    /// Executes a reverse search and returns the start position of the
+    /// leftmost match that is found. If no match exists, then `None` is
+    /// returned.
+    ///
+    /// This is like [`DFA::find_leftmost_rev`], except it provides some
+    /// additional control over how the search is executed. See the
+    /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
+    /// additional parameters along with examples of their usage.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_leftmost_rev_at(
+        &self,
+        cache: &mut Cache,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_leftmost_rev(self, cache, pattern_id, bytes, start, end)
+    }
+
+    /// Executes an overlapping forward search and returns the end position of
+    /// matches as they are found. If no match exists, then `None` is returned.
+    ///
+    /// This routine is principally only useful when searching for multiple
+    /// patterns on inputs where multiple patterns may match the same regions
+    /// of text. In particular, callers must preserve the automaton's search
+    /// state from prior calls so that the implementation knows where the last
+    /// match occurred.
+    ///
+    /// This is like [`DFA::find_overlapping_fwd`], except it provides
+    /// some additional control over how the search is executed. See the
+    /// documentation of [`DFA::find_earliest_fwd_at`] for more details
+    /// on the additional parameters along with examples of their usage.
+    ///
+    /// When using this routine to implement an iterator of overlapping
+    /// matches, the `start` of the search should always be set to the end
+    /// of the last match. If more patterns match at the previous location,
+    /// then they will be immediately returned. (This is tracked by the given
+    /// overlapping state.) Otherwise, the search continues at the starting
+    /// position given.
+    ///
+    /// If for some reason you want the search to forget about its previous
+    /// state and restart the search at a particular position, then setting the
+    /// state to [`OverlappingState::start`] will accomplish that.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// lazy DFAs generated by this crate, this only occurs in non-default
+    /// configurations where quit bytes are used, Unicode word boundaries are
+    /// heuristically enabled or limits are set on the number of times the lazy
+    /// DFA's cache may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// # Panics
+    ///
+    /// This routine panics if a `pattern_id` is given and the underlying
+    /// DFA does not support specific pattern searches.
+    ///
+    /// It also panics if the given haystack range is not valid.
+    #[inline]
+    pub fn find_overlapping_fwd_at(
+        &self,
+        cache: &mut Cache,
+        pre: Option<&mut prefilter::Scanner>,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        search::find_overlapping_fwd(
+            pre, self, cache, pattern_id, bytes, start, end, state,
+        )
+    }
+}
+
+impl DFA {
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input.
+    ///
+    /// The given cache is used to either reuse pre-computed state
+    /// transitions, or to store this newly computed transition for future
+    /// reuse. Thus, this routine guarantees that it will never return a state
+    /// ID that has an "unknown" tag.
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is the lazy state ID returned
+    /// by the most recent call to `next_state`, `next_state_untagged`,
+    /// `next_state_untagged_unchecked`, `start_state_forward` or
+    /// `start_state_reverse` for the given `cache`. Any state ID returned from
+    /// prior calls to these routines (with the same `cache`) is considered
+    /// invalid (even if it gives an appearance of working). State IDs returned
+    /// from _any_ prior call for different `cache` values are also always
+    /// invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID. Moreover, this routine is defined for all possible values of
+    /// `input`.
+    ///
+    /// These validity rules are not checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid or
+    /// incorrect ID.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a lazy DFA for a given
+    /// haystack by using the `next_state` method.
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::new(r"[a-z]+r")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     sid = dfa.next_state(&mut cache, sid, b)?;
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+    /// // special "EOI" transition at the end of the search.
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    /// assert!(sid.is_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn next_state(
+        &self,
+        cache: &mut Cache,
+        current: LazyStateID,
+        input: u8,
+    ) -> Result<LazyStateID, CacheError> {
+        let class = usize::from(self.classes.get(input));
+        let offset = current.as_usize_untagged() + class;
+        let sid = cache.trans[offset];
+        if !sid.is_unknown() {
+            return Ok(sid);
+        }
+        let unit = alphabet::Unit::u8(input);
+        Lazy::new(self, cache).cache_next_state(current, unit)
+    }
+
+    /// Transitions from the current state to the next state, given the next
+    /// byte of input and a state ID that is not tagged.
+    ///
+    /// The only reason to use this routine is performance. In particular, the
+    /// `next_state` method needs to do some additional checks, among them is
+    /// to account for identifiers to states that are not yet computed. In
+    /// such a case, the transition is computed on the fly. However, if it is
+    /// known that the `current` state ID is untagged, then these checks can be
+    /// omitted.
+    ///
+    /// Since this routine does not compute states on the fly, it does not
+    /// modify the cache and thus cannot return an error. Consequently, `cache`
+    /// does not need to be mutable and it is possible for this routine to
+    /// return a state ID corresponding to the special "unknown" state. In
+    /// this case, it is the caller's responsibility to use the prior state
+    /// ID and `input` with `next_state` in order to force the computation of
+    /// the unknown transition. Otherwise, trying to use the "unknown" state
+    /// ID will just result in transitioning back to itself, and thus never
+    /// terminating. (This is technically a special exemption to the state ID
+    /// validity rules, but is permissible since this routine is guaranteed to
+    /// never mutate the given `cache`, and thus the identifier is guaranteed
+    /// to remain valid.)
+    ///
+    /// See [`LazyStateID`] for more details on what it means for a state ID
+    /// to be tagged. Also, see
+    /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
+    /// for this same idea, but with bounds checks forcefully elided.
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is an **untagged** lazy
+    /// state ID returned by the most recent call to `next_state`,
+    /// `next_state_untagged`, `next_state_untagged_unchecked`,
+    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+    /// Any state ID returned from prior calls to these routines (with the
+    /// same `cache`) is considered invalid (even if it gives an appearance
+    /// of working). State IDs returned from _any_ prior call for different
+    /// `cache` values are also always invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID, although it may be tagged. Moreover, this routine is defined for
+    /// all possible values of `input`.
+    ///
+    /// Not all validity rules are checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid or
+    /// incorrect ID.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a lazy DFA for a given
+    /// haystack by using the `next_state_untagged` method where possible.
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::new(r"[a-z]+r")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// // Walk all the bytes in the haystack.
+    /// let mut at = 0;
+    /// while at < haystack.len() {
+    ///     if sid.is_tagged() {
+    ///         sid = dfa.next_state(&mut cache, sid, haystack[at])?;
+    ///     } else {
+    ///         let mut prev_sid = sid;
+    ///         // We attempt to chew through as much as we can while moving
+    ///         // through untagged state IDs. Thus, the transition function
+    ///         // does less work on average per byte. (Unrolling this loop
+    ///         // may help even more.)
+    ///         while at < haystack.len() {
+    ///             prev_sid = sid;
+    ///             sid = dfa.next_state_untagged(
+    ///                 &mut cache, sid, haystack[at],
+    ///             );
+    ///             at += 1;
+    ///             if sid.is_tagged() {
+    ///                 break;
+    ///             }
+    ///         }
+    ///         // We must ensure that we never proceed to the next iteration
+    ///         // with an unknown state ID. If we don't account for this
+    ///         // case, then search isn't guaranteed to terminate since all
+    ///         // transitions on unknown states loop back to itself.
+    ///         if sid.is_unknown() {
+    ///             sid = dfa.next_state(
+    ///                 &mut cache, prev_sid, haystack[at - 1],
+    ///             )?;
+    ///         }
+    ///     }
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+    /// // special "EOI" transition at the end of the search.
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    /// assert!(sid.is_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn next_state_untagged(
+        &self,
+        cache: &Cache,
+        current: LazyStateID,
+        input: u8,
+    ) -> LazyStateID {
+        debug_assert!(!current.is_tagged());
+        let class = usize::from(self.classes.get(input));
+        let offset = current.as_usize_unchecked() + class;
+        cache.trans[offset]
+    }
+
+    /// Transitions from the current state to the next state, eliding bounds
+    /// checks, given the next byte of input and a state ID that is not tagged.
+    ///
+    /// The only reason to use this routine is performance. In particular, the
+    /// `next_state` method needs to do some additional checks, among them is
+    /// to account for identifiers to states that are not yet computed. In
+    /// such a case, the transition is computed on the fly. However, if it is
+    /// known that the `current` state ID is untagged, then these checks can be
+    /// omitted.
+    ///
+    /// Since this routine does not compute states on the fly, it does not
+    /// modify the cache and thus cannot return an error. Consequently, `cache`
+    /// does not need to be mutable and it is possible for this routine to
+    /// return a state ID corresponding to the special "unknown" state. In
+    /// this case, it is the caller's responsibility to use the prior state
+    /// ID and `input` with `next_state` in order to force the computation of
+    /// the unknown transition. Otherwise, trying to use the "unknown" state
+    /// ID will just result in transitioning back to itself, and thus never
+    /// terminating. (This is technically a special exemption to the state ID
+    /// validity rules, but is permissible since this routine is guaranteed to
+    /// never mutate the given `cache`, and thus the identifier is guaranteed
+    /// to remain valid.)
+    ///
+    /// See [`LazyStateID`] for more details on what it means for a state ID
+    /// to be tagged. Also, see
+    /// [`next_state_untagged`](DFA::next_state_untagged)
+    /// for this same idea, but with memory safety guaranteed by retaining
+    /// bounds checks.
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is an **untagged** lazy
+    /// state ID returned by the most recent call to `next_state`,
+    /// `next_state_untagged`, `next_state_untagged_unchecked`,
+    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+    /// Any state ID returned from prior calls to these routines (with the
+    /// same `cache`) is considered invalid (even if it gives an appearance
+    /// of working). State IDs returned from _any_ prior call for different
+    /// `cache` values are also always invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID, although it may be tagged. Moreover, this routine is defined for
+    /// all possible values of `input`.
+    ///
+    /// Not all validity rules are checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Because bounds checks are elided, violating these state ID validity
+    /// rules may result in undefined behavior. See the safety section below.
+    ///
+    /// # Safety
+    ///
+    /// Callers of this method must guarantee that `current` refers to a valid
+    /// state ID according to the rules described above. If `current` is not a
+    /// valid state ID for this automaton, then calling this routine may result
+    /// in undefined behavior.
+    ///
+    /// If `current` is valid, then the ID returned is valid for all possible
+    /// values of `input`.
+    #[inline]
+    pub unsafe fn next_state_untagged_unchecked(
+        &self,
+        cache: &Cache,
+        current: LazyStateID,
+        input: u8,
+    ) -> LazyStateID {
+        debug_assert!(!current.is_tagged());
+        let class = usize::from(self.classes.get(input));
+        let offset = current.as_usize_unchecked() + class;
+        *cache.trans.get_unchecked(offset)
+    }
+
+    /// Transitions from the current state to the next state for the special
+    /// EOI symbol.
+    ///
+    /// The given cache is used to either reuse pre-computed state
+    /// transitions, or to store this newly computed transition for future
+    /// reuse. Thus, this routine guarantees that it will never return a state
+    /// ID that has an "unknown" tag.
+    ///
+    /// This routine must be called at the end of every search in a correct
+    /// implementation of search. Namely, lazy DFAs in this crate delay matches
+    /// by one byte in order to support look-around operators. Thus, after
+    /// reaching the end of a haystack, a search implementation must follow one
+    /// last EOI transition.
+    ///
+    /// It is best to think of EOI as an additional symbol in the alphabet of a
+    /// DFA that is distinct from every other symbol. That is, the alphabet of
+    /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
+    /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+    /// physical alphabet size may be smaller because of alphabet compression
+    /// via equivalence classes, but EOI is always represented somehow in the
+    /// alphabet.)
+    ///
+    /// # State identifier validity
+    ///
+    /// The only valid value for `current` is the lazy state ID returned
+    /// by the most recent call to `next_state`, `next_state_untagged`,
+    /// `next_state_untagged_unchecked`, `start_state_forward` or
+    /// `start_state_reverse` for the given `cache`. Any state ID returned from
+    /// prior calls to these routines (with the same `cache`) is considered
+    /// invalid (even if it gives an appearance of working). State IDs returned
+    /// from _any_ prior call for different `cache` values are also always
+    /// invalid.
+    ///
+    /// The returned ID is always a valid ID when `current` refers to a valid
+    /// ID.
+    ///
+    /// These validity rules are not checked, even in debug mode. Callers are
+    /// required to uphold these rules themselves.
+    ///
+    /// Violating these state ID validity rules will not sacrifice memory
+    /// safety, but _may_ produce an incorrect result or a panic.
+    ///
+    /// # Panics
+    ///
+    /// If the given ID does not refer to a valid state, then this routine
+    /// may panic but it also may not panic and instead return an invalid or
+    /// incorrect ID.
+    ///
+    /// # Example
+    ///
+    /// This shows a simplistic example for walking a DFA for a given haystack,
+    /// and then finishing the search with the final EOI transition.
+    ///
+    /// ```
+    /// use regex_automata::hybrid::dfa::DFA;
+    ///
+    /// let dfa = DFA::new(r"[a-z]+r")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     sid = dfa.next_state(&mut cache, sid, b)?;
+    /// }
+    /// // Matches are always delayed by 1 byte, so we must explicitly walk
+    /// // the special "EOI" transition at the end of the search. Without this
+    /// // final transition, the assert below will fail since the DFA will not
+    /// // have entered a match state yet!
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    /// assert!(sid.is_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn next_eoi_state(
+        &self,
+        cache: &mut Cache,
+        current: LazyStateID,
+    ) -> Result<LazyStateID, CacheError> {
+        let eoi = self.classes.eoi().as_usize();
+        let offset = current.as_usize_untagged() + eoi;
+        let sid = cache.trans[offset];
+        if !sid.is_unknown() {
+            return Ok(sid);
+        }
+        let unit = self.classes.eoi();
+        Lazy::new(self, cache).cache_next_state(current, unit)
+    }
+
+    /// Return the ID of the start state for this lazy DFA when executing a
+    /// forward search.
+    ///
+    /// Unlike typical DFA implementations, the start state for DFAs in this
+    /// crate is dependent on a few different factors:
+    ///
+    /// * The pattern ID, if present. When the underlying DFA has been
+    /// configured with multiple patterns _and_ the DFA has been configured to
+    /// build an anchored start state for each pattern, then a pattern ID may
+    /// be specified to execute an anchored search for that specific pattern.
+    /// If `pattern_id` is invalid or if the DFA isn't configured to build
+    /// start states for each pattern, then implementations must panic. DFAs in
+    /// this crate can be configured to build start states for each pattern via
+    /// [`Config::starts_for_each_pattern`].
+    /// * When `start > 0`, the byte at index `start - 1` may influence the
+    /// start state if the regex uses `^` or `\b`.
+    /// * Similarly, when `start == 0`, it may influence the start state when
+    /// the regex uses `^` or `\A`.
+    /// * Currently, `end` is unused.
+    /// * Whether the search is a forward or reverse search. This routine can
+    /// only be used for forward searches.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
+    /// also panics if `pattern_id` is non-None and does not refer to a valid
+    /// pattern, or if the DFA was not configured to build anchored start
+    /// states for each pattern.
+    #[inline]
+    pub fn start_state_forward(
+        &self,
+        cache: &mut Cache,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<LazyStateID, CacheError> {
+        let mut lazy = Lazy::new(self, cache);
+        let start_type = Start::from_position_fwd(bytes, start, end);
+        let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
+        if !sid.is_unknown() {
+            return Ok(sid);
+        }
+        lazy.cache_start_group(pattern_id, start_type)
+    }
+
+    /// Return the ID of the start state for this lazy DFA when executing a
+    /// reverse search.
+    ///
+    /// Unlike typical DFA implementations, the start state for DFAs in this
+    /// crate is dependent on a few different factors:
+    ///
+    /// * The pattern ID, if present. When the underlying DFA has been
+    /// configured with multiple patterns _and_ the DFA has been configured to
+    /// build an anchored start state for each pattern, then a pattern ID may
+    /// be specified to execute an anchored search for that specific pattern.
+    /// If `pattern_id` is invalid or if the DFA isn't configured to build
+    /// start states for each pattern, then implementations must panic. DFAs in
+    /// this crate can be configured to build start states for each pattern via
+    /// [`Config::starts_for_each_pattern`].
+    /// * When `end < bytes.len()`, the byte at index `end` may influence the
+    /// start state if the regex uses `$` or `\b`.
+    /// * Similarly, when `end == bytes.len()`, it may influence the start
+    /// state when the regex uses `$` or `\z`.
+    /// * Currently, `start` is unused.
+    /// * Whether the search is a forward or reverse search. This routine can
+    /// only be used for reverse searches.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
+    /// also panics if `pattern_id` is non-None and does not refer to a valid
+    /// pattern, or if the DFA was not configured to build anchored start
+    /// states for each pattern.
+    #[inline]
+    pub fn start_state_reverse(
+        &self,
+        cache: &mut Cache,
+        pattern_id: Option<PatternID>,
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<LazyStateID, CacheError> {
+        let mut lazy = Lazy::new(self, cache);
+        let start_type = Start::from_position_rev(bytes, start, end);
+        let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
+        if !sid.is_unknown() {
+            return Ok(sid);
+        }
+        lazy.cache_start_group(pattern_id, start_type)
+    }
+
+    /// Returns the total number of patterns that match in this state.
+    ///
+    /// If the lazy DFA was compiled with one pattern, then this must
+    /// necessarily always return `1` for all match states.
+    ///
+    /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
+    /// indices up to (but not including) the count returned by this routine
+    /// without panicking.
+    ///
+    /// If the given state is not a match state, then this may either panic
+    /// or return an incorrect result.
+    ///
+    /// # Example
+    ///
+    /// This example shows a simple instance of implementing overlapping
+    /// matches. In particular, it shows not only how to determine how many
+    /// patterns have matched in a particular state, but also how to access
+    /// which specific patterns have matched.
+    ///
+    /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
+    /// when building the DFA. If we used
+    /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
+    /// instead, then the DFA would not be constructed in a way that supports
+    /// overlapping matches. (It would only report a single pattern that
+    /// matches at any particular point in time.)
+    ///
+    /// Another thing to take note of is the patterns used and the order in
+    /// which the pattern IDs are reported. In the example below, pattern `3`
+    /// is yielded first. Why? Because it corresponds to the match that
+    /// appears first. Namely, the `@` symbol is part of `\S+` but not part
+    /// of any of the other patterns. Since the `\S+` pattern has a match that
+    /// starts to the left of any other pattern, its ID is returned before any
+    /// other.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, MatchKind};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[
+    ///         r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
+    ///     ])?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "@bar".as_bytes();
+    ///
+    /// // The start state is determined by inspecting the position and the
+    /// // initial bytes of the haystack.
+    /// let mut sid = dfa.start_state_forward(
+    ///     &mut cache, None, haystack, 0, haystack.len(),
+    /// )?;
+    /// // Walk all the bytes in the haystack.
+    /// for &b in haystack {
+    ///     sid = dfa.next_state(&mut cache, sid, b)?;
+    /// }
+    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+    ///
+    /// assert!(sid.is_match());
+    /// assert_eq!(dfa.match_count(&mut cache, sid), 3);
+    /// // The following calls are guaranteed to not panic since `match_count`
+    /// // returned `3` above.
+    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
+    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
+    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize {
+        assert!(id.is_match());
+        LazyRef::new(self, cache).get_cached_state(id).match_count()
+    }
+
+    /// Returns the pattern ID corresponding to the given match index in the
+    /// given state.
+    ///
+    /// See [`DFA::match_count`] for an example of how to use this method
+    /// correctly. Note that if you know your lazy DFA is configured with a
+    /// single pattern, then this routine is never necessary since it will
+    /// always return a pattern ID of `0` for an index of `0` when `id`
+    /// corresponds to a match state.
+    ///
+    /// Typically, this routine is used when implementing an overlapping
+    /// search, as the example for `DFA::match_count` does.
+    ///
+    /// # Panics
+    ///
+    /// If the state ID is not a match state or if the match index is out
+    /// of bounds for the given state, then this routine may either panic
+    /// or produce an incorrect result. If the state ID is correct and the
+    /// match index is correct, then this routine always produces a valid
+    /// `PatternID`.
+    #[inline]
+    pub fn match_pattern(
+        &self,
+        cache: &Cache,
+        id: LazyStateID,
+        match_index: usize,
+    ) -> PatternID {
+        // This is an optimization for the very common case of a DFA with a
+        // single pattern. This conditional avoids a somewhat more costly path
+        // that finds the pattern ID from the corresponding `State`, which
+        // requires a bit of slicing/pointer-chasing. This optimization tends
+        // to only matter when matches are frequent.
+        if self.pattern_count() == 1 {
+            return PatternID::ZERO;
+        }
+        LazyRef::new(self, cache)
+            .get_cached_state(id)
+            .match_pattern(match_index)
+    }
+}
+
+/// A cache represents a partially computed DFA.
+///
+/// A cache is the key component that differentiates a classical DFA and a
+/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
+/// complete transition table that can handle all possible inputs, a hybrid
+/// NFA/DFA starts with an empty transition table and builds only the parts
+/// required during search. The parts that are built are stored in a cache. For
+/// this reason, a cache is a required parameter for nearly every operation on
+/// a [`DFA`].
+///
+/// Caches can be created from their corresponding DFA via
+/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
+/// created it, or the DFA that was most recently used to reset it with
+/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
+/// or incorrect results.
+#[derive(Clone, Debug)]
+pub struct Cache {
+    // N.B. If you're looking to understand how determinization works, it
+    // is probably simpler to first grok src/dfa/determinize.rs, since that
+    // doesn't have the "laziness" component.
+    /// The transition table.
+    ///
+    /// Given a `current` LazyStateID and an `input` byte, the next state can
+    /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
+    /// that no multiplication is used. That's because state identifiers are
+    /// "premultiplied."
+    ///
+    /// Note that the next state may be the "unknown" state. In this case, the
+    /// next state is not known and determinization for `current` on `input`
+    /// must be performed. Every freshly added state starts out with all of
+    /// its transitions set to "unknown."
+    trans: Vec<LazyStateID>,
+    /// The starting states for this DFA.
+    ///
+    /// These are computed lazily. Initially, these are all set to "unknown"
+    /// lazy state IDs.
+    ///
+    /// When 'starts_for_each_pattern' is disabled (the default), then the size
+    /// of this is constrained to the possible starting configurations based
+    /// on the search parameters. (At time of writing, that's 4.) However,
+    /// when starting states for each pattern is enabled, then there are N
+    /// additional groups of starting states, where each group reflects the
+    /// different possible configurations and N is the number of patterns.
+    starts: Vec<LazyStateID>,
+    /// A sequence of NFA/DFA powerset states that have been computed for this
+    /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
+    /// tagged LazyStateID can be used to index this sequence by converting it
+    /// to its untagged form.)
+    states: Vec<State>,
+    /// A map from states to their corresponding IDs. This map may be accessed
+    /// via the raw byte representation of a state, which means that a `State`
+    /// does not need to be allocated to determine whether it already exists
+    /// in this map. Indeed, the existence of such a state is what determines
+    /// whether we allocate a new `State` or not.
+    ///
+    /// The higher level idea here is that we do just enough determinization
+    /// for a state to check whether we've already computed it. If we have,
+    /// then we can save a little (albeit not much) work. The real savings is
+    /// in memory usage. If we never checked for trivially duplicate states,
+    /// then our memory usage would explode to unreasonable levels.
+    states_to_id: StateMap,
+    /// Sparse sets used to track which NFA states have been visited during
+    /// various traversals.
+    sparses: SparseSets,
+    /// Scratch space for traversing the NFA graph. (We use space on the heap
+    /// instead of the call stack.)
+    stack: Vec<NFAStateID>,
+    /// Scratch space for building a NFA/DFA powerset state. This is used to
+    /// help amortize allocation since not every powerset state generated is
+    /// added to the cache. In particular, if it already exists in the cache,
+    /// then there is no need to allocate a new `State` for it.
+    scratch_state_builder: StateBuilderEmpty,
+    /// A simple abstraction for handling the saving of at most a single state
+    /// across a cache clearing. This is required for correctness. Namely, if
+    /// adding a new state after clearing the cache fails, then the caller
+    /// must retain the ability to continue using the state ID given. The
+    /// state corresponding to the state ID is what we preserve across cache
+    /// clearings.
+    state_saver: StateSaver,
+    /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
+    /// track this as new states are added since states use a variable amount
+    /// of heap. Tracking this as we add states makes it possible to compute
+    /// the total amount of memory used by the determinizer in constant time.
+    /// It is one of the terms summed by `Cache::memory_usage`.
+    memory_usage_state: usize,
+    /// The number of times the cache has been cleared. When a minimum cache
+    /// clear count is set, then the cache will return an error instead of
+    /// clearing the cache if the count has been exceeded. This value is
+    /// reported to callers by `Cache::clear_count`.
+    clear_count: usize,
+}
+
+impl Cache {
+    /// Create a new cache for the given lazy DFA.
+    ///
+    /// The cache returned should only be used for searches for the given DFA.
+    /// If you want to reuse the cache for another DFA, then you must call
+    /// [`Cache::reset`] with that DFA.
+    pub fn new(dfa: &DFA) -> Cache {
+        let mut cache = Cache {
+            trans: alloc::vec![],
+            starts: alloc::vec![],
+            states: alloc::vec![],
+            states_to_id: StateMap::new(),
+            sparses: SparseSets::new(dfa.nfa.len()),
+            stack: alloc::vec![],
+            scratch_state_builder: StateBuilderEmpty::new(),
+            state_saver: StateSaver::none(),
+            memory_usage_state: 0,
+            clear_count: 0,
+        };
+        Lazy { dfa, cache: &mut cache }.init_cache();
+        cache
+    }
+
+    /// Reset this cache such that it can be used for searching with the given
+    /// lazy DFA (and only that DFA).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different lazy DFA.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// lazy DFA has been configured to "give up" after it has cleared the
+    /// cache a certain number of times.
+    ///
+    /// Any lazy state ID generated by the cache prior to resetting it is
+    /// invalid after the reset.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different DFA.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa1 = DFA::new(r"\w")?;
+    /// let dfa2 = DFA::new(r"\W")?;
+    ///
+    /// let mut cache = dfa1.create_cache();
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 2)),
+    ///     dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+    /// );
+    ///
+    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the DFA we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+    /// // allowed.
+    /// cache.reset(&dfa2);
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 3)),
+    ///     dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset(&mut self, dfa: &DFA) {
+        Lazy::new(dfa, self).reset_cache()
+    }
+
+    /// Returns the total number of times this cache has been cleared since it
+    /// was either created or last reset.
+    ///
+    /// This is useful for informational purposes or if you want to change
+    /// search strategies based on the number of times the cache has been
+    /// cleared.
+    pub fn clear_count(&self) -> usize {
+        self.clear_count
+    }
+
+    /// Returns the heap memory usage, in bytes, of this cache.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub fn memory_usage(&self) -> usize {
+        const ID_SIZE: usize = size_of::<LazyStateID>();
+        const STATE_SIZE: usize = size_of::<State>();
+
+        self.trans.len() * ID_SIZE
+        + self.starts.len() * ID_SIZE
+        + self.states.len() * STATE_SIZE
+        // Maps likely use more memory than this, but it's probably close.
+        + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
+        + self.sparses.memory_usage()
+        + self.stack.capacity() * ID_SIZE
+        + self.scratch_state_builder.capacity()
+        // Heap memory used by 'State' in both 'states' and 'states_to_id'.
+        + self.memory_usage_state
+    }
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// ones based on FNV, have not yet been benchmarked.) Without std, a
+/// `BTreeMap` from `alloc` is used instead.
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, LazyStateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
+
+/// A type that groups methods that require the base NFA/DFA and writable
+/// access to the cache.
+#[derive(Debug)]
+struct Lazy<'i, 'c> {
+    /// The lazy DFA (and its NFA) that new states are determinized from.
+    dfa: &'i DFA,
+    /// The cache that computed states and transitions are written into.
+    cache: &'c mut Cache,
+}
+
+impl<'i, 'c> Lazy<'i, 'c> {
+    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
+        Lazy { dfa, cache }
+    }
+
+    /// Return an immutable view by downgrading a writable cache to a read-only
+    /// cache.
+    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
+        LazyRef::new(self.dfa, self.cache)
+    }
+
+    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
+    /// like 'next_state' and 'next_eoi_state' that are called in critical
+    /// areas. The idea is to let the optimizer focus on the other areas of
+    /// those methods as the hot path.
+    ///
+    /// Here's an example that justifies 'inline(never)'
+    ///
+    /// ```ignore
+    /// regex-cli find hybrid dfa \
+    ///   @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
+    /// ```
+    ///
+    /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
+    /// codepoint, in sequence, repeated 100 times.
+    ///
+    /// With 'inline(never)' hyperfine reports 1.1s per run. With
+    /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
+    #[inline(never)]
+    fn cache_next_state(
+        &mut self,
+        mut current: LazyStateID,
+        unit: alphabet::Unit,
+    ) -> Result<LazyStateID, CacheError> {
+        let stride2 = self.dfa.stride2();
+        let empty_builder = self.get_state_builder();
+        // Determinize the transition out of 'current' on 'unit'. State IDs
+        // are premultiplied, so shifting by 'stride2' turns the untagged ID
+        // back into an index into 'states'.
+        let builder = determinize::next(
+            &self.dfa.nfa,
+            self.dfa.match_kind,
+            &mut self.cache.sparses,
+            &mut self.cache.stack,
+            &self.cache.states[current.as_usize_untagged() >> stride2],
+            unit,
+            empty_builder,
+        );
+        // If the new state does not fit, adding it may clear the cache. In
+        // that case 'current' must be saved so that it survives the clearing.
+        let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
+        if save_state {
+            self.save_state(current);
+        }
+        let next = self.add_builder_state(builder, |sid| sid)?;
+        if save_state {
+            // Use the ID reported for the saved state from here on, since
+            // the old ID may no longer be valid after a clearing.
+            current = self.saved_state_id();
+        }
+        // This is the payoff. The next time 'next_state' is called with this
+        // state and alphabet unit, it will find this transition and avoid
+        // having to re-determinize this transition.
+        self.set_transition(current, unit, next);
+        Ok(next)
+    }
+
+    /// Compute and cache the starting state for the given pattern ID (if
+    /// present) and the starting configuration.
+    ///
+    /// This panics if a pattern ID is given and the DFA isn't configured to
+    /// build anchored start states for each pattern.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn cache_start_group(
+        &mut self,
+        pattern_id: Option<PatternID>,
+        start: Start,
+    ) -> Result<LazyStateID, CacheError> {
+        let nfa_start_id = match pattern_id {
+            Some(pid) => {
+                assert!(
+                    self.dfa.starts_for_each_pattern,
+                    "attempted to search for a specific pattern \
+                     without enabling starts_for_each_pattern",
+                );
+                self.dfa.nfa.start_pattern(pid)
+            }
+            None if self.dfa.anchored => self.dfa.nfa.start_anchored(),
+            None => self.dfa.nfa.start_unanchored(),
+        };
+
+        let id = self.cache_start_one(nfa_start_id, start)?;
+        self.set_start_state(pattern_id, start, id);
+        Ok(id)
+    }
+
+    /// Compute and cache the starting state for the given NFA state ID and the
+    /// starting configuration. The NFA state ID might be one of the following:
+    ///
+    /// 1) An unanchored start state to match any pattern.
+    /// 2) An anchored start state to match any pattern.
+    /// 3) An anchored start state for a particular pattern.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn cache_start_one(
+        &mut self,
+        nfa_start_id: NFAStateID,
+        start: Start,
+    ) -> Result<LazyStateID, CacheError> {
+        // Take the scratch builder and seed its look-behind information from
+        // the starting configuration.
+        let mut builder_matches = self.get_state_builder().into_matches();
+        determinize::set_lookbehind_from_start(&start, &mut builder_matches);
+        // Compute the epsilon closure of the NFA start state into 'set1',
+        // which must be emptied first.
+        self.cache.sparses.set1.clear();
+        determinize::epsilon_closure(
+            self.dfa.nfa.borrow(),
+            nfa_start_id,
+            *builder_matches.look_have(),
+            &mut self.cache.stack,
+            &mut self.cache.sparses.set1,
+        );
+        // Move the builder to its NFA-state phase and copy the closure in.
+        let mut builder = builder_matches.into_nfa();
+        determinize::add_nfa_states(
+            self.dfa.nfa.borrow(),
+            &self.cache.sparses.set1,
+            &mut builder,
+        );
+        // Newly allocated IDs are tagged as start states via 'to_start'.
+        self.add_builder_state(builder, |id| id.to_start())
+    }
+
+    /// Either add the given builder state to this cache, or return an ID to an
+    /// equivalent state already in this cache.
+    ///
+    /// In the case where no equivalent state exists, the idmap function given
+    /// may be used to transform the identifier allocated. This is useful if
+    /// the caller needs to tag the ID with additional information.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn add_builder_state(
+        &mut self,
+        builder: StateBuilderNFA,
+        idmap: impl Fn(LazyStateID) -> LazyStateID,
+    ) -> Result<LazyStateID, CacheError> {
+        if let Some(&cached_id) =
+            self.cache.states_to_id.get(builder.as_bytes())
+        {
+            // Since we have a cached state, put the constructed state's
+            // memory back into our scratch space, so that it can be reused.
+            self.put_state_builder(builder);
+            return Ok(cached_id);
+        }
+        let result = self.add_state(builder.to_state(), idmap);
+        self.put_state_builder(builder);
+        result
+    }
+
+    /// Allocate a new state ID and add the given state to this cache.
+    ///
+    /// The idmap function given may be used to transform the identifier
+    /// allocated. This is useful if the caller needs to tag the ID with
+    /// additional information.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn add_state(
+        &mut self,
+        state: State,
+        idmap: impl Fn(LazyStateID) -> LazyStateID,
+    ) -> Result<LazyStateID, CacheError> {
+        // Make room first: if the state does not fit, try to clear the
+        // cache, bailing out with an error if that is not permitted.
+        if !self.as_ref().state_fits_in_cache(&state) {
+            self.try_clear_cache()?;
+        }
+        // It's important for this to come second, since the above may clear
+        // the cache. If we clear the cache after ID generation, then the ID
+        // is likely bunk since it would have been generated based on a larger
+        // transition table.
+        let mut id = idmap(self.next_state_id()?);
+        if state.is_match() {
+            id = id.to_match();
+        }
+        // Add room in the transition table. Since this is a fresh state, all
+        // of its transitions are unknown.
+        self.cache.trans.extend(
+            iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
+        );
+        // When we add a sentinel state, we never want to set any quit
+        // transitions. Technically, this is harmless, since sentinel states
+        // have all of their transitions set to loop back to themselves. But
+        // when creating sentinel states before the quit sentinel state,
+        // this will try to call 'set_transition' on a state ID that doesn't
+        // actually exist yet, which isn't allowed. So we just skip doing so
+        // entirely.
+        if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
+            let quit_id = self.as_ref().quit_id();
+            for b in self.dfa.quitset.iter() {
+                self.set_transition(id, alphabet::Unit::u8(b), quit_id);
+            }
+        }
+        // Account for the state's heap usage, then record it both in the
+        // ID-indexed sequence and in the reverse map used for deduplication.
+        self.cache.memory_usage_state += state.memory_usage();
+        self.cache.states.push(state.clone());
+        self.cache.states_to_id.insert(state, id);
+        Ok(id)
+    }
+
+    /// Allocate a new state ID.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
+        let sid = match LazyStateID::new(self.cache.trans.len()) {
+            Ok(sid) => sid,
+            Err(_) => {
+                self.try_clear_cache()?;
+                // This has to pass since we check that ID capacity at
+                // construction time can fit at least MIN_STATES states.
+                LazyStateID::new(self.cache.trans.len()).unwrap()
+            }
+        };
+        Ok(sid)
+    }
+
+    /// Attempt to clear the cache used by this lazy DFA.
+    ///
+    /// If clearing the cache exceeds the minimum number of required cache
+    /// clearings, then this will return a cache error. In this case,
+    /// callers should bubble this up as the cache can't be used until it is
+    /// reset. Implementations of search should convert this error into a
+    /// `MatchError::GaveUp`.
+    ///
+    /// If 'self.state_saver' is set to save a state, then this state is
+    /// persisted through cache clearing. Otherwise, the cache is returned to
+    /// its state after initialization with two exceptions: its clear count
+    /// is incremented and some of its memory likely has additional capacity.
+    /// That is, clearing a cache does _not_ release memory.
+    ///
+    /// Otherwise, any lazy state ID generated by the cache prior to resetting
+    /// it is invalid after the reset.
+    fn try_clear_cache(&mut self) -> Result<(), CacheError> {
+        // Currently, the only heuristic we use is the minimum cache clear
+        // count. If we pass that minimum, then we give up.
+        //
+        // It would be good to also add a heuristic based on "bytes searched
+        // per generated state," but this requires API design work. Namely,
+        // we really do not want to add a counter increment to the transition
+        // function, which implies we need to expose APIs to update the number
+        // of bytes searched by implementers of the search routines. And that
+        // doesn't seem great... But we should do it if this heuristic isn't
+        // enough. (The original lazy DFA implementation in the 'regex' crate
+        // had this heuristic, since the lazy DFA was coupled with the search
+        // routines.)
+        if let Some(min_count) = self.dfa.minimum_cache_clear_count {
+            if self.cache.clear_count >= min_count {
+                return Err(CacheError::too_many_cache_clears());
+            }
+        }
+        self.clear_cache();
+        Ok(())
+    }
+
+    /// Clears _and_ resets the cache. Resetting the cache means that no
+    /// states are persisted and the clear count is reset to 0. No heap memory
+    /// is released.
+    ///
+    /// Note that the caller may reset a cache with a different DFA than what
+    /// it was created from. In which case, the cache can now be used with the
+    /// new DFA (and not the old DFA).
+    fn reset_cache(&mut self) {
+        // Drop any pending save request first so that 'clear_cache' below
+        // does not carry a state over into the fresh cache.
+        self.cache.state_saver = StateSaver::none();
+        self.clear_cache();
+        // If a new DFA is used, it might have a different number of NFA
+        // states, so we need to make sure our sparse sets have the appropriate
+        // size.
+        self.cache.sparses.resize(self.dfa.nfa.len());
+        // 'clear_cache' increments the clear count, so it must be zeroed
+        // afterwards for the reset to take effect.
+        self.cache.clear_count = 0;
+    }
+
+    /// Clear the cache used by this lazy DFA.
+    ///
+    /// Unlike 'try_clear_cache', this clears unconditionally: it does not
+    /// consult the minimum cache clear count and it cannot fail. Callers that
+    /// need to respect that limit should use 'try_clear_cache' instead, which
+    /// returns a cache error (to be converted into a `MatchError::GaveUp` by
+    /// search implementations) once the limit has been reached.
+    ///
+    /// If 'self.state_saver' is set to save a state, then this state is
+    /// persisted through cache clearing. Otherwise, the cache is returned to
+    /// its state after initialization with two exceptions: its clear count
+    /// is incremented and some of its memory likely has additional capacity.
+    /// That is, clearing a cache does _not_ release memory.
+    ///
+    /// Otherwise, any lazy state ID generated by the cache prior to resetting
+    /// it is invalid after the reset.
+    fn clear_cache(&mut self) {
+        // Empty every collection that is keyed by, or contains, state IDs.
+        // Only the contents are cleared here; the collections themselves are
+        // kept and refilled below.
+        self.cache.trans.clear();
+        self.cache.starts.clear();
+        self.cache.states.clear();
+        self.cache.states_to_id.clear();
+        self.cache.memory_usage_state = 0;
+        // The count is bumped unconditionally; enforcing a limit on it is
+        // the job of 'try_clear_cache'.
+        self.cache.clear_count += 1;
+        trace!(
+            "lazy DFA cache has been cleared (count: {})",
+            self.cache.clear_count
+        );
+        // Re-create the start table and the sentinel states.
+        self.init_cache();
+        // If the state we want to save is one of the sentinel
+        // (unknown/dead/quit) states, then 'init_cache' adds those back, and
+        // their identifier values remains invariant. So there's no need to add
+        // it again. (And indeed, doing so would be incorrect!)
+        if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
+            // If the state is one of the special sentinel states, then it is
+            // automatically added by cache initialization and its ID always
+            // remains the same. With that said, this should never occur since
+            // the sentinel states are all loop states back to themselves. So
+            // we should never be in a position where we're attempting to save
+            // a sentinel state since we never compute transitions out of a
+            // sentinel state.
+            assert!(
+                !self.as_ref().is_sentinel(old_id),
+                "cannot save sentinel state"
+            );
+            // Re-add the saved state. If its old ID was a start ID, the new
+            // ID is converted with 'to_start' as well; 'add_state' itself
+            // handles the match conversion.
+            let new_id = self
+                .add_state(state, |id| {
+                    if old_id.is_start() {
+                        id.to_start()
+                    } else {
+                        id
+                    }
+                })
+                // The unwrap here is OK because lazy DFA creation ensures that
+                // we have room in the cache to add MIN_STATES states. Since
+                // 'init_cache' above adds 3, this adds a 4th.
+                .expect("adding one state after cache clear must work");
+            // Publish the new ID so that 'saved_state_id' can hand it back to
+            // whoever requested the save.
+            self.cache.state_saver = StateSaver::Saved(new_id);
+        }
+    }
+
+    /// Initialize this cache from emptiness to a place where it can be used
+    /// for search.
+    ///
+    /// This is called both at cache creation time and after the cache has been
+    /// cleared.
+    ///
+    /// Primarily, this adds the three sentinel states and allocates some
+    /// initial memory.
+    fn init_cache(&mut self) {
+        // One slot per start configuration, plus one additional group of
+        // slots per pattern when per-pattern start states are enabled. Every
+        // slot begins as the unknown ID.
+        let mut starts_len = Start::count();
+        if self.dfa.starts_for_each_pattern {
+            starts_len += Start::count() * self.dfa.pattern_count();
+        }
+        self.cache
+            .starts
+            .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
+        // This is the set of NFA states that corresponds to each of our three
+        // sentinel states: the empty set.
+        let dead = State::dead();
+        // This sets up some states that we use as sentinels that are present
+        // in every DFA. While it would be technically possible to implement
+        // this DFA without explicitly putting these states in the transition
+        // table, this is convenient to do to make `next_state` correct for all
+        // valid state IDs without needing explicit conditionals to special
+        // case these sentinel states.
+        //
+        // All three of these states are "dead" states. That is, all of
+        // them transition only to themselves. So once you enter one of
+        // these states, it's impossible to leave them. Thus, any correct
+        // search routine must explicitly check for these state types. (Sans
+        // `unknown`, since that is only used internally to represent missing
+        // states.)
+        //
+        // NOTE(review): the unwraps below presumably rely on the cache always
+        // having room for the three sentinel states -- confirm against the
+        // capacity checks done at construction time.
+        let unk_id =
+            self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
+        let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
+        let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
+        // The sentinel IDs are computed independently by 'LazyRef', so the
+        // insertion order above must agree with those computations.
+        assert_eq!(unk_id, self.as_ref().unknown_id());
+        assert_eq!(dead_id, self.as_ref().dead_id());
+        assert_eq!(quit_id, self.as_ref().quit_id());
+        // The idea here is that if you start in an unknown/dead/quit state and
+        // try to transition on them, then you should end up where you started.
+        self.set_all_transitions(unk_id, unk_id);
+        self.set_all_transitions(dead_id, dead_id);
+        self.set_all_transitions(quit_id, quit_id);
+        // All of these states are technically equivalent from the FSM
+        // perspective, so putting all three of them in the cache isn't
+        // possible. (They are distinct merely because we use their
+        // identifiers as sentinels to mean something, as indicated by the
+        // names.) Moreover, we wouldn't want to do that. Unknown and quit
+        // states are special in that they are artificial constructions of
+        // this implementation. But dead states are a natural part of
+        // determinization. When you reach a point in the NFA where you cannot
+        // go anywhere else, a dead state will naturally arise and we MUST
+        // reuse the canonical dead state that we've created here. Why? Because
+        // it is the state ID that tells the search routine whether a state is
+        // dead or not, and thus, whether to stop the search. Having a bunch of
+        // distinct dead states would be quite wasteful!
+        self.cache.states_to_id.insert(dead, dead_id);
+    }
+
+    /// Save the state corresponding to the ID given such that the state
+    /// persists through a cache clearing.
+    ///
+    /// While the state may persist, the ID may not. In order to discover the
+    /// new state ID, one must call 'saved_state_id' after a cache clearing.
+    fn save_state(&mut self, id: LazyStateID) {
+        let state = self.as_ref().get_cached_state(id).clone();
+        self.cache.state_saver = StateSaver::ToSave { id, state };
+    }
+
+    /// Returns the updated lazy state ID for a state that was persisted
+    /// through a cache clearing.
+    ///
+    /// It is only correct to call this routine when both a state has been
+    /// saved and the cache has just been cleared. Otherwise, this panics.
+    fn saved_state_id(&mut self) -> LazyStateID {
+        self.cache
+            .state_saver
+            .take_saved()
+            .expect("state saver does not have saved state ID")
+    }
+
+    /// Set all transitions on the state 'from' to 'to'.
+    fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
+        for unit in self.dfa.classes.representatives() {
+            self.set_transition(from, unit, to);
+        }
+    }
+
+    /// Set the transition on 'from' for 'unit' to 'to'.
+    ///
+    /// This panics if either 'from' or 'to' is invalid.
+    ///
+    /// All unit values are OK.
+    fn set_transition(
+        &mut self,
+        from: LazyStateID,
+        unit: alphabet::Unit,
+        to: LazyStateID,
+    ) {
+        assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
+        assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
+        let offset =
+            from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
+        self.cache.trans[offset] = to;
+    }
+
+    /// Set the start ID for the given pattern ID (if given) and starting
+    /// configuration to the ID given.
+    ///
+    /// This panics if 'id' is not valid or if a pattern ID is given and
+    /// 'starts_for_each_pattern' is not enabled.
+    fn set_start_state(
+        &mut self,
+        pattern_id: Option<PatternID>,
+        start: Start,
+        id: LazyStateID,
+    ) {
+        assert!(self.as_ref().is_valid(id));
+        let start_index = start.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                assert!(
+                    self.dfa.starts_for_each_pattern,
+                    "attempted to search for a specific pattern \
+                     without enabling starts_for_each_pattern",
+                );
+                let pid = pid.as_usize();
+                Start::count() + (Start::count() * pid) + start_index
+            }
+        };
+        self.cache.starts[index] = id;
+    }
+
+    /// Returns a state builder from this DFA that might have existing
+    /// capacity. This helps avoid allocs in cases where a state is built that
+    /// turns out to already be cached.
+    ///
+    /// Callers must put the state builder back with 'put_state_builder',
+    /// otherwise the allocation reuse won't work.
+    fn get_state_builder(&mut self) -> StateBuilderEmpty {
+        core::mem::replace(
+            &mut self.cache.scratch_state_builder,
+            StateBuilderEmpty::new(),
+        )
+    }
+
+    /// Puts the given state builder back into this DFA for reuse.
+    ///
+    /// Note that building a 'State' from a builder always creates a new alloc,
+    /// so callers should always put the builder back.
+    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+        let _ = core::mem::replace(
+            &mut self.cache.scratch_state_builder,
+            builder.clear(),
+        );
+    }
+}
+
+/// A type that groups methods that require the base NFA/DFA and read-only
+/// access to the cache.
+#[derive(Debug)]
+struct LazyRef<'i, 'c> {
+    // The lazy DFA whose configuration (stride, classes, cache capacity,
+    // etc.) is consulted by the methods on this type.
+    dfa: &'i DFA,
+    // The cache being inspected. This is a shared borrow, so nothing on this
+    // type can mutate the cache.
+    cache: &'c Cache,
+}
+
+impl<'i, 'c> LazyRef<'i, 'c> {
+    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+    fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
+        LazyRef { dfa, cache }
+    }
+
+    /// Return the ID of the start state for the given configuration.
+    ///
+    /// If the start state has not yet been computed, then this returns an
+    /// unknown lazy state ID.
+    fn get_cached_start_id(
+        &self,
+        pattern_id: Option<PatternID>,
+        start: Start,
+    ) -> LazyStateID {
+        let start_index = start.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                let pid = pid.as_usize();
+                assert!(
+                    pid < self.dfa.pattern_count(),
+                    "invalid pattern ID: {:?}",
+                    pid
+                );
+                Start::count() + (Start::count() * pid) + start_index
+            }
+        };
+        self.cache.starts[index]
+    }
+
+    /// Return the cached NFA/DFA powerset state for the given ID.
+    ///
+    /// This panics if the given ID does not address a valid state.
+    fn get_cached_state(&self, sid: LazyStateID) -> &State {
+        let index = sid.as_usize_untagged() >> self.dfa.stride2();
+        &self.cache.states[index]
+    }
+
+    /// Returns true if and only if the given ID corresponds to a "sentinel"
+    /// state.
+    ///
+    /// A sentinel state is a state that signifies a special condition of
+    /// search, and where every transition maps back to itself. See LazyStateID
+    /// for more details. Note that start and match states are _not_ sentinels
+    /// since they may otherwise be real states with non-trivial transitions.
+    /// The purposes of sentinel states is purely to indicate something. Their
+    /// transitions are not meant to be followed.
+    fn is_sentinel(&self, id: LazyStateID) -> bool {
+        id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
+    }
+
+    /// Returns the ID of the unknown state for this lazy DFA.
+    fn unknown_id(&self) -> LazyStateID {
+        // This unwrap is OK since 0 is always a valid state ID.
+        LazyStateID::new(0).unwrap().to_unknown()
+    }
+
+    /// Returns the ID of the dead state for this lazy DFA.
+    fn dead_id(&self) -> LazyStateID {
+        // This unwrap is OK since the maximum value here is 1 * 512 = 512,
+        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
+        // 512 is the worst case for our equivalence classes (every byte is a
+        // distinct class).
+        LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
+    }
+
+    /// Returns the ID of the quit state for this lazy DFA.
+    fn quit_id(&self) -> LazyStateID {
+        // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
+        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
+        // 512 is the worst case for our equivalence classes (every byte is a
+        // distinct class).
+        LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
+    }
+
+    /// Returns true if and only if the given ID is valid.
+    ///
+    /// An ID is valid if it is both a valid index into the transition table
+    /// and is a multiple of the DFA's stride.
+    fn is_valid(&self, id: LazyStateID) -> bool {
+        let id = id.as_usize_untagged();
+        id < self.cache.trans.len() && id % self.dfa.stride() == 0
+    }
+
+    /// Returns true if adding the state given would fit in this cache.
+    fn state_fits_in_cache(&self, state: &State) -> bool {
+        let needed = self.cache.memory_usage()
+            + self.memory_usage_for_one_more_state(state.memory_usage());
+        needed <= self.dfa.cache_capacity
+    }
+
+    /// Returns true if adding the state to be built by the given builder would
+    /// fit in this cache.
+    fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
+        let needed = self.cache.memory_usage()
+            + self.memory_usage_for_one_more_state(state.as_bytes().len());
+        needed <= self.dfa.cache_capacity
+    }
+
+    /// Returns the additional memory usage, in bytes, required to add one more
+    /// state to this cache. The given size should be the heap size, in bytes,
+    /// that would be used by the new state being added.
+    fn memory_usage_for_one_more_state(
+        &self,
+        state_heap_size: usize,
+    ) -> usize {
+        const ID_SIZE: usize = size_of::<LazyStateID>();
+        const STATE_SIZE: usize = size_of::<State>();
+
+        self.dfa.stride() * ID_SIZE // additional space needed in trans table
+        + STATE_SIZE // space in cache.states
+        + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
+        + state_heap_size // heap memory used by state itself
+    }
+}
+
+/// A simple type that encapsulates the saving of a state ID through a cache
+/// clearing.
+///
+/// A state ID can be marked for saving with ToSave, while a state ID can be
+/// saved itself with Saved.
+#[derive(Clone, Debug)]
+enum StateSaver {
+    /// An empty state saver. In this case, no states (other than the special
+    /// sentinel states) are preserved after clearing the cache.
+    None,
+    /// An ID of a state (and the state itself) that should be preserved after
+    /// the lazy DFA's cache has been cleared. After clearing, the updated ID
+    /// is stored in 'Saved' since it may have changed.
+    ToSave { id: LazyStateID, state: State },
+    /// The ID of a state that has been persisted through a lazy DFA cache
+    /// clearing. The ID recorded here corresponds to an ID that was once
+    /// marked as ToSave. The IDs are likely not equivalent even though the
+    /// states they point to are.
+    Saved(LazyStateID),
+}
+
+impl StateSaver {
+    /// Create an empty state saver.
+    fn none() -> StateSaver {
+        StateSaver::None
+    }
+
+    /// Replace this state saver with an empty saver, and if this saver is a
+    /// request to save a state, return that request.
+    fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
+        match core::mem::replace(self, StateSaver::None) {
+            StateSaver::None | StateSaver::Saved(_) => None,
+            StateSaver::ToSave { id, state } => Some((id, state)),
+        }
+    }
+
+    /// Replace this state saver with an empty saver, and if this saver is a
+    /// saved state (or a request to save a state), return that state's ID.
+    ///
+    /// The idea here is that a request to save a state isn't necessarily
+    /// honored because it might not be needed. e.g., Some higher level code
+    /// might request a state to be saved on the off chance that the cache gets
+    /// cleared when a new state is added at a lower level. But if that new
+    /// state is never added, then the cache is never cleared and the state and
+    /// its ID remain unchanged.
+    fn take_saved(&mut self) -> Option<LazyStateID> {
+        match core::mem::replace(self, StateSaver::None) {
+            StateSaver::None => None,
+            StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
+        }
+    }
+}
+
+/// The configuration used for building a lazy DFA.
+///
+/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Config` type directly.
+///
+/// A lazy DFA configuration is a simple data object that is typically used
+/// with [`Builder::configure`].
+///
+/// The default configuration guarantees that a search will _never_ return
+/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with
+/// [`Config::quit`], enabling heuristic support for Unicode word boundaries
+/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear
+/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search
+/// to return an error. See the corresponding configuration options for more
+/// details on when those error conditions arise.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // As with other configuration types in this crate, we put all our knobs
+    // in options so that we can distinguish between "default" and "not set."
+    // This makes it possible to easily combine multiple configurations
+    // without default values overwriting explicitly specified values. See the
+    // 'overwrite' method.
+    //
+    // Since 'Default' is derived and every field is an 'Option', a default
+    // 'Config' has every knob unset.
+    //
+    // For docs on the fields below, see the corresponding method setters.
+
+    // Whether a match must begin at the start of the search.
+    anchored: Option<bool>,
+    // The match semantics to use.
+    match_kind: Option<MatchKind>,
+    // Whether to compile a separate anchored start state for each pattern.
+    starts_for_each_pattern: Option<bool>,
+    byte_classes: Option<bool>,
+    unicode_word_boundary: Option<bool>,
+    // NOTE(review): presumably the bytes registered via 'Config::quit' --
+    // the setter is not visible from here, so confirm.
+    quitset: Option<ByteSet>,
+    cache_capacity: Option<usize>,
+    skip_cache_capacity_check: Option<bool>,
+    // The outer option is the usual "set vs. not set" marker. The inner
+    // option is the configured value itself, which may be absent.
+    minimum_cache_clear_count: Option<Option<usize>>,
+}
+
+impl Config {
+    /// Return a new default lazy DFA builder configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Set whether matching must be anchored at the beginning of the input.
+    ///
+    /// When enabled, a match must begin at the start of a search. When
+    /// disabled (the default), the lazy DFA will act as if the pattern started
+    /// with a `(?s:.)*?`, which enables a match to appear anywhere.
+    ///
+    /// Note that if you want to run both anchored and unanchored
+    /// searches without building multiple automatons, you can enable the
+    /// [`Config::starts_for_each_pattern`] configuration instead. This will
+    /// permit unanchored any-pattern searches and pattern-specific anchored
+    /// searches. See the documentation for that configuration for an example.
+    ///
+    /// By default this is disabled.
+    ///
+    /// **WARNING:** this is subtly different than using a `^` at the start of
+    /// your regex. A `^` forces a regex to match exclusively at the start of
+    /// input, regardless of where you begin your search. In contrast, enabling
+    /// this option will allow your regex to match anywhere in your input,
+    /// but the match must start at the beginning of a search. (Most of the
+    /// higher level convenience search routines make "start of input" and
+    /// "start of search" equivalent, but some routines allow treating these as
+    /// orthogonal.)
+    ///
+    /// For example, consider the haystack `aba` and the following searches:
+    ///
+    /// 1. The regex `^a` is compiled with `anchored=false` and searches
+    ///    `aba` starting at position `2`. Since `^` requires the match to
+    ///    start at the beginning of the input and `2 > 0`, no match is found.
+    /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
+    ///    starting at position `2`. This reports a match at `[2, 3]` since
+    ///    the match starts where the search started. Since there is no `^`,
+    ///    there is no requirement for the match to start at the beginning of
+    ///    the input.
+    /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
+    ///    starting at position `1`. Since `b` corresponds to position `1` and
+    ///    since the regex is anchored, it finds no match.
+    /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
+    ///    starting at position `1`. Since the regex is neither anchored nor
+    ///    starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
+    ///    prefix that permits it to match anywhere. Thus, it reports a match
+    ///    at `[2, 3]`.
+    ///
+    /// # Example
+    ///
+    /// This demonstrates the differences between an anchored search and
+    /// a pattern that begins with `^` (as described in the above warning
+    /// message).
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let haystack = "aba".as_bytes();
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().anchored(false)) // default
+    ///     .build(r"^a")?;
+    /// let mut cache = dfa.create_cache();
+    /// let got = dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, None, haystack, 2, 3,
+    /// )?;
+    /// // No match is found because 2 is not the beginning of the haystack,
+    /// // which is what ^ requires.
+    /// let expected = None;
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().anchored(true))
+    ///     .build(r"a")?;
+    /// let mut cache = dfa.create_cache();
+    /// let got = dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, None, haystack, 2, 3,
+    /// )?;
+    /// // An anchored search can still match anywhere in the haystack, it just
+    /// // must begin at the start of the search which is '2' in this case.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().anchored(true))
+    ///     .build(r"a")?;
+    /// let mut cache = dfa.create_cache();
+    /// let got = dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, None, haystack, 1, 3,
+    /// )?;
+    /// // No match is found since we start searching at offset 1 which
+    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+    /// // is found.
+    /// let expected = None;
+    /// assert_eq!(expected, got);
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().anchored(false))
+    ///     .build(r"a")?;
+    /// let mut cache = dfa.create_cache();
+    /// let got = dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, None, haystack, 1, 3,
+    /// )?;
+    /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
+    /// // pattern. Even though the search starts at 'b', the 'match anything'
+    /// // prefix allows the search to match 'a'.
+    /// let expected = Some(HalfMatch::must(0, 3));
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn anchored(mut self, yes: bool) -> Config {
+        self.anchored = Some(yes);
+        self
+    }
+
+    /// Set the desired match semantics.
+    ///
+    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+    /// match semantics of Perl-like regex engines. That is, when multiple
+    /// patterns would match at the same leftmost position, the pattern that
+    /// appears first in the concrete syntax is chosen.
+    ///
+    /// Currently, the only other kind of match semantics supported is
+    /// [`MatchKind::All`]. This corresponds to classical DFA construction
+    /// where all possible matches are added to the lazy DFA.
+    ///
+    /// Typically, `All` is used when one wants to execute an overlapping
+    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+    /// sense to use `All` with the various "leftmost" find routines, since the
+    /// leftmost routines depend on the `LeftmostFirst` automata construction
+    /// strategy. Specifically, `LeftmostFirst` adds dead states to the
+    /// lazy DFA as a way to terminate the search and report a match.
+    /// `LeftmostFirst` also supports non-greedy matches using this strategy
+    /// whereas `All` does not.
+    ///
+    /// # Example: overlapping search
+    ///
+    /// This example shows the typical use of `MatchKind::All`, which is to
+    /// report overlapping matches.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, OverlappingState},
+    ///     HalfMatch, MatchKind,
+    /// };
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(HalfMatch::must(1, 4));
+    /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(HalfMatch::must(0, 4));
+    /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// # Example: reverse automaton to find start of match
+    ///
+    /// Another example for using `MatchKind::All` is for constructing a
+    /// reverse automaton to find the start of a match. `All` semantics are
+    /// used for this in order to find the longest possible match, which
+    /// corresponds to the leftmost starting position.
+    ///
+    /// Note that if you need the starting position then
+    /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
+    /// for you, so it's usually not necessary to do this yourself.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind};
+    ///
+    /// let haystack = "123foobar456".as_bytes();
+    /// let pattern = r"[a-z]+";
+    ///
+    /// let dfa_fwd = DFA::new(pattern)?;
+    /// let dfa_rev = DFA::builder()
+    ///     .configure(DFA::config()
+    ///         .anchored(true)
+    ///         .match_kind(MatchKind::All)
+    ///     )
+    ///     .build(pattern)?;
+    /// let mut cache_fwd = dfa_fwd.create_cache();
+    /// let mut cache_rev = dfa_rev.create_cache();
+    ///
+    /// let expected_fwd = HalfMatch::must(0, 9);
+    /// let expected_rev = HalfMatch::must(0, 3);
+    /// let got_fwd = dfa_fwd.find_leftmost_fwd(
+    ///     &mut cache_fwd, haystack,
+    /// )?.unwrap();
+    /// // Here we don't specify the pattern to search for since there's only
+    /// // one pattern and we're doing a leftmost search. But if this were an
+    /// // overlapping search, you'd need to specify the pattern that matched
+    /// // in the forward direction. (Otherwise, you might wind up finding the
+    /// // starting position of a match of some other pattern.) That in turn
+    /// // requires building the reverse automaton with starts_for_each_pattern
+    /// // enabled. Indeed, this is what Regex does internally.
+    /// let got_rev = dfa_rev.find_leftmost_rev_at(
+    ///     &mut cache_rev, None, haystack, 0, got_fwd.offset(),
+    /// )?.unwrap();
+    /// assert_eq!(expected_fwd, got_fwd);
+    /// assert_eq!(expected_rev, got_rev);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn match_kind(mut self, kind: MatchKind) -> Config {
+        self.match_kind = Some(kind);
+        self
+    }
+
+    /// Whether to compile a separate start state for each pattern in the
+    /// lazy DFA.
+    ///
+    /// When enabled, a separate **anchored** start state is added for each
+    /// pattern in the lazy DFA. When this start state is used, then the DFA
+    /// will only search for matches for the pattern specified, even if there
+    /// are other patterns in the DFA.
+    ///
+    /// The main downside of this option is that it can potentially increase
+    /// the size of the DFA and/or increase the time it takes to build the
+    /// DFA at search time. However, since this is configuration for a lazy
+    /// DFA, these states aren't actually built unless they're used. Enabling
+    /// this isn't necessarily free, however, as it may result in higher cache
+    /// usage.
+    ///
+    /// There are a few reasons one might want to enable this (it's disabled
+    /// by default):
+    ///
+    /// 1. When looking for the start of an overlapping match (using a reverse
+    /// DFA), doing it correctly requires starting the reverse search using the
+    /// starting state of the pattern that matched in the forward direction.
+    /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
+    /// will automatically enable this option when building the reverse DFA
+    /// internally.
+    /// 2. When you want to use a DFA with multiple patterns to both search
+    /// for matches of any pattern or to search for anchored matches of one
+    /// particular pattern while using the same DFA. (Otherwise, you would need
+    /// to compile a new DFA for each pattern.)
+    /// 3. Since the start states added for each pattern are anchored, if you
+    /// compile an unanchored DFA with one pattern while also enabling this
+    /// option, then you can use the same DFA to perform anchored or unanchored
+    /// searches. The latter you get with the standard search APIs. The former
+    /// you get from the various `_at` search methods that allow you to
+    /// specify a pattern ID to search for.
+    ///
+    /// By default this is disabled.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use this option to permit the same lazy DFA
+    /// to run both anchored and unanchored searches for a single pattern.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().starts_for_each_pattern(true))
+    ///     .build(r"foo[0-9]+")?;
+    /// let mut cache = dfa.create_cache();
+    /// let haystack = b"quux foo123";
+    ///
+    /// // Here's a normal unanchored search. Notice that we use 'None' for the
+    /// // pattern ID. Since the DFA was built as an unanchored machine, it
+    /// // uses its default unanchored starting state.
+    /// let expected = HalfMatch::must(0, 11);
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, None, haystack, 0, haystack.len(),
+    /// )?);
+    /// // But now if we explicitly specify the pattern to search ('0' being
+    /// // the only pattern in the DFA), then it will use the starting state
+    /// // for that specific pattern which is always anchored. Since the
+    /// // pattern doesn't have a match at the beginning of the haystack, we
+    /// // find nothing.
+    /// assert_eq!(None, dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
+    /// )?);
+    /// // And finally, an anchored search is not the same as putting a '^' at
+    /// // the beginning of the pattern. An anchored search can only match at
+    /// // the beginning of the *search*, which we can change:
+    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+    ///     &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
+    /// )?);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+        self.starts_for_each_pattern = Some(yes);
+        self
+    }
+
+    /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
+    /// not.
+    ///
+    /// This option is enabled by default and should never be disabled unless
+    /// one is debugging the lazy DFA.
+    ///
+    /// When enabled, the lazy DFA will use a map from all possible bytes
+    /// to their corresponding equivalence class. Each equivalence class
+    /// represents a set of bytes that does not discriminate between a match
+    /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
+    /// least two equivalence classes: a set containing `a` and `b` and a set
+    /// containing every byte except for `a` and `b`. `a` and `b` are in the
+    /// same equivalence class because they never discriminate between a
+    /// match and a non-match.
+    ///
+    /// The advantage of this map is that the size of the transition table
+    /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
+    /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
+    /// equivalence classes (rounded up to the nearest power of 2). As a
+    /// result, total space usage can decrease substantially. Moreover, since a
+    /// smaller alphabet is used, DFA compilation during search becomes faster
+    /// as well since it will potentially be able to reuse a single transition
+    /// for multiple bytes.
+    ///
+    /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
+    /// this does not yield any speed advantages. Namely, even when this is
+    /// disabled, a byte class map is still used while searching. The only
+    /// difference is that every byte will be forced into its own distinct
+    /// equivalence class. This is useful for debugging the actual generated
+    /// transitions because it lets one see the transitions defined on actual
+    /// bytes instead of the equivalence classes.
+    pub fn byte_classes(mut self, yes: bool) -> Config {
+        self.byte_classes = Some(yes);
+        self
+    }
+
+    /// Heuristically enable Unicode word boundaries.
+    ///
+    /// When set, this will attempt to implement Unicode word boundaries as if
+    /// they were ASCII word boundaries. This only works when the search input
+    /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+    /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+    ///
+    /// A possible alternative to enabling this option is to simply use an
+    /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+    /// option is if you absolutely need Unicode support. This option lets one
+    /// use a fast search implementation (a DFA) for some potentially very
+    /// common cases, while providing the option to fall back to some other
+    /// regex engine to handle the general case when an error is returned.
+    ///
+    /// If the pattern provided has no Unicode word boundary in it, then this
+    /// option has no effect. (That is, quitting on a non-ASCII byte only
+    /// occurs when this option is enabled _and_ a Unicode word boundary is
+    /// present in the pattern.)
+    ///
+    /// This is almost equivalent to setting all non-ASCII bytes to be quit
+    /// bytes. The only difference is that this will cause non-ASCII bytes to
+    /// be quit bytes _only_ when a Unicode word boundary is present in the
+    /// pattern.
+    ///
+    /// When enabling this option, callers _must_ be prepared to handle
+    /// a [`MatchError`](crate::MatchError) error during search.
+    /// When using a [`Regex`](crate::hybrid::regex::Regex), this
+    /// corresponds to using the `try_` suite of methods. Alternatively,
+    /// if callers can guarantee that their input is ASCII only, then a
+    /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+    /// returned while searching.
+    ///
+    /// This is disabled by default.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to heuristically enable Unicode word boundaries
+    /// in a pattern. It also shows what happens when a search comes across a
+    /// non-ASCII byte.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::dfa::DFA,
+    ///     HalfMatch, MatchError, MatchKind,
+    /// };
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().unicode_word_boundary(true))
+    ///     .build(r"\b[0-9]+\b")?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// // The match occurs before the search ever observes the snowman
+    /// // character, so no error occurs.
+    /// let haystack = "foo 123 ☃".as_bytes();
+    /// let expected = Some(HalfMatch::must(0, 7));
+    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// // Notice that this search fails, even though the snowman character
+    /// // occurs after the ending match offset. This is because search
+    /// // routines read one byte past the end of the search to account for
+    /// // look-around, and indeed, this is required here to determine whether
+    /// // the trailing \b matches.
+    /// let haystack = "foo 123☃".as_bytes();
+    /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
+    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack);
+    /// assert_eq!(Err(expected), got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
+        // We have a separate option for this instead of just setting the
+        // appropriate quit bytes here because we don't want to set quit bytes
+        // for every regex. We only want to set them when the regex contains a
+        // Unicode word boundary.
+        self.unicode_word_boundary = Some(yes);
+        self
+    }
+
+    /// Add a "quit" byte to the lazy DFA.
+    ///
+    /// When a quit byte is seen during search time, then search will return
+    /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
+    /// offset at which the search stopped.
+    ///
+    /// A quit byte will always overrule any other aspects of a regex. For
+    /// example, if the `x` byte is added as a quit byte and the regex `\w` is
+    /// used, then observing `x` will cause the search to quit immediately
+    /// despite the fact that `x` is in the `\w` class.
+    ///
+    /// This mechanism is primarily useful for heuristically enabling certain
+    /// features like Unicode word boundaries in a DFA. Namely, if the input
+    /// to search is ASCII, then a Unicode word boundary can be implemented
+    /// via an ASCII word boundary with no change in semantics. Thus, a DFA
+    /// can attempt to match a Unicode word boundary but give up as soon as it
+    /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
+    /// to be quit bytes, then Unicode word boundaries will be permitted when
+    /// building lazy DFAs. Of course, callers should enable
+    /// [`Config::unicode_word_boundary`] if they want this behavior instead.
+    /// (The advantage being that non-ASCII quit bytes will only be added if a
+    /// Unicode word boundary is in the pattern.)
+    ///
+    /// When enabling this option, callers _must_ be prepared to handle a
+    /// [`MatchError`](crate::MatchError) error during search. When using a
+    /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
+    /// `try_` suite of methods.
+    ///
+    /// By default, there are no quit bytes set.
+    ///
+    /// # Panics
+    ///
+    /// This panics if heuristic Unicode word boundaries are enabled and any
+    /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
+    /// Unicode word boundaries requires setting every non-ASCII byte to a quit
+    /// byte. So if the caller attempts to undo any of that, then this will
+    /// panic.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to cause a search to terminate if it sees a
+    /// `\n` byte. This could be useful if, for example, you wanted to prevent
+    /// a user supplied pattern from matching across a line boundary.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+    ///
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().quit(b'\n', true))
+    ///     .build(r"foo\p{any}+bar")?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let haystack = "foo\nbar".as_bytes();
+    /// // Normally this would produce a match, since \p{any} contains '\n'.
+    /// // But since we instructed the automaton to enter a quit state if a
+    /// // '\n' is observed, this produces a match error instead.
+    /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err();
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+        if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
+            panic!(
+                "cannot set non-ASCII byte to be non-quit when \
+                 Unicode word boundaries are enabled"
+            );
+        }
+        if self.quitset.is_none() {
+            self.quitset = Some(ByteSet::empty());
+        }
+        if yes {
+            self.quitset.as_mut().unwrap().add(byte);
+        } else {
+            self.quitset.as_mut().unwrap().remove(byte);
+        }
+        self
+    }
+
+    /// Sets the maximum amount of heap memory, in bytes, to allocate to the
+    /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
+    /// use more heap memory, then, depending on other configuration knobs,
+    /// either stop the search and return an error or clear the cache and
+    /// continue the search.
+    ///
+    /// The default cache capacity is some "reasonable" number that will
+    /// accommodate most regular expressions. You may find that if you need
+    /// to build a large DFA then it may be necessary to increase the cache
+    /// capacity.
+    ///
+    /// Note that while building a lazy DFA will do a "minimum" check to ensure
+    /// the capacity is big enough, this is more or less about correctness.
+    /// If the cache is bigger than the minimum but still too small, then the
+    /// lazy DFA could wind up spending a lot of time clearing the cache and
+    /// recomputing transitions, thus negating the performance benefits of a
+    /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
+    /// endeavor. For most common patterns, however, the default should be
+    /// sufficient.
+    ///
+    /// For more details on how the lazy DFA's cache is used, see the
+    /// documentation for [`Cache`].
+    ///
+    /// # Example
+    ///
+    /// This example shows what happens if the configured cache capacity is
+    /// too small. In such cases, one can override the cache capacity to make
+    /// it bigger. Alternatively, one might want to use less memory by setting
+    /// a smaller cache capacity.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+    ///
+    /// let pattern = r"\p{L}{1000}";
+    ///
+    /// // The default cache capacity is likely too small to deal with regexes
+    /// // that are very large. Large repetitions of large Unicode character
+    /// // classes are a common way to make very large regexes.
+    /// let _ = DFA::new(pattern).unwrap_err();
+    /// // Bump up the capacity to something bigger.
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
+    ///     .build(pattern)?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
+    /// let expected = Some(HalfMatch::must(0, 2000));
+    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn cache_capacity(mut self, bytes: usize) -> Config {
+        self.cache_capacity = Some(bytes);
+        self
+    }
+
+    /// Configures construction of a lazy DFA to use the minimum cache capacity
+    /// if the configured capacity is otherwise too small for the provided NFA.
+    ///
+    /// This is useful if you never want lazy DFA construction to fail because
+    /// of a capacity that is too small.
+    ///
+    /// In general, this option is typically not a good idea. In particular,
+    /// while a minimum cache capacity does permit the lazy DFA to function
+    /// where it otherwise couldn't, it's plausible that it may not function
+    /// well if it's constantly running out of room. In that case, the speed
+    /// advantages of the lazy DFA may be negated.
+    ///
+    /// This is disabled by default.
+    ///
+    /// # Example
+    ///
+    /// This example shows what happens if the configured cache capacity is
+    /// too small. In such cases, one could override the capacity explicitly.
+    /// An alternative, demonstrated here, lets us force construction to use
+    /// the minimum cache capacity if the configured capacity is otherwise
+    /// too small.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+    ///
+    /// let pattern = r"\p{L}{1000}";
+    ///
+    /// // The default cache capacity is likely too small to deal with regexes
+    /// // that are very large. Large repetitions of large Unicode character
+    /// // classes are a common way to make very large regexes.
+    /// let _ = DFA::new(pattern).unwrap_err();
+    /// // Configure construction such that it automatically selects the
+    /// // minimum cache capacity if it would otherwise be too small.
+    /// let dfa = DFA::builder()
+    ///     .configure(DFA::config().skip_cache_capacity_check(true))
+    ///     .build(pattern)?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
+    /// let expected = Some(HalfMatch::must(0, 2000));
+    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
+        self.skip_cache_capacity_check = Some(yes);
+        self
+    }
+
+    /// Configure a lazy DFA search to quit after a certain number of cache
+    /// clearings.
+    ///
+    /// When a minimum is set, then a lazy DFA search will "give up" after
+    /// the minimum number of cache clearings has occurred. This is typically
+    /// useful in scenarios where callers want to detect whether the lazy DFA
+    /// search is "efficient" or not. If the cache is cleared too many times,
+    /// this is a good indicator that it is not efficient, and thus, the caller
+    /// may wish to use some other regex engine.
+    ///
+    /// Note that the number of times a cache is cleared is a property of
+    /// the cache itself. Thus, if a cache is used in a subsequent search
+    /// with a similarly configured lazy DFA, then it would cause the
+    /// search to "give up" if the cache needed to be cleared. The cache
+    /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or
+    /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
+    /// you're using the `Regex` API).
+    ///
+    /// By default, no minimum is configured. Thus, a lazy DFA search will
+    /// never give up due to cache clearings.
+    ///
+    /// # Example
+    ///
+    /// This example uses a somewhat pathological configuration to demonstrate
+    /// the _possible_ behavior of cache clearing and how it might result
+    /// in a search that returns an error.
+    ///
+    /// It is important to note that the precise mechanics of how and when
+    /// a cache gets cleared is an implementation detail. Thus, the asserts
+    /// in the tests below with respect to the particular offsets at which a
+    /// search gave up should be viewed strictly as a demonstration. They are
+    /// not part of any API guarantees offered by this crate.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, MatchError};
+    ///
+    /// // This is a carefully chosen regex. The idea is to pick one
+    /// // that requires some decent number of states (hence the bounded
+    /// // repetition). But we specifically choose to create a class with an
+    /// // ASCII letter and a non-ASCII letter so that we can check that no new
+    /// // states are created once the cache is full. Namely, if we fill up the
+    /// // cache on a haystack of 'a's, then in order to match one 'β', a new
+    /// // state will need to be created since a 'β' is encoded with multiple
+    /// // bytes. Since there's no room for this state, the search should quit
+    /// // at the very first position.
+    /// let pattern = r"[aβ]{100}";
+    /// let dfa = DFA::builder()
+    ///     .configure(
+    ///         // Configure it so that we have the minimum cache capacity
+    ///         // possible. And that if any clearings occur, the search quits.
+    ///         DFA::config()
+    ///             .skip_cache_capacity_check(true)
+    ///             .cache_capacity(0)
+    ///             .minimum_cache_clear_count(Some(0)),
+    ///     )
+    ///     .build(pattern)?;
+    /// let mut cache = dfa.create_cache();
+    ///
+    /// let haystack = "a".repeat(101).into_bytes();
+    /// assert_eq!(
+    ///     dfa.find_leftmost_fwd(&mut cache, &haystack),
+    ///     Err(MatchError::GaveUp { offset: 25 }),
+    /// );
+    ///
+    /// // Now that we know the cache is full, if we search a haystack that we
+    /// // know will require creating at least one new state, it should not
+    /// // be able to make any progress.
+    /// let haystack = "β".repeat(101).into_bytes();
+    /// assert_eq!(
+    ///     dfa.find_leftmost_fwd(&mut cache, &haystack),
+    ///     Err(MatchError::GaveUp { offset: 0 }),
+    /// );
+    ///
+    /// // If we reset the cache, then we should be able to create more states
+    /// // and make more progress with searching for betas.
+    /// cache.reset(&dfa);
+    /// let haystack = "β".repeat(101).into_bytes();
+    /// assert_eq!(
+    ///     dfa.find_earliest_fwd(&mut cache, &haystack),
+    ///     Err(MatchError::GaveUp { offset: 26 }),
+    /// );
+    ///
+    /// // ... switching back to ASCII still makes progress since it just needs
+    /// // to set transitions on existing states!
+    /// let haystack = "a".repeat(101).into_bytes();
+    /// assert_eq!(
+    ///     dfa.find_earliest_fwd(&mut cache, &haystack),
+    ///     Err(MatchError::GaveUp { offset: 13 }),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
+        self.minimum_cache_clear_count = Some(min);
+        self
+    }
+
+    /// Returns whether this configuration has enabled anchored searches.
+    pub fn get_anchored(&self) -> bool {
+        self.anchored.unwrap_or(false)
+    }
+
+    /// Returns the match semantics set in this configuration.
+    pub fn get_match_kind(&self) -> MatchKind {
+        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+    }
+
+    /// Returns whether this configuration has enabled anchored starting states
+    /// for every pattern in the DFA.
+    pub fn get_starts_for_each_pattern(&self) -> bool {
+        self.starts_for_each_pattern.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration has enabled byte classes or not.
+    /// This is typically a debugging oriented option, as disabling it confers
+    /// no speed benefit.
+    pub fn get_byte_classes(&self) -> bool {
+        self.byte_classes.unwrap_or(true)
+    }
+
+    /// Returns whether this configuration has enabled heuristic Unicode word
+    /// boundary support. When enabled, it is possible for a search to return
+    /// an error.
+    pub fn get_unicode_word_boundary(&self) -> bool {
+        self.unicode_word_boundary.unwrap_or(false)
+    }
+
+    /// Returns whether this configuration will instruct the DFA to enter a
+    /// quit state whenever the given byte is seen during a search. When at
+    /// least one byte has this enabled, it is possible for a search to return
+    /// an error.
+    pub fn get_quit(&self, byte: u8) -> bool {
+        self.quitset.map_or(false, |q| q.contains(byte))
+    }
+
+    /// Returns the cache capacity set on this configuration.
+    pub fn get_cache_capacity(&self) -> usize {
+        self.cache_capacity.unwrap_or(2 * (1 << 20))
+    }
+
+    /// Returns whether the cache capacity check should be skipped.
+    pub fn get_skip_cache_capacity_check(&self) -> bool {
+        self.skip_cache_capacity_check.unwrap_or(false)
+    }
+
+    /// Returns, if set, the minimum number of times the cache must be cleared
+    /// before a lazy DFA search can give up. When no minimum is set, then a
+    /// search will never quit and will always clear the cache whenever it
+    /// fills up.
+    pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
+        self.minimum_cache_clear_count.unwrap_or(None)
+    }
+
+    /// Returns the minimum lazy DFA cache capacity required for the given NFA.
+    ///
+    /// The cache capacity required for a particular NFA may change without
+    /// notice. Callers should not rely on it being stable.
+    ///
+    /// This is useful for informational purposes, but can also be useful for
+    /// other reasons. For example, if one wants to check the minimum cache
+    /// capacity themselves or if one wants to set the capacity based on the
+    /// minimum.
+    ///
+    /// This may return an error if this configuration does not support all of
+    /// the instructions used in the given NFA. For example, if the NFA has a
+    /// Unicode word boundary but this configuration does not enable heuristic
+    /// support for Unicode word boundaries.
+    pub fn get_minimum_cache_capacity(
+        &self,
+        nfa: &thompson::NFA,
+    ) -> Result<usize, BuildError> {
+        let quitset = self.quit_set_from_nfa(nfa)?;
+        let classes = self.byte_classes_from_nfa(nfa, &quitset);
+        let starts = self.get_starts_for_each_pattern();
+        Ok(minimum_cache_capacity(nfa, &classes, starts))
+    }
+
+    /// Returns the byte class map used during search from the given NFA.
+    ///
+    /// If byte classes are disabled on this configuration, then a map is
+    /// returned that puts each byte in its own equivalence class.
+    fn byte_classes_from_nfa(
+        &self,
+        nfa: &thompson::NFA,
+        quit: &ByteSet,
+    ) -> ByteClasses {
+        if !self.get_byte_classes() {
+            // The lazy DFA will always use the equivalence class map, but
+            // enabling this option is useful for debugging. Namely, this will
+            // cause all transitions to be defined over their actual bytes
+            // instead of an opaque equivalence class identifier. The former is
+            // much easier to grok as a human.
+            ByteClasses::singletons()
+        } else {
+            let mut set = nfa.byte_class_set().clone();
+            // It is important to distinguish any "quit" bytes from all other
+            // bytes. Otherwise, a non-quit byte may end up in the same class
+            // as a quit byte, and thus cause the DFA stop when it shouldn't.
+            if !quit.is_empty() {
+                set.add_set(&quit);
+            }
+            set.byte_classes()
+        }
+    }
+
+    /// Return the quit set for this configuration and the given NFA.
+    ///
+    /// This may return an error if the NFA is incompatible with this
+    /// configuration's quit set. For example, if the NFA has a Unicode word
+    /// boundary and the quit set doesn't include non-ASCII bytes.
+    fn quit_set_from_nfa(
+        &self,
+        nfa: &thompson::NFA,
+    ) -> Result<ByteSet, BuildError> {
+        let mut quit = self.quitset.unwrap_or(ByteSet::empty());
+        if nfa.has_word_boundary_unicode() {
+            if self.get_unicode_word_boundary() {
+                for b in 0x80..=0xFF {
+                    quit.add(b);
+                }
+            } else {
+                // If heuristic support for Unicode word boundaries wasn't
+                // enabled, then we can still check if our quit set is correct.
+                // If the caller set their quit bytes in a way that causes the
+                // DFA to quit on at least all non-ASCII bytes, then that's all
+                // we need for heuristic support to work.
+                if !quit.contains_range(0x80, 0xFF) {
+                    return Err(
+                        BuildError::unsupported_dfa_word_boundary_unicode(),
+                    );
+                }
+            }
+        }
+        Ok(quit)
+    }
+
+    /// Overwrite the default configuration such that the options in `o` are
+    /// always used. If an option in `o` is not set, then the corresponding
+    /// option in `self` is used. If it's not set in `self` either, then it
+    /// remains not set.
+    fn overwrite(self, o: Config) -> Config {
+        Config {
+            anchored: o.anchored.or(self.anchored),
+            match_kind: o.match_kind.or(self.match_kind),
+            starts_for_each_pattern: o
+                .starts_for_each_pattern
+                .or(self.starts_for_each_pattern),
+            byte_classes: o.byte_classes.or(self.byte_classes),
+            unicode_word_boundary: o
+                .unicode_word_boundary
+                .or(self.unicode_word_boundary),
+            quitset: o.quitset.or(self.quitset),
+            cache_capacity: o.cache_capacity.or(self.cache_capacity),
+            skip_cache_capacity_check: o
+                .skip_cache_capacity_check
+                .or(self.skip_cache_capacity_check),
+            minimum_cache_clear_count: o
+                .minimum_cache_clear_count
+                .or(self.minimum_cache_clear_count),
+        }
+    }
+}
+
+/// A builder for constructing a lazy deterministic finite automaton from
+/// regular expressions.
+///
+/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Builder` type directly.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* lazy DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match. For that information, use a
+/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
+/// to use a DFA directly is if the end location of a match is enough for your
+/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
+/// since a second reverse DFA is needed to find the start of a match.
+///
+/// # Example
+///
+/// This example shows how to build a lazy DFA that uses a tiny cache capacity
+/// and completely disables Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+///   and `\b` are ASCII-only while `.` matches any byte except for `\n`
+///   (instead of any UTF-8 encoding of a Unicode scalar value except for
+///   `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+///   things like `[^a]` that match any byte except for `a` are permitted.
+/// * Unanchored patterns can search through invalid UTF-8. That is, for
+///   unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
+///   `(?s:.)*?`.
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::dfa::DFA,
+///     nfa::thompson,
+///     HalfMatch, SyntaxConfig,
+/// };
+///
+/// let dfa = DFA::builder()
+///     .configure(DFA::config().cache_capacity(5_000))
+///     .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+///     .thompson(thompson::Config::new().utf8(false))
+///     .build(r"foo[^b]ar.*")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+    // Lazy DFA options; read by `build_from_nfa` when assembling the DFA.
+    config: Config,
+    // Thompson NFA builder (which also receives the syntax configuration)
+    // used to compile pattern strings into an NFA.
+    thompson: thompson::Builder,
+}
+
+impl Builder {
+    /// Create a new lazy DFA builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder {
+            config: Config::default(),
+            thompson: thompson::Builder::new(),
+        }
+    }
+
+    /// Build a lazy DFA from the given pattern.
+    ///
+    /// This is equivalent to calling [`Builder::build_many`] with a single
+    /// pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a lazy DFA from the given patterns.
+    ///
+    /// When matches are returned, the pattern ID corresponds to the index of
+    /// the pattern in the slice given.
+    ///
+    /// An error is returned if the patterns could not be compiled into an NFA
+    /// or if [`Builder::build_from_nfa`] rejects the resulting NFA.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<DFA, BuildError> {
+        let nfa =
+            self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
+        self.build_from_nfa(Arc::new(nfa))
+    }
+
+    /// Build a lazy DFA from the given NFA.
+    ///
+    /// Note that this requires an `Arc<thompson::NFA>` instead of a
+    /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at
+    /// search time. This means that the lazy DFA must hold on to its source
+    /// NFA for the entirety of its lifetime. An `Arc` is used so that callers
+    /// aren't forced to clone the NFA if it is needed elsewhere.
+    ///
+    /// # Errors
+    ///
+    /// An error is returned when the quit set computed from the configuration
+    /// and the NFA is rejected, when the configured cache capacity is smaller
+    /// than the heuristic minimum and the `skip_cache_capacity_check` option
+    /// is not enabled, or when the lazy state ID space cannot represent the
+    /// minimum number of states.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to build a lazy DFA if you already have an NFA
+    /// in hand.
+    ///
+    /// ```
+    /// use std::sync::Arc;
+    /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+    ///
+    /// let haystack = "foo123bar".as_bytes();
+    ///
+    /// // This shows how to set non-default options for building an NFA.
+    /// let nfa = thompson::Builder::new()
+    ///     .configure(thompson::Config::new().shrink(false))
+    ///     .build(r"[0-9]+")?;
+    /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?;
+    /// let mut cache = dfa.create_cache();
+    /// let expected = Some(HalfMatch::must(0, 6));
+    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn build_from_nfa(
+        &self,
+        nfa: Arc<thompson::NFA>,
+    ) -> Result<DFA, BuildError> {
+        let quitset = self.config.quit_set_from_nfa(&nfa)?;
+        let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
+        // Check that we can fit at least a few states into our cache,
+        // otherwise it's pretty senseless to use the lazy DFA. This does have
+        // a possible failure mode though. This assumes the maximum size of a
+        // state in powerset space (so, the total number of NFA states), which
+        // may never actually materialize, and could be quite a bit larger
+        // than the actual biggest state. For callers where that is a problem,
+        // the `skip_cache_capacity_check` option handled below replaces the
+        // error with raising the capacity to the computed minimum. We never
+        // go below that minimum, since other areas of the code (the cache
+        // clearing and init code) tend to assume some minimum useful cache
+        // capacity.
+        let min_cache = minimum_cache_capacity(
+            &nfa,
+            &classes,
+            self.config.get_starts_for_each_pattern(),
+        );
+        let mut cache_capacity = self.config.get_cache_capacity();
+        if cache_capacity < min_cache {
+            // When the caller has asked us to skip the cache capacity check,
+            // then we simply force the cache capacity to its minimum amount
+            // and mush on.
+            if self.config.get_skip_cache_capacity_check() {
+                trace!(
+                    "given capacity ({}) is too small, \
+                     since skip_cache_capacity_check is enabled, \
+                     setting cache capacity to minimum ({})",
+                    cache_capacity,
+                    min_cache,
+                );
+                cache_capacity = min_cache;
+            } else {
+                return Err(BuildError::insufficient_cache_capacity(
+                    min_cache,
+                    cache_capacity,
+                ));
+            }
+        }
+        // We also need to check that we can fit at least some small number
+        // of states in our state ID space. This is unlikely to trigger in
+        // >=32-bit systems, but 16-bit systems have a pretty small state ID
+        // space since a number of bits are used up as sentinels.
+        if let Err(err) = minimum_lazy_state_id(&nfa, &classes) {
+            return Err(BuildError::insufficient_state_id_capacity(err));
+        }
+        let stride2 = classes.stride2();
+        Ok(DFA {
+            nfa,
+            stride2,
+            classes,
+            quitset,
+            anchored: self.config.get_anchored(),
+            match_kind: self.config.get_match_kind(),
+            starts_for_each_pattern: self.config.get_starts_for_each_pattern(),
+            // Not necessarily the configured value: it may have been raised
+            // to the minimum above.
+            cache_capacity,
+            minimum_cache_clear_count: self
+                .config
+                .get_minimum_cache_clear_count(),
+        })
+    }
+
+    /// Apply the given lazy DFA configuration options to this builder.
+    ///
+    /// Options set in `config` take precedence over options previously
+    /// applied to this builder; options left unset in `config` keep their
+    /// previous value.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    ///
+    /// These settings only apply when constructing a lazy DFA directly from a
+    /// pattern.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.thompson.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+    ///
+    /// This permits setting things like whether the DFA should match the regex
+    /// in reverse or if additional time should be spent shrinking the size of
+    /// the NFA.
+    ///
+    /// These settings only apply when constructing a lazy DFA directly from a
+    /// pattern.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.thompson.configure(config);
+        self
+    }
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns the minimum lazy state ID that must be representable.
+///
+/// It's likely not plausible for this to impose constraints on 32-bit systems
+/// (or higher), but on 16-bit systems, the lazy state ID space is quite
+/// constrained and thus may be insufficient for bigger regexes.
+///
+/// The NFA parameter is currently not consulted (hence the leading
+/// underscore): the result depends only on `MIN_STATES` and on the stride
+/// implied by `classes`.
+///
+/// An error is returned when `LazyStateID::new` rejects the computed ID.
+fn minimum_lazy_state_id(
+    _nfa: &thompson::NFA,
+    classes: &ByteClasses,
+) -> Result<LazyStateID, LazyStateIDError> {
+    let stride = 1 << classes.stride2();
+    // The ID that must fit is the index of the last of the `MIN_STATES`
+    // states, scaled by the stride.
+    let min_state_index = MIN_STATES.checked_sub(1).unwrap();
+    LazyStateID::new(min_state_index * stride)
+}
+
+/// Estimate the smallest cache capacity, in bytes of heap space, that still
+/// leaves room for the minimum number of states a lazy DFA needs to be useful.
+///
+/// The estimate is deliberately pessimistic: every non-sentinel powerset
+/// state is charged as though it contained all of the NFA's states, which is
+/// likely more than is required in practice. Computing the true minimum
+/// would effectively require determinization, which is too much work for a
+/// simple check like this.
+fn minimum_cache_capacity(
+    nfa: &thompson::NFA,
+    classes: &ByteClasses,
+    starts_for_each_pattern: bool,
+) -> usize {
+    const ID_SIZE: usize = size_of::<LazyStateID>();
+    let stride = 1 << classes.stride2();
+
+    let sparse_bytes = 2 * nfa.len() * NFAStateID::SIZE;
+    let transition_bytes = MIN_STATES * stride * ID_SIZE;
+
+    // One group of start state IDs always exists; with per-pattern start
+    // states enabled, each pattern adds another group on top of that.
+    let start_bytes = if starts_for_each_pattern {
+        (Start::count() * ID_SIZE)
+            + ((Start::count() * nfa.pattern_len()) * ID_SIZE)
+    } else {
+        Start::count() * ID_SIZE
+    };
+
+    // Three of the states a lazy DFA needs are the sentinel unknown, dead
+    // and quit states, whose size is known and small. They are charged at
+    // the size of the dead state rather than at the worst case.
+    assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3");
+    let sentinel_heap = State::dead().memory_usage();
+    // Worst case for any other `State`: three bytes for flags, 4 bytes (max)
+    // for the number of patterns, 32-bit encodings of the patterns and then
+    // delta varint encodings of NFA state IDs, charged at 5 bytes per NFA
+    // state ID (a worst case that isn't technically possible).
+    let worst_case_heap =
+        3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5);
+    let per_sentinel = size_of::<State>() + sentinel_heap;
+    let per_regular = size_of::<State>() + worst_case_heap;
+    let state_bytes =
+        (3 * per_sentinel) + ((MIN_STATES - 3) * per_regular);
+    let state_to_id_bytes = state_bytes + (MIN_STATES * ID_SIZE);
+    let stack_bytes = nfa.len() * NFAStateID::SIZE;
+    let scratch_builder_bytes = worst_case_heap;
+
+    transition_bytes
+        + start_bytes
+        + state_bytes
+        + state_to_id_bytes
+        + sparse_bytes
+        + stack_bytes
+        + scratch_builder_bytes
+}
diff --git a/src/hybrid/error.rs b/src/hybrid/error.rs

new file mode 100644 (file)

index 0000000..715da39
--- /dev/null
+++ b/src/hybrid/error.rs
@@ -0,0 +1,130 @@
+use crate::{hybrid::id::LazyStateIDError, nfa};
+
+/// An error that occurs when initial construction of a lazy DFA fails.
+///
+/// A build error can occur when insufficient cache capacity is configured or
+/// if something about the NFA is unsupported. (For example, if one attempts
+/// to build a lazy DFA without heuristic Unicode support but with an NFA that
+/// contains a Unicode word boundary.)
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+    // The specific failure. Both this field and `BuildErrorKind` itself are
+    // private to this module.
+    kind: BuildErrorKind,
+}
+
+/// The ways in which building a lazy DFA can fail.
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+    /// Building the underlying Thompson NFA failed.
+    NFA(nfa::thompson::Error),
+    /// The `given` cache capacity is smaller than the `minimum` required.
+    InsufficientCacheCapacity { minimum: usize, given: usize },
+    /// A lazy state ID that must be representable could not be constructed.
+    InsufficientStateIDCapacity { err: LazyStateIDError },
+    /// A regex feature is not supported; the payload is a human readable
+    /// explanation.
+    Unsupported(&'static str),
+}
+
+impl BuildError {
+    /// Returns the specific kind of failure wrapped by this error.
+    fn kind(&self) -> &BuildErrorKind {
+        &self.kind
+    }
+
+    /// Wraps an error produced while building the Thompson NFA.
+    pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError {
+        let kind = BuildErrorKind::NFA(err);
+        BuildError { kind }
+    }
+
+    /// Records that the `given` cache capacity is below the `minimum`
+    /// required.
+    pub(crate) fn insufficient_cache_capacity(
+        minimum: usize,
+        given: usize,
+    ) -> BuildError {
+        let kind =
+            BuildErrorKind::InsufficientCacheCapacity { minimum, given };
+        BuildError { kind }
+    }
+
+    /// Wraps the error produced when a required lazy state ID could not be
+    /// constructed.
+    pub(crate) fn insufficient_state_id_capacity(
+        err: LazyStateIDError,
+    ) -> BuildError {
+        let kind = BuildErrorKind::InsufficientStateIDCapacity { err };
+        BuildError { kind }
+    }
+
+    /// Creates the error reported for regexes with Unicode word boundaries.
+    pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
+        BuildError {
+            kind: BuildErrorKind::Unsupported(
+                "cannot build lazy DFAs for regexes with Unicode word \
+                 boundaries; switch to ASCII word boundaries, or \
+                 heuristically enable Unicode word boundaries or use a \
+                 different regex engine",
+            ),
+        }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+    /// Only a failure to build the NFA has an underlying cause to report.
+    /// Every other kind yields `None`; in particular, `LazyStateIDError` is
+    /// an implementation detail and is deliberately not exposed.
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match *self.kind() {
+            BuildErrorKind::NFA(ref err) => Some(err),
+            BuildErrorKind::InsufficientCacheCapacity { .. }
+            | BuildErrorKind::InsufficientStateIDCapacity { .. }
+            | BuildErrorKind::Unsupported(_) => None,
+        }
+    }
+}
+
+impl core::fmt::Display for BuildError {
+    /// Writes a short, human readable description of the failure.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.kind() {
+            // The NFA error's own message is not repeated here. When the
+            // `std` feature is enabled it is reachable through `source`.
+            BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+            BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
+                write!(
+                    f,
+                    "given cache capacity ({}) is smaller than \
+                     minimum required ({})",
+                    given, minimum,
+                )
+            }
+            // Defer entirely to the state ID error's own `fmt`.
+            BuildErrorKind::InsufficientStateIDCapacity { ref err } => {
+                err.fmt(f)
+            }
+            BuildErrorKind::Unsupported(ref msg) => {
+                write!(f, "unsupported regex feature for DFAs: {}", msg)
+            }
+        }
+    }
+}
+
+/// An error that occurs when cache usage has become inefficient.
+///
+/// One of the weaknesses of a lazy DFA is that it may need to clear its
+/// cache repeatedly if it's not big enough. If this happens too much, then it
+/// can slow searching down significantly. A mitigation to this is to use
+/// heuristics to detect whether the cache is being used efficiently or not.
+/// If not, then a lazy DFA can return a `CacheError`.
+///
+/// The default configuration of a lazy DFA in this crate is
+/// set such that a `CacheError` will never occur. Instead,
+/// callers must opt into this behavior with settings like
+/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count).
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+// The private unit field carries no data; because it is private, values of
+// this type cannot be constructed outside of this module.
+pub struct CacheError(());
+
+impl CacheError {
+    /// Creates the error reported when the lazy DFA cache has been cleared
+    /// too many times.
+    pub(crate) fn too_many_cache_clears() -> CacheError {
+        CacheError(())
+    }
+}
+
+// A `CacheError` has no underlying cause, so the trait's default `source`
+// (which returns `None`) is used as is.
+#[cfg(feature = "std")]
+impl std::error::Error for CacheError {}
+
+impl core::fmt::Display for CacheError {
+    /// Writes a fixed message; this error carries no additional details.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.write_str("lazy DFA cache has been cleared too many times")
+    }
+}
diff --git a/src/hybrid/id.rs b/src/hybrid/id.rs

new file mode 100644 (file)

index 0000000..a6fcde5
--- /dev/null
+++ b/src/hybrid/id.rs
@@ -0,0 +1,415 @@
+/// A state identifier especially tailored for lazy DFAs.
+///
+/// A lazy state ID logically represents a pointer to a DFA state. In practice,
+/// by limiting the number of DFA states it can address, it reserves some
+/// bits of its representation to encode some additional information. That
+/// additional information is called a "tag." That tag is used to record
+/// whether the state it points to is an unknown, dead, quit, start or match
+/// state.
+///
+/// When implementing a low level search routine with a lazy DFA, it is
+/// necessary to query the type of the current state to know what to do:
+///
+/// * **Unknown** - The state has not yet been computed. The
+/// parameters used to get this state ID must be re-passed to
+/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
+/// return an unknown state ID.
+/// * **Dead** - A dead state only has transitions to itself. It indicates that
+/// the search cannot do anything else and should stop with whatever result it
+/// has.
+/// * **Quit** - A quit state indicates that the automaton could not answer
+/// whether a match exists or not. Correct search implementations must return a
+/// [`MatchError::Quit`](crate::MatchError::Quit).
+/// * **Start** - A start state indicates that the automaton will begin
+/// searching at a starting state. Branching on this isn't required for
+/// correctness, but a common optimization is to use this to more quickly look
+/// for a prefix.
+/// * **Match** - A match state indicates that a match has been found.
+/// Depending on the semantics of your search implementation, it may either
+/// continue until the end of the haystack or a dead state, or it might quit
+/// and return the match immediately.
+///
+/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
+/// can be used to determine if a tag exists at all. This is useful to avoid
+/// branching on all of the above types for every byte searched.
+///
+/// # Example
+///
+/// This example shows how `LazyStateID` can be used to implement a correct
+/// search routine with minimal branching. In particular, this search routine
+/// implements "leftmost" matching, which means that it doesn't immediately
+/// stop once a match is found. Instead, it continues until it reaches a dead
+/// state.
+///
+/// Notice also how a correct search implementation deals with
+/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
+/// the lazy DFA routines. When a `CacheError` occurs, it returns
+/// [`MatchError::GaveUp`](crate::MatchError::GaveUp).
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::dfa::{Cache, DFA},
+///     HalfMatch, MatchError, PatternID,
+/// };
+///
+/// fn find_leftmost_first(
+///     dfa: &DFA,
+///     cache: &mut Cache,
+///     haystack: &[u8],
+/// ) -> Result<Option<HalfMatch>, MatchError> {
+///     // The start state is determined by inspecting the position and the
+///     // initial bytes of the haystack. Note that start states can never
+///     // be match states (since DFAs in this crate delay matches by 1
+///     // byte), so we don't need to check if the start state is a match.
+///     let mut sid = dfa.start_state_forward(
+///         cache, None, haystack, 0, haystack.len(),
+///     ).map_err(|_| MatchError::GaveUp { offset: 0 })?;
+///     let mut last_match = None;
+///     // Walk all the bytes in the haystack. We can quit early if we see
+///     // a dead or a quit state. The former means the automaton will
+///     // never transition to any other state. The latter means that the
+///     // automaton entered a condition in which its search failed.
+///     for (i, &b) in haystack.iter().enumerate() {
+///         sid = dfa
+///             .next_state(cache, sid, b)
+///             .map_err(|_| MatchError::GaveUp { offset: i })?;
+///         if sid.is_tagged() {
+///             if sid.is_match() {
+///                 last_match = Some(HalfMatch::new(
+///                     dfa.match_pattern(cache, sid, 0),
+///                     i,
+///                 ));
+///             } else if sid.is_dead() {
+///                 return Ok(last_match);
+///             } else if sid.is_quit() {
+///                 // It is possible to enter into a quit state after
+///                 // observing a match has occurred. In that case, we
+///                 // should return the match instead of an error.
+///                 if last_match.is_some() {
+///                     return Ok(last_match);
+///                 }
+///                 return Err(MatchError::Quit { byte: b, offset: i });
+///             }
+///             // Implementors may also want to check for start states and
+///             // handle them differently for performance reasons. But it is
+///             // not necessary for correctness.
+///         }
+///     }
+///     // Matches are always delayed by 1 byte, so we must explicitly walk
+///     // the special "EOI" transition at the end of the search.
+///     sid = dfa
+///         .next_eoi_state(cache, sid)
+///         .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?;
+///     if sid.is_match() {
+///         last_match = Some(HalfMatch::new(
+///             dfa.match_pattern(cache, sid, 0),
+///             haystack.len(),
+///         ));
+///     }
+///     Ok(last_match)
+/// }
+///
+/// // We use a greedy '+' operator to show how the search doesn't just stop
+/// // once a match is detected. It continues extending the match. Using
+/// // '[a-z]+?' would also work as expected and stop the search early.
+/// // Greediness is built into the automaton.
+/// let dfa = DFA::new(r"[a-z]+")?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 10);
+///
+/// // Here's another example that tests our handling of the special
+/// // EOI transition. This will fail to find a match if we don't call
+/// // 'next_eoi_state' at the end of the search since the match isn't found
+/// // until the final byte in the haystack.
+/// let dfa = DFA::new(r"[0-9]{4}")?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 15);
+///
+/// // And note that our search implementation above automatically works
+/// // with multi-DFAs. Namely, `dfa.match_pattern(cache, sid, 0)` selects
+/// // the appropriate pattern ID for us.
+/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 1);
+/// assert_eq!(mat.offset(), 3);
+/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 7);
+/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 1);
+/// assert_eq!(mat.offset(), 5);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(
+    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+// The raw value. The tag bits (the `MASK_*` constants) sit above the bits
+// covered by `MAX`, which hold the untagged state ID.
+pub struct LazyStateID(u32);
+
+impl LazyStateID {
+    // The index of the highest bit used by a lazy state ID. It depends on
+    // the target's pointer width.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    const MAX_BIT: usize = 31;
+
+    #[cfg(target_pointer_width = "16")]
+    const MAX_BIT: usize = 15;
+
+    // The tag masks. Each one is a single bit, allocated downward starting
+    // at `MAX_BIT`.
+    const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
+    const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
+    const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
+    const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
+    const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
+    // The largest untagged value: every bit below the lowest tag bit set.
+    const MAX: usize = LazyStateID::MASK_MATCH - 1;
+
+    /// Create a new lazy state ID.
+    ///
+    /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
+    /// an error.
+    #[inline]
+    pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
+        if id > LazyStateID::MAX {
+            return Err(LazyStateIDError { attempted: id as u64 });
+        }
+        Ok(LazyStateID::new_unchecked(id))
+    }
+
+    /// Create a new lazy state ID without checking whether the given value
+    /// exceeds [`LazyStateID::MAX`].
+    ///
+    /// While this is unchecked, providing an incorrect value must never
+    /// sacrifice memory safety.
+    #[inline]
+    const fn new_unchecked(id: usize) -> LazyStateID {
+        // Where `usize` is wider than 32 bits, this cast truncates values
+        // that do not fit in a `u32`.
+        LazyStateID(id as u32)
+    }
+
+    /// Return this lazy state ID as its raw value if and only if it is not
+    /// tagged (and thus not an unknown, dead, quit, start or match state ID).
+    #[inline]
+    pub(crate) fn as_usize(&self) -> Option<usize> {
+        if self.is_tagged() {
+            None
+        } else {
+            Some(self.as_usize_unchecked())
+        }
+    }
+
+    /// Return this lazy state ID as an untagged `usize`.
+    ///
+    /// If this lazy state ID is tagged, then the usize returned is the state
+    /// ID without the tag. If the ID was not tagged, then the usize returned
+    /// is equivalent to the state ID.
+    #[inline]
+    pub(crate) fn as_usize_untagged(&self) -> usize {
+        self.as_usize_unchecked() & LazyStateID::MAX
+    }
+
+    /// Return this lazy state ID as its raw internal `usize` value, which may
+    /// be tagged (and thus greater than LazyStateID::MAX).
+    #[inline]
+    pub(crate) const fn as_usize_unchecked(&self) -> usize {
+        self.0 as usize
+    }
+
+    /// Return this lazy state ID as a lazy state ID that is tagged as an
+    /// unknown state. Any tags already present are kept.
+    #[inline]
+    pub(crate) const fn to_unknown(&self) -> LazyStateID {
+        LazyStateID::new_unchecked(
+            self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
+        )
+    }
+
+    /// Return this lazy state ID as a lazy state ID that is tagged as a dead
+    /// state. Any tags already present are kept.
+    #[inline]
+    pub(crate) const fn to_dead(&self) -> LazyStateID {
+        LazyStateID::new_unchecked(
+            self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
+        )
+    }
+
+    /// Return this lazy state ID as a lazy state ID that is tagged as a quit
+    /// state. Any tags already present are kept.
+    #[inline]
+    pub(crate) const fn to_quit(&self) -> LazyStateID {
+        LazyStateID::new_unchecked(
+            self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
+        )
+    }
+
+    /// Return this lazy state ID as a state ID that is tagged as a start
+    /// state.
+    #[inline]
+    pub(crate) const fn to_start(&self) -> LazyStateID {
+        LazyStateID::new_unchecked(
+            self.as_usize_unchecked() | LazyStateID::MASK_START,
+        )
+    }
+
+    /// Return this lazy state ID as a lazy state ID that is tagged as a match
+    /// state.
+    #[inline]
+    pub(crate) const fn to_match(&self) -> LazyStateID {
+        LazyStateID::new_unchecked(
+            self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
+        )
+    }
+
+    /// Return true if and only if this lazy state ID is tagged.
+    ///
+    /// When a lazy state ID is tagged, then one can conclude that it is one
+    /// of a match, start, dead, quit or unknown state.
+    #[inline]
+    pub const fn is_tagged(&self) -> bool {
+        // Every tag bit lies above `MAX`, so any tagged value exceeds it.
+        self.as_usize_unchecked() > LazyStateID::MAX
+    }
+
+    /// Return true if and only if this represents a lazy state ID that is
+    /// "unknown." That is, the state has not yet been created. When a caller
+    /// sees this state ID, it generally means that a state has to be computed
+    /// in order to proceed.
+    #[inline]
+    pub const fn is_unknown(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
+    }
+
+    /// Return true if and only if this represents a dead state. A dead state
+    /// is a state that can never transition to any other state except the
+    /// dead state. When a dead state is seen, it generally indicates that a
+    /// search should stop.
+    #[inline]
+    pub const fn is_dead(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
+    }
+
+    /// Return true if and only if this represents a quit state. A quit state
+    /// is a state that is representationally equivalent to a dead state,
+    /// except it indicates the automaton has reached a point at which it can
+    /// no longer determine whether a match exists or not. In general, this
+    /// indicates an error during search and the caller must either pass this
+    /// error up or use a different search technique.
+    #[inline]
+    pub const fn is_quit(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
+    }
+
+    /// Return true if and only if this lazy state ID has been tagged as a
+    /// start state.
+    #[inline]
+    pub const fn is_start(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_START > 0
+    }
+
+    /// Return true if and only if this lazy state ID has been tagged as a
+    /// match state.
+    #[inline]
+    pub const fn is_match(&self) -> bool {
+        self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
+    }
+}
+
+/// This error occurs when a lazy state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum lazy state ID
+/// value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub(crate) struct LazyStateIDError {
+    // The rejected value, widened to `u64`.
+    attempted: u64,
+}
+
+impl LazyStateIDError {
+    /// Returns the value from which a lazy state ID could not be constructed.
+    pub(crate) fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+// No methods are overridden; the trait's defaults are used as is.
+#[cfg(feature = "std")]
+impl std::error::Error for LazyStateIDError {}
+
+impl core::fmt::Display for LazyStateIDError {
+    /// Reports both the rejected value and the largest value permitted.
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let (attempted, limit) = (self.attempted(), LazyStateID::MAX);
+        write!(
+            f,
+            "failed to create LazyStateID from {:?}, which exceeds {:?}",
+            attempted, limit,
+        )
+    }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+    /// The ID of the state that the search was in when the call terminated.
+    /// When this is a match state, `last_match` must be set to a non-None
+    /// value.
+    ///
+    /// A `None` value indicates the start state of the corresponding
+    /// automaton. We cannot use the actual ID, since any one automaton may
+    /// have many start states, and which one is in use depends on several
+    /// search-time factors.
+    id: Option<LazyStateID>,
+    /// Information associated with a match when `id` corresponds to a match
+    /// state.
+    last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// haystack offset of the match and an index into the matching patterns of
+/// the current match state.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+    /// The index into the matching patterns for the current match state.
+    pub(crate) match_index: usize,
+    /// The offset in the haystack at which the match occurred. This is used
+    /// when reporting multiple matches at the same offset. That is, when
+    /// an overlapping search runs, the first thing it checks is whether it's
+    /// already in a match state, and if so, whether there are more patterns
+    /// to report as matches in that state. If so, it increments `match_index`
+    /// and returns the pattern and this offset. Once `match_index` exceeds the
+    /// number of matching patterns in the current state, the search continues.
+    pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+    /// Create a fresh overlapping state, positioned at the start state of
+    /// whichever automaton it is later used with.
+    pub fn start() -> OverlappingState {
+        Self { last_match: None, id: None }
+    }
+
+    /// Returns the recorded state ID, or `None` if none has been set.
+    pub(crate) fn id(&self) -> Option<LazyStateID> {
+        self.id
+    }
+
+    /// Records `id` as the current state ID.
+    pub(crate) fn set_id(&mut self, id: LazyStateID) {
+        self.id = Some(id);
+    }
+
+    /// Returns mutable access to the last match information, if any has been
+    /// recorded.
+    pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+        Option::as_mut(&mut self.last_match)
+    }
+
+    /// Records `last_match` as the most recent match information.
+    pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+        self.last_match = Some(last_match);
+    }
+}
diff --git a/src/hybrid/mod.rs b/src/hybrid/mod.rs

new file mode 100644 (file)

index 0000000..4c8ca7e
--- /dev/null
+++ b/src/hybrid/mod.rs
@@ -0,0 +1,179 @@
+/*!
+A module for building and searching with lazy deterministic finite automata
+(DFAs).
+
+Like other modules in this crate, lazy DFAs support a rich regex syntax with
+Unicode features. The key feature of a lazy DFA is that it builds itself
+incrementally during search, and never uses more than a configured capacity of
+memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache"
+in which the actual DFA's transition table is stored.
+
+If you're looking for fully compiled DFAs, then please see the top-level
+[`dfa` module](crate::dfa).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using lazy DFAs. This includes iterating over matches with both the
+start and end positions of each match.
+* A [`dfa::DFA`] provides direct low level access to a lazy DFA.
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let mut cache = re.create_cache();
+
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> =
+    re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(0, 0, 10),
+    MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The lazy DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+    re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(1, 0, 4),
+    MultiMatch::must(0, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or use overlapping style searches to find all possible occurrences:
+
+```
+use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch};
+
+// N.B. For overlapping searches, we need the underlying lazy DFA to report all
+// possible matches.
+let re = Regex::builder()
+    .dfa(dfa::Config::new().match_kind(MatchKind::All))
+    .build_many(&[r"\w{3}", r"\S{3}"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+    re.find_overlapping_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+    MultiMatch::must(1, 0, 3),
+    MultiMatch::must(0, 1, 4),
+    MultiMatch::must(1, 1, 4),
+    MultiMatch::must(0, 5, 8),
+    MultiMatch::must(1, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# When should I use this?
+
+Generally speaking, if you can abide the use of mutable state during search,
+and you don't need things like capturing groups or Unicode word boundary
+support in non-ASCII text, then a lazy DFA is likely a robust choice with
+respect to both search speed and memory usage. Note however that its speed
+may be worse than a general purpose regex engine if you don't select a good
+[prefilter](crate::util::prefilter).
+
+If you know ahead of time that your pattern would result in a very large DFA
+if it was fully compiled, it may be better to use an NFA simulation instead
+of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA
+to something that is big enough to hold the state machine (likely through
+experimentation). The issue here is that if the cache is too small, then it
+could wind up being reset too frequently and this might decrease searching
+speed significantly.
+
+# Differences with fully compiled DFAs
+
+A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a
+[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities
+(and similarly for their underlying DFAs), but they achieve them through
+different means. The main difference is that a hybrid or "lazy" regex builds
+its DFA lazily during search, whereas a fully compiled regex will build its
+DFA at construction time. While building a DFA at search time might sound like
+it's slow, it tends to work out where most bytes seen during a search will
+reuse pre-built parts of the DFA and thus can be almost as fast as a fully
+compiled DFA. The main downside is that searching requires mutable space to
+store the DFA, and, in the worst case, a search can result in a new state being
+created for each byte seen, which would make searching quite a bit slower.
+
+A fully compiled DFA never has to worry about searches being slower once
+it's built. (Aside from, say, the transition table being so large that it
+is subject to harsh CPU cache effects.) However, of course, building a full
+DFA can be quite time consuming and memory hungry. Particularly when it's
+so easy to build large DFAs when Unicode mode is enabled.
+
+A lazy DFA strikes a nice balance _in practice_, particularly in the
+presence of Unicode mode, by only building what is needed. It avoids the
+worst case exponential time complexity of DFA compilation by guaranteeing that
+it will only build at most one state per byte searched. While the worst
+case here can lead to a very high constant, it will never be exponential.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the lazy DFAs in this module:
+
+* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
+of them) can only find the offsets of an entire match, but cannot resolve
+the offsets of each capturing group. This is because DFAs do not have the
+expressive power necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on fully
+compiled DFAs.
+
+# Support for `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+One can disable the `std` feature and still use the full API of a lazy DFA.
+(You should use `std` when possible, since it permits providing implementations
+of the `std::error::Error` trait, and does enable some minor internal
+optimizations.)
+
+This module does require at least the `alloc` feature though. It is not
+available in any capacity without `alloc`.
+*/
+
+pub use self::{
+    error::{BuildError, CacheError},
+    id::{LazyStateID, OverlappingState},
+};
+
+pub mod dfa;
+mod error;
+mod id;
+pub mod regex;
+mod search;
diff --git a/src/hybrid/regex.rs b/src/hybrid/regex.rs

new file mode 100644 (file)

index 0000000..7cc6b90
--- /dev/null
+++ b/src/hybrid/regex.rs
@@ -0,0 +1,2124 @@
+/*!
+A lazy DFA backed `Regex`.
+
+This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience
+routines you might have come to expect, such as finding a match and iterating
+over all non-overlapping matches. This `Regex` type is limited in its
+capabilities to what a lazy DFA can provide. Therefore, APIs involving
+capturing groups, for example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::hybrid) for examples.
+*/
+
+use core::borrow::Borrow;
+
+use alloc::boxed::Box;
+
+use crate::{
+    hybrid::{
+        dfa::{self, DFA},
+        error::BuildError,
+        OverlappingState,
+    },
+    nfa::thompson,
+    util::{
+        matchtypes::{MatchError, MatchKind, MultiMatch},
+        prefilter::{self, Prefilter},
+    },
+};
+
+/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
+/// for searching.
+///
+/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of
+/// a match while the reverse DFA is responsible for detecting the start
+/// of a match. Thus, in order to find the bounds of any given match, a
+/// forward search must first be run followed by a reverse search. A match
+/// found by the forward DFA guarantees that the reverse DFA will also find
+/// a match.
+///
+/// A `Regex` can also have a prefilter set via the
+/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is
+/// enabled.
+///
+/// # Earliest vs Leftmost vs Overlapping
+///
+/// The search routines exposed on a `Regex` reflect three different ways
+/// of searching:
+///
+/// * "earliest" means to stop as soon as a match has been detected.
+/// * "leftmost" means to continue matching until the underlying
+///   automaton cannot advance. This reflects "standard" searching you
+///   might be used to in other regex engines. e.g., This permits
+///   non-greedy and greedy searching to work as you would expect.
+/// * "overlapping" means to find all possible matches, even if they
+///   overlap.
+///
+/// Generally speaking, when doing an overlapping search, you'll want to
+/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using
+/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
+/// likely to lead to odd behavior since `LeftmostFirst` specifically omits
+/// some matches that can never be reported due to its semantics.
+///
+/// The following example shows the differences between how these different
+/// types of searches impact looking for matches of `[a-z]+` in the
+/// haystack `abc`.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch};
+///
+/// let pattern = r"[a-z]+";
+/// let haystack = "abc".as_bytes();
+///
+/// // With leftmost-first semantics, we test "earliest" and "leftmost".
+/// let re = regex::Builder::new()
+///     .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst))
+///     .build(pattern)?;
+/// let mut cache = re.create_cache();
+///
+/// // "earliest" searching isn't impacted by greediness
+/// let mut it = re.find_earliest_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// // "leftmost" searching supports greediness (and non-greediness)
+/// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// // For overlapping, we want "all" match kind semantics.
+/// let re = regex::Builder::new()
+///     .dfa(dfa::Config::new().match_kind(MatchKind::All))
+///     .build(pattern)?;
+/// let mut cache = re.create_cache();
+///
+/// // In the overlapping search, we find all three possible matches
+/// // starting at the beginning of the haystack.
+/// let mut it = re.find_overlapping_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Fallibility
+///
+/// In non-default configurations, the lazy DFAs generated in this module may
+/// return an error during a search. (Currently, the only way this happens is
+/// if quit bytes are added, Unicode word boundaries are heuristically enabled,
+/// or if the cache is configured to "give up" on a search if it has been
+/// cleared too many times. All of these are turned off by default, which means
+/// a search can never fail in the default configuration.) For convenience,
+/// the main search routines, like [`find_leftmost`](Regex::find_leftmost),
+/// will panic if an error occurs. However, if you need to use DFAs which may
+/// produce an error at search time, then there are fallible equivalents of
+/// all search routines. For example, for `find_leftmost`, its fallible analog
+/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed
+/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, whereas the
+/// infallible routines simply return `Option<MultiMatch>`.
+///
+/// # Example
+///
+/// This example shows how to cause a search to terminate if it sees a
+/// `\n` byte, and handle the error returned. This could be useful if, for
+/// example, you wanted to prevent a user supplied pattern from matching
+/// across a line boundary.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError};
+///
+/// let re = Regex::builder()
+///     .dfa(dfa::Config::new().quit(b'\n', true))
+///     .build(r"foo\p{any}+bar")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = "foo\nbar".as_bytes();
+/// // Normally this would produce a match, since \p{any} contains '\n'.
+/// // But since we instructed the automaton to enter a quit state if a
+/// // '\n' is observed, this produces a match error instead.
+/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err();
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Debug)]
+pub struct Regex {
+    /// An optional prefilter that is passed down to the lazy DFA search
+    /// routines when present. By default, no prefilter is set.
+    pre: Option<Box<dyn Prefilter>>,
+    /// The forward lazy DFA. This can only find the end of a match.
+    forward: DFA,
+    /// The reverse lazy DFA. This can only find the start of a match.
+    ///
+    /// This is built with 'all' match semantics (instead of leftmost-first)
+    /// so that it always finds the longest possible match (which corresponds
+    /// to the leftmost starting position). It is also compiled as an anchored
+    /// matcher and has 'starts_for_each_pattern' enabled. Including starting
+    /// states for each pattern is necessary to ensure that we only look for
+    /// matches of a pattern that matched in the forward direction. Otherwise,
+    /// we might wind up finding the "leftmost" starting position of a totally
+    /// different pattern!
+    reverse: DFA,
+    /// Whether iterators on this type should advance by one codepoint or one
+    /// byte when an empty match is seen. When UTF-8 mode is disabled, the
+    /// next search begins at the position immediately following the empty
+    /// match instead of at the next UTF-8 encoded codepoint.
+    utf8: bool,
+}
+
+/// Convenience routines for regex and cache construction.
+impl Regex {
+    /// Parse the given regular expression using the default configuration and
+    /// return the corresponding regex.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 14)),
+    ///     re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+        // Default configuration: a fresh builder with nothing customized.
+        Self::builder().build(pattern)
+    }
+
+    /// Like `new`, but parses multiple patterns into a single "regex set."
+    /// This similarly uses the default regex configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let mut it = re.find_leftmost_iter(
+    ///     &mut cache,
+    ///     b"abc 1 foo 4567 0 quux",
+    /// );
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+    /// assert_eq!(None, it.next());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many<P: AsRef<str>>(
+        patterns: &[P],
+    ) -> Result<Regex, BuildError> {
+        // Same defaults as `new`, but compiles all patterns into one regex.
+        Self::builder().build_many(patterns)
+    }
+
+    /// Return a default configuration for a `Regex`.
+    ///
+    /// This is a convenience routine to avoid needing to import the `Config`
+    /// type when customizing the construction of a regex.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to disable UTF-8 mode for `Regex` iteration.
+    /// When UTF-8 mode is disabled, the position immediately following an
+    /// empty match is where the next search begins, instead of the next
+    /// position of a UTF-8 encoded codepoint.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn config() -> Config {
+        // Plain delegation: lets callers write `Regex::config()` without
+        // importing `Config` themselves.
+        Config::new()
+    }
+
+    /// Return a builder for configuring the construction of a `Regex`.
+    ///
+    /// This is a convenience routine to avoid needing to import the
+    /// [`Builder`] type in common cases.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use the builder to disable UTF-8 mode
+    /// everywhere.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::regex::Regex,
+    ///     nfa::thompson,
+    ///     MultiMatch, SyntaxConfig,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .syntax(SyntaxConfig::new().utf8(false))
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build(r"foo(?-u:[^b])ar.*")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+    /// let expected = Some(MultiMatch::must(0, 1, 9));
+    /// let got = re.find_leftmost(&mut cache, haystack);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn builder() -> Builder {
+        // Plain delegation: lets callers write `Regex::builder()` without
+        // importing `Builder` themselves.
+        Builder::new()
+    }
+
+    /// Create a new cache for this `Regex`.
+    ///
+    /// The cache returned should only be used for searches for this
+    /// `Regex`. If you want to reuse the cache for another `Regex`, then
+    /// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
+    /// [`Regex::reset_cache`]).
+    pub fn create_cache(&self) -> Cache {
+        // The cache is constructed from this `Regex` and is only meant to be
+        // used with it until it is reset for another one.
+        Cache::new(self)
+    }
+
+    /// Reset the given cache such that it can be used for searching with
+    /// this `Regex` (and only this `Regex`).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `Regex`.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// `Regex` has been configured to "give up" after it has cleared the cache
+    /// a certain number of times.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `Regex`.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re1 = Regex::new(r"\w")?;
+    /// let re2 = Regex::new(r"\W")?;
+    ///
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 2)),
+    ///     re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+    /// );
+    ///
+    /// // Using 'cache' with re2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the Regex we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 're1' is also not
+    /// // allowed.
+    /// re2.reset_cache(&mut cache);
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re2.find_leftmost(&mut cache, "☃".as_bytes()),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset_cache(&self, cache: &mut Cache) {
+        // Reset the forward and then the reverse lazy DFA cache, each against
+        // the DFA it serves.
+        let fwd_cache = &mut cache.forward;
+        self.forward().reset_cache(fwd_cache);
+        let rev_cache = &mut cache.reverse;
+        self.reverse().reset_cache(rev_cache);
+    }
+}
+
+/// Standard infallible search routines for finding and iterating over matches.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match`](Regex::try_is_match).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::hybrid::regex::Regex;
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar"));
+    /// assert_eq!(false, re.is_match(&mut cache, b"foobar"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool {
+        // Infallible facade over `try_is_match`: an error becomes a panic.
+        let outcome = self.try_is_match(cache, haystack);
+        outcome.unwrap()
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest`](Regex::try_find_earliest).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// // Normally, the leftmost first match would greedily consume as many
+    /// // decimal digits as it could. But a match is detected as soon as one
+    /// // digit is seen.
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 4)),
+    ///     re.find_earliest(&mut cache, b"foo12345"),
+    /// );
+    ///
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the "earliest" match semantics detect a match earlier.
+    /// let re = Regex::new("abc|a")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 1)),
+    ///     re.find_earliest(&mut cache, b"abc"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_earliest(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Option<MultiMatch> {
+        // Infallible facade over `try_find_earliest`: an error becomes a
+        // panic.
+        let outcome = self.try_find_earliest(cache, haystack);
+        outcome.unwrap()
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost`](Regex::try_find_leftmost).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// // Greediness is applied appropriately when compared to find_earliest.
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 11)),
+    ///     re.find_leftmost(&mut cache, b"zzzfoo12345zzz"),
+    /// );
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the default leftmost-first match semantics demand that we find the
+    /// // earliest match that prefers earlier parts of the pattern over later
+    /// // parts.
+    /// let re = Regex::new("abc|a")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re.find_leftmost(&mut cache, b"abc"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_leftmost(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Option<MultiMatch> {
+        // Infallible facade over `try_find_leftmost`: an error becomes a
+        // panic.
+        let outcome = self.try_find_leftmost(cache, haystack);
+        outcome.unwrap()
+    }
+
+    /// Search for the first overlapping match in `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping`](Regex::try_find_overlapping).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an overlapping search with multiple
+    /// regexes.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, regex::Regex, OverlappingState},
+    ///     MatchKind,
+    ///     MultiMatch,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .dfa(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(MultiMatch::must(1, 0, 4));
+    /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(MultiMatch::must(0, 1, 4));
+    /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_overlapping(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        state: &mut OverlappingState,
+    ) -> Option<MultiMatch> {
+        // Infallible facade over `try_find_overlapping`: an error becomes a
+        // panic. The caller-owned `state` is passed through unchanged.
+        let outcome = self.try_find_overlapping(cache, haystack, state);
+        outcome.unwrap()
+    }
+
+    /// Returns an iterator over all non-overlapping "earliest" matches.
+    ///
+    /// Match positions are reported as soon as a match is known to occur, even
+    /// if the standard leftmost match would be longer.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an "earliest" iterator.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::new("[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    /// let haystack = "123".as_bytes();
+    ///
+    /// // Normally, a standard leftmost iterator would return a single
+    /// // match, but since "earliest" detects matches earlier, we get
+    /// // three matches.
+    /// let mut it = re.find_earliest_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_earliest_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindEarliestMatches<'r, 'c, 't> {
+        // The iterator borrows the regex, the cache and the haystack, hence
+        // the three independent lifetime parameters on the return type.
+        FindEarliestMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost matches in the
+    /// given bytes. If no match exists, then the iterator yields no elements.
+    ///
+    /// This corresponds to the "standard" regex search iterator.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let text = b"foo1 foo12 foo123";
+    /// let matches: Vec<MultiMatch> = re
+    ///     .find_leftmost_iter(&mut cache, text)
+    ///     .collect();
+    /// assert_eq!(matches, vec![
+    ///     MultiMatch::must(0, 0, 4),
+    ///     MultiMatch::must(0, 5, 10),
+    ///     MultiMatch::must(0, 11, 17),
+    /// ]);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_leftmost_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 'c, 't> {
+        // The cache is borrowed mutably for the iterator's whole lifetime
+        // (`'c`), so it cannot be used for another search until the iterator
+        // is dropped.
+        FindLeftmostMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all overlapping matches in the given haystack.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// The iterator takes care of handling the overlapping state that must be
+    /// threaded through every search.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an overlapping search with multiple
+    /// regexes.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, regex::Regex},
+    ///     MatchKind,
+    ///     MultiMatch,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .dfa(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let mut cache = re.create_cache();
+    /// let haystack = "@foo".as_bytes();
+    ///
+    /// let mut it = re.find_overlapping_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_overlapping_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindOverlappingMatches<'r, 'c, 't> {
+        // Unlike `find_overlapping_at`, no `OverlappingState` is passed in:
+        // as documented above, the iterator manages that state itself.
+        FindOverlappingMatches::new(self, cache, haystack)
+    }
+}
+
+/// Lower level infallible search routines that permit controlling where
+/// the search starts and ends in a particular sequence. This is useful for
+/// executing searches that need to take surrounding context into account. This
+/// is required for correctly implementing iteration because of look-around
+/// operators (`^`, `$`, `\b`).
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match_at`](Regex::try_is_match_at).
+    pub fn is_match_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> bool {
+        self.try_is_match_at(cache, haystack, start, end).unwrap()
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
+    pub fn find_earliest_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Option<MultiMatch> {
+        self.try_find_earliest_at(cache, haystack, start, end).unwrap()
+    }
+
+    /// Returns the same as `find_leftmost`, but starts the search at the given
+    /// offset.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches within the
+    /// same haystack, which cannot be done correctly by simply providing a
+    /// subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
+    pub fn find_leftmost_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Option<MultiMatch> {
+        self.try_find_leftmost_at(cache, haystack, start, end).unwrap()
+    }
+
+    /// Search for the first overlapping match within a given range of
+    /// `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
+    pub fn find_overlapping_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Option<MultiMatch> {
+        self.try_find_overlapping_at(cache, haystack, start, end, state)
+            .unwrap()
+    }
+}
+
+/// Fallible search routines. These may return an error when the underlying
+/// lazy DFAs have been configured in a way that permits them to fail during a
+/// search.
+///
+/// Errors during search only occur when the lazy DFA has been explicitly
+/// configured to do so, usually by specifying one or more "quit" bytes, by
+/// heuristically enabling Unicode word boundaries or by limiting the number
+/// of times the lazy DFA's cache may be cleared.
+///
+/// Errors will never be returned using the default configuration. So these
+/// fallible routines are only needed for particular configurations.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`is_match`](Regex::is_match).
+    pub fn try_is_match(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Result<bool, MatchError> {
+        self.try_is_match_at(cache, haystack, 0, haystack.len())
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest`](Regex::find_earliest).
+    pub fn try_find_earliest(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_earliest_at(cache, haystack, 0, haystack.len())
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost`](Regex::find_leftmost).
+    pub fn try_find_leftmost(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_leftmost_at(cache, haystack, 0, haystack.len())
+    }
+
+    /// Search for the first overlapping match in `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping`](Regex::find_overlapping).
+    pub fn try_find_overlapping(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state)
+    }
+
+    /// Returns an iterator over all non-overlapping "earliest" matches.
+    ///
+    /// Match positions are reported as soon as a match is known to occur, even
+    /// if the standard leftmost match would be longer.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest_iter`](Regex::find_earliest_iter).
+    pub fn try_find_earliest_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> TryFindEarliestMatches<'r, 'c, 't> {
+        // As documented above, search errors are yielded by the iterator
+        // rather than raised as a panic.
+        TryFindEarliestMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost matches in the
+    /// given bytes. If no match exists, then the iterator yields no elements.
+    ///
+    /// This corresponds to the "standard" regex search iterator.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
+    pub fn try_find_leftmost_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> TryFindLeftmostMatches<'r, 'c, 't> {
+        // As documented above, search errors are yielded by the iterator
+        // rather than raised as a panic.
+        TryFindLeftmostMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all overlapping matches in the given haystack.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// The iterator takes care of handling the overlapping state that must be
+    /// threaded through every search.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
+    pub fn try_find_overlapping_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> TryFindOverlappingMatches<'r, 'c, 't> {
+        // No `OverlappingState` is passed in: as documented above, the
+        // iterator manages that state itself. Search errors are yielded by
+        // the iterator rather than raised as a panic.
+        TryFindOverlappingMatches::new(self, cache, haystack)
+    }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`is_match_at`](Regex::is_match_at).
+    pub fn try_is_match_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<bool, MatchError> {
+        self.forward()
+            .find_leftmost_fwd_at(
+                &mut cache.forward,
+                self.scanner().as_mut(),
+                None,
+                haystack,
+                start,
+                end,
+            )
+            .map(|x| x.is_some())
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest_at`](Regex::find_earliest_at).
+    pub fn try_find_earliest_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_earliest_at_imp(
+            self.scanner().as_mut(),
+            cache,
+            haystack,
+            start,
+            end,
+        )
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost_at`](Regex::find_leftmost_at).
+    pub fn try_find_leftmost_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_leftmost_at_imp(
+            self.scanner().as_mut(),
+            cache,
+            haystack,
+            start,
+            end,
+        )
+    }
+
+    /// Search for the first overlapping match within a given range of
+    /// `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&input[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping_at`](Regex::find_overlapping_at).
+    pub fn try_find_overlapping_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_overlapping_at_imp(
+            self.scanner().as_mut(),
+            cache,
+            haystack,
+            start,
+            end,
+            state,
+        )
+    }
+}
+
+impl Regex {
+    #[inline(always)]
+    fn try_find_earliest_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        let (fdfa, rdfa) = (self.forward(), self.reverse());
+        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+        let end = match fdfa
+            .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)?
+        {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // N.B. The only time we need to tell the reverse searcher the pattern
+        // to match is in the overlapping case, since it's ambiguous. In the
+        // earliest case, I have tentatively convinced myself that it isn't
+        // necessary and the reverse search will always find the same pattern
+        // to match as the forward search. But I lack a rigorous proof. Why not
+        // just provide the pattern anyway? Well, if it is needed, then leaving
+        // it out gives us a chance to find a witness.
+        let start = rdfa
+            .find_earliest_rev_at(rcache, None, haystack, start, end.offset())?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+
+    #[inline(always)]
+    fn try_find_leftmost_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        let (fdfa, rdfa) = (self.forward(), self.reverse());
+        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+        let end = match fdfa
+            .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)?
+        {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // N.B. The only time we need to tell the reverse searcher the pattern
+        // to match is in the overlapping case, since it's ambiguous. In the
+        // leftmost case, I have tentatively convinced myself that it isn't
+        // necessary and the reverse search will always find the same pattern
+        // to match as the forward search. But I lack a rigorous proof. Why not
+        // just provide the pattern anyway? Well, if it is needed, then leaving
+        // it out gives us a chance to find a witness.
+        let start = rdfa
+            .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+
+    #[inline(always)]
+    fn try_find_overlapping_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        let (fdfa, rdfa) = (self.forward(), self.reverse());
+        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+        let end = match fdfa.find_overlapping_fwd_at(
+            fcache, pre, None, haystack, start, end, state,
+        )? {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // Unlike the leftmost cases, the reverse overlapping search may match
+        // a different pattern than the forward search. See test failures when
+        // using `None` instead of `Some(end.pattern())` below. Thus, we must
+        // run our reverse search using the pattern that matched in the forward
+        // direction.
+        let start = rdfa
+            .find_leftmost_rev_at(
+                rcache,
+                Some(end.pattern()),
+                haystack,
+                0,
+                end.offset(),
+            )?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+}
+
+/// Non-search APIs for querying information about the regex and setting a
+/// prefilter.
+impl Regex {
+    /// Return the underlying lazy DFA responsible for forward matching.
+    ///
+    /// This is useful for accessing the underlying lazy DFA and using it
+    /// directly if the situation calls for it.
+    ///
+    /// The DFA is returned by reference; it remains owned by this regex.
+    pub fn forward(&self) -> &DFA {
+        &self.forward
+    }
+
+    /// Return the underlying lazy DFA responsible for reverse matching.
+    ///
+    /// This is useful for accessing the underlying lazy DFA and using it
+    /// directly if the situation calls for it.
+    pub fn reverse(&self) -> &DFA {
+        &self.reverse
+    }
+
+    /// Returns the total number of patterns matched by this regex.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+    /// assert_eq!(3, re.pattern_count());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn pattern_count(&self) -> usize {
+        assert_eq!(
+            self.forward().pattern_count(),
+            self.reverse().pattern_count()
+        );
+        self.forward().pattern_count()
+    }
+
+    /// Convenience function for returning this regex's prefilter as a trait
+    /// object.
+    ///
+    /// If this regex doesn't have a prefilter, then `None` is returned.
+    pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+        self.pre.as_ref().map(|x| &**x)
+    }
+
+    /// Attach the given prefilter to this regex.
+    pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) {
+        self.pre = pre;
+    }
+
+    /// Convenience function for returning a prefilter scanner.
+    fn scanner(&self) -> Option<prefilter::Scanner> {
+        self.prefilter().map(prefilter::Scanner::new)
+    }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>);
+
+impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> {
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindEarliestMatches<'r, 'c, 't> {
+        FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text))
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> {
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>);
+
+impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 'c, 't> {
+        FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text))
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all overlapping matches for a particular infallible
+/// search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindOverlappingMatches<'r, 'c, 't>(
+    TryFindOverlappingMatches<'r, 'c, 't>,
+);
+
+impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> {
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindOverlappingMatches<'r, 'c, 't> {
+        FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text))
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> {
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindEarliestMatches<'r, 'c, 't> {
+    re: &'r Regex,
+    cache: &'c mut Cache,
+    scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    last_end: usize,
+    last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> {
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> TryFindEarliestMatches<'r, 'c, 't> {
+        let scanner = re.scanner();
+        TryFindEarliestMatches {
+            re,
+            cache,
+            scanner,
+            text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> {
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_earliest_at_imp(
+            self.scanner.as_mut(),
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.re.utf8 {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(Ok(m))
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindLeftmostMatches<'r, 'c, 't> {
+    re: &'r Regex,
+    cache: &'c mut Cache,
+    scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    last_end: usize,
+    last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> {
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> TryFindLeftmostMatches<'r, 'c, 't> {
+        let scanner = re.scanner();
+        TryFindLeftmostMatches {
+            re,
+            cache,
+            scanner,
+            text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> {
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_leftmost_at_imp(
+            self.scanner.as_mut(),
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.re.utf8 {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(Ok(m))
+    }
+}
+
+/// An iterator over all overlapping matches for a particular fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindOverlappingMatches<'r, 'c, 't> {
+    re: &'r Regex,
+    cache: &'c mut Cache,
+    scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    last_end: usize,
+    state: OverlappingState,
+}
+
+impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> {
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> TryFindOverlappingMatches<'r, 'c, 't> {
+        let scanner = re.scanner();
+        TryFindOverlappingMatches {
+            re,
+            cache,
+            scanner,
+            text,
+            last_end: 0,
+            state: OverlappingState::start(),
+        }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> {
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_overlapping_at_imp(
+            self.scanner.as_mut(),
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+            &mut self.state,
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        // Unlike the non-overlapping case, we're OK with empty matches at this
+        // level. In particular, the overlapping search algorithm is itself
+        // responsible for ensuring that progress is always made.
+        self.last_end = m.end();
+        Some(Ok(m))
+    }
+}
+
+/// A cache represents a partially computed forward and reverse DFA.
+///
+/// A cache is the key component that differentiates a classical DFA and a
+/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
+/// complete transition table that can handle all possible inputs, a hybrid
+/// NFA/DFA starts with an empty transition table and builds only the parts
+/// required during search. The parts that are built are stored in a cache. For
+/// this reason, a cache is a required parameter for nearly every operation on
+/// a [`Regex`].
+///
+/// Caches can be created from their corresponding `Regex` via
+/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
+/// that created it, or the `Regex` that was most recently used to reset it
+/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
+/// panics or incorrect results.
+#[derive(Debug, Clone)]
+pub struct Cache {
+    forward: dfa::Cache,
+    reverse: dfa::Cache,
+}
+
+impl Cache {
+    /// Create a new cache for the given `Regex`.
+    ///
+    /// The cache returned should only be used for searches for the given
+    /// `Regex`. If you want to reuse the cache for another `Regex`, then you
+    /// must call [`Cache::reset`] with that `Regex`.
+    pub fn new(re: &Regex) -> Cache {
+        let forward = dfa::Cache::new(re.forward());
+        let reverse = dfa::Cache::new(re.reverse());
+        Cache { forward, reverse }
+    }
+
+    /// Reset this cache such that it can be used for searching with the given
+    /// `Regex` (and only that `Regex`).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `Regex`.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// `Regex` has been configured to "give up" after it has cleared the cache
+    /// a certain number of times.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `Regex`.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re1 = Regex::new(r"\w")?;
+    /// let re2 = Regex::new(r"\W")?;
+    ///
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 2)),
+    ///     re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+    /// );
+    ///
+    /// // Using 'cache' with re2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the Regex we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 're1' is also not
+    /// // allowed.
+    /// cache.reset(&re2);
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re2.find_leftmost(&mut cache, "☃".as_bytes()),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset(&mut self, re: &Regex) {
+        self.forward.reset(re.forward());
+        self.reverse.reset(re.reverse());
+    }
+
+    /// Returns the heap memory usage, in bytes, as a sum of the forward and
+    /// reverse lazy DFA caches.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub fn memory_usage(&self) -> usize {
+        self.forward.memory_usage() + self.reverse.memory_usage()
+    }
+
+    /// Return references to the forward and reverse caches, respectively.
+    pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
+        (&self.forward, &self.reverse)
+    }
+
+    /// Return mutable references to the forward and reverse caches,
+    /// respectively.
+    pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
+        (&mut self.forward, &mut self.reverse)
+    }
+}
+
+/// The configuration used for compiling a hybrid NFA/DFA regex.
+///
+/// A regex configuration is a simple data object that is typically used with
+/// [`Builder::configure`].
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    utf8: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default regex compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Whether to enable UTF-8 mode or not.
+    ///
+    /// When UTF-8 mode is enabled (the default) and an empty match is seen,
+    /// the iterators on [`Regex`] will always start the next search at the
+    /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
+    /// mode is disabled, such searches are begun at the next byte offset.
+    ///
+    /// If this mode is enabled and invalid UTF-8 is given to search, then
+    /// behavior is unspecified.
+    ///
+    /// Generally speaking, one should enable this when
+    /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
+    /// and
+    /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+    /// are enabled, and disable it otherwise.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates the differences between when this option is
+    /// enabled and disabled. The differences only arise when the regex can
+    /// return matches of length zero.
+    ///
+    /// In this first snippet, we show the results when UTF-8 mode is disabled.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And in this snippet, we execute the same search on the same haystack,
+    /// but with UTF-8 mode enabled. Notice that byte offsets that would
+    /// otherwise split the encoding of `☃` are not returned.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(true))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    /// Returns true if and only if this configuration has UTF-8 mode enabled.
+    ///
+    /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
+    /// [`Regex`] will always start the next search at the next UTF-8 encoded
+    /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
+    /// next byte offset.
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    /// Overwrite the default configuration such that the options in `o` are
+    /// always used. If an option in `o` is not set, then the corresponding
+    /// option in `self` is used. If it's not set in `self` either, then it
+    /// remains not set.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config { utf8: o.utf8.or(self.utf8) }
+    }
+}
+
+/// A builder for a regex based on a hybrid NFA/DFA.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the lazy DFA construction and finally the regex searching
+/// itself. This builder is different from a general purpose regex builder
+/// in that it permits fine-grained configuration of the construction process.
+/// The trade-off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are three
+/// different UTF-8 modes:
+///
+/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
+/// pattern itself can contain sub-expressions that match invalid UTF-8.
+/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+/// controls whether the implicit unanchored prefix added to the NFA can
+/// match through invalid UTF-8 or not.
+/// * [`Config::utf8`] controls how the regex iterators themselves advance
+/// the starting position of the next search when a match with zero length is
+/// found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two hybrid NFA/DFAs,
+/// where one is responsible for finding the end of a match and the other is
+/// responsible for finding the start of a match. If you only need to detect
+/// whether something matched, or only the end of a match, then you should use
+/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
+/// than building two of them.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
+/// the regex itself. This is generally what you want for matching on
+/// arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// };
+///
+/// let re = Regex::builder()
+///     .configure(Regex::config().utf8(false))
+///     .syntax(SyntaxConfig::new().utf8(false))
+///     .thompson(thompson::Config::new().utf8(false))
+///     .build(r"foo(?-u:[^b])ar.*")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(MultiMatch::must(0, 1, 9));
+/// let got = re.find_leftmost(&mut cache, haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this. Notice also that the
+/// // search was unanchored and skipped over invalid UTF-8.
+/// // Disabling UTF-8 on the Thompson NFA permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+    // Regex-level options, merged in by `Builder::configure`.
+    config: Config,
+    // Builder for the underlying lazy DFAs. The syntax, Thompson NFA and lazy
+    // DFA configurations given to this builder are all forwarded to it.
+    dfa: dfa::Builder,
+}
+
+impl Builder {
+    /// Create a new regex builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), dfa: DFA::builder() }
+    }
+
+    /// Build a regex from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a regex from the given patterns.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<Regex, BuildError> {
+        let forward = self.dfa.build_many(patterns)?;
+        let reverse = self
+            .dfa
+            .clone()
+            .configure(
+                DFA::config()
+                    .anchored(true)
+                    .match_kind(MatchKind::All)
+                    .starts_for_each_pattern(true),
+            )
+            .thompson(thompson::Config::new().reverse(true))
+            .build_many(patterns)?;
+        Ok(self.build_from_dfas(forward, reverse))
+    }
+
+    /// Build a regex from its component forward and reverse hybrid NFA/DFAs.
+    fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
+        // The congruous method on DFA-backed regexes is exposed, but it's
+        // not clear this builder is useful here since lazy DFAs can't be
+        // serialized and there is only one type of them.
+        let utf8 = self.config.get_utf8();
+        Regex { pre: None, forward, reverse, utf8 }
+    }
+
+    /// Apply the given regex configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.dfa.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](thompson::Config).
+    ///
+    /// This permits setting things like whether additional time should be
+    /// spent shrinking the size of the NFA.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.dfa.thompson(config);
+        self
+    }
+
+    /// Set the lazy DFA compilation configuration for this builder using
+    /// [`dfa::Config`](dfa::Config).
+    ///
+    /// This permits setting things like whether Unicode word boundaries should
+    /// be heuristically supported or settings how the behavior of the cache.
+    pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
+        self.dfa.configure(config);
+        self
+    }
+}
+
+impl Default for Builder {
+    fn default() -> Builder {
+        Builder::new()
+    }
+}
+
+#[inline(always)]
+fn next_unwrap(
+    item: Option<Result<MultiMatch, MatchError>>,
+) -> Option<MultiMatch> {
+    match item {
+        None => None,
+        Some(Ok(m)) => Some(m),
+        Some(Err(err)) => panic!(
+            "unexpected regex search error: {}\n\
+             to handle search errors, use try_ methods",
+            err,
+        ),
+    }
+}
diff --git a/src/hybrid/search.rs b/src/hybrid/search.rs

new file mode 100644 (file)

index 0000000..92760ce
--- /dev/null
+++ b/src/hybrid/search.rs
@@ -0,0 +1,663 @@
+use crate::{
+    hybrid::{
+        dfa::{Cache, DFA},
+        id::{LazyStateID, OverlappingState, StateMatch},
+    },
+    nfa::thompson,
+    util::{
+        id::PatternID,
+        matchtypes::{HalfMatch, MatchError},
+        prefilter, MATCH_OFFSET,
+    },
+};
+
+/// Forward search that stops at the first match state it enters.
+///
+/// The prefilter is only forwarded when no pattern ID is given, because a
+/// search for a specific pattern is always anchored.
+#[inline(never)]
+pub(crate) fn find_earliest_fwd(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Two separate call sites are kept on purpose: 'find_fwd' is inlined
+    // into each of them, one with a prefilter and one without.
+    match (pre, pattern_id) {
+        (Some(scanner), None) => find_fwd(
+            Some(scanner),
+            true,
+            dfa,
+            cache,
+            pattern_id,
+            bytes,
+            start,
+            end,
+        ),
+        _ => find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end),
+    }
+}
+
+/// Forward search that keeps going after a match state is entered and
+/// reports the last match seen.
+///
+/// The prefilter is only forwarded when no pattern ID is given, because a
+/// search for a specific pattern is always anchored.
+#[inline(never)]
+pub(crate) fn find_leftmost_fwd(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Two separate call sites are kept on purpose: 'find_fwd' is inlined
+    // into each of them, one with a prefilter and one without.
+    match (pre, pattern_id) {
+        (Some(scanner), None) => find_fwd(
+            Some(scanner),
+            false,
+            dfa,
+            cache,
+            pattern_id,
+            bytes,
+            start,
+            end,
+        ),
+        _ => find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end),
+    }
+}
+
+/// Runs a forward search over `haystack[start..end]` using the lazy DFA.
+///
+/// When `earliest` is true, the search returns as soon as a match state is
+/// entered. Otherwise it keeps scanning and returns the last match recorded
+/// once a dead state is entered or the end of the search range is reached.
+/// After the main loop, `eoi_fwd` performs one final transition (on the byte
+/// at `end` if there is one, or on end-of-input otherwise) and its match, if
+/// any, takes precedence over the last match recorded by the loop.
+///
+/// If `pre` is given, the prefilter is used to skip ahead to candidate
+/// positions, both before the loop starts and whenever a start state is
+/// entered.
+///
+/// # Errors
+///
+/// Returns a "gave up" error when computing a transition fails, and a "quit"
+/// error when a quit state is entered before any match has been recorded.
+///
+/// # Panics
+///
+/// Panics if `start > end` or if either bound exceeds `haystack.len()`.
+#[inline(always)]
+fn find_fwd(
+    mut pre: Option<&mut prefilter::Scanner>,
+    earliest: bool,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    haystack: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= haystack.len());
+    assert!(end <= haystack.len());
+
+    // Why do this? This lets 'bytes[at]' work without bounds checks below.
+    // It seems the assert on 'end <= haystack.len()' above is otherwise
+    // not enough. Why not just make 'bytes' scoped this way anyway? Well,
+    // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
+    // for resolving look-ahead.
+    let bytes = &haystack[..end];
+
+    let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
+    let mut last_match = None;
+    let mut at = start;
+    if let Some(ref mut pre) = pre {
+        // If a prefilter doesn't report false positives, then we don't need to
+        // touch the DFA at all. However, since all matches include the pattern
+        // ID, and the prefilter infrastructure doesn't report pattern IDs, we
+        // limit this optimization to cases where there is exactly one pattern.
+        // In that case, any match must be the 0th pattern.
+        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
+            return Ok(pre.next_candidate(bytes, at).into_option().map(
+                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
+            ));
+        } else if pre.is_effective(at) {
+            match pre.next_candidate(bytes, at).into_option() {
+                None => return Ok(None),
+                Some(i) => {
+                    at = i;
+                }
+            }
+        }
+    }
+    while at < end {
+        if sid.is_tagged() {
+            sid = dfa
+                .next_state(cache, sid, bytes[at])
+                .map_err(|_| gave_up(at))?;
+            at += 1;
+        } else {
+            // SAFETY: There are two safety invariants we need to uphold
+            // here in the loop below: that 'sid' is a valid state ID for
+            // this DFA, and that 'at' is a valid index into 'bytes'. For
+            // the former, we rely on the invariant that next_state* and
+            // start_state_forward always returns a valid state ID (given a
+            // valid state ID in the former case), and that we are only at this
+            // place in the code if 'sid' is untagged. Moreover, every call to
+            // next_state_untagged_unchecked below is guarded by a check that
+            // sid is untagged. For the latter safety invariant, we always
+            // guard unchecked access with a check that 'at' is less than
+            // 'end', where 'end == bytes.len()'.
+            //
+            // For justification, this gives us a ~10% bump in search time.
+            // This was used for a benchmark:
+            //
+            //     regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
+            //
+            // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
+            // used OpenSubtitles2018.raw.sample.medium.en.
+            let mut prev_sid = sid;
+            while at < end {
+                prev_sid = sid;
+                sid = unsafe {
+                    dfa.next_state_untagged_unchecked(
+                        cache,
+                        sid,
+                        *bytes.get_unchecked(at),
+                    )
+                };
+                at += 1;
+                if sid.is_tagged() {
+                    break;
+                }
+                // SAFETY: we make four unguarded accesses to 'bytes[at]'
+                // below, and each are safe because we know that 'at + 4' is
+                // in bounds. Moreover, while we don't check whether 'sid' is
+                // untagged directly, we know it is because of the check above.
+                // And the unrolled loop below quits when the next state is not
+                // equal to the previous state.
+                //
+                // PERF: For justification for eliminating bounds checks,
+                // see above. For justification for the unrolling, we use
+                // two tests. The one above with regex '(?m)^.+$', and also
+                // '(?m)^.{40}$'. The former is kinda the best case for
+                // unrolling, and gives a 1.67 boost primarily because the DFA
+                // spends most of its time munching through the input in the
+                // same state. But the latter pattern rarely spends time in the
+                // same state through subsequent transitions, so unrolling is
+                // pretty much always ineffective in that it craps out on the
+                // first 'sid != next' check below. However, without unrolling,
+                // search is only 1.03 times faster than with unrolling on the
+                // latter pattern, which we deem to be an acceptable loss in
+                // favor of optimizing the more common case of having a "hot"
+                // state somewhere in the DFA.
+                while at + 4 < end {
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                }
+            }
+            // The unchecked transition above can yield an unknown state. In
+            // that case, redo the last transition from the previous state
+            // with the checked 'next_state', which can report failure.
+            if sid.is_unknown() {
+                sid = dfa
+                    .next_state(cache, prev_sid, bytes[at - 1])
+                    .map_err(|_| gave_up(at - 1))?;
+            }
+        }
+        if sid.is_tagged() {
+            if sid.is_start() {
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                }
+            } else if sid.is_match() {
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset: at - MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+            } else if sid.is_dead() {
+                return Ok(last_match);
+            } else if sid.is_quit() {
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                // 'at' was already advanced past the byte that caused the
+                // quit state, so report the position just before it.
+                let offset = at - 1;
+                return Err(MatchError::Quit { byte: bytes[offset], offset });
+            } else {
+                debug_assert!(sid.is_unknown());
+                unreachable!("sid being unknown is a bug");
+            }
+        }
+    }
+    // We are careful to use 'haystack' here, which contains the full context
+    // that we might want to inspect.
+    Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
+}
+
+/// Reverse search that stops at the first match state it enters.
+#[inline(never)]
+pub(crate) fn find_earliest_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    let earliest = true;
+    find_rev(earliest, dfa, cache, pattern_id, bytes, start, end)
+}
+
+/// Reverse search that keeps going after a match state is entered and
+/// reports the last match seen.
+#[inline(never)]
+pub(crate) fn find_leftmost_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    let earliest = false;
+    find_rev(earliest, dfa, cache, pattern_id, bytes, start, end)
+}
+
+/// Runs a reverse search over `haystack[start..end]` using the lazy DFA,
+/// scanning from `end` down towards `start`.
+///
+/// When `earliest` is true, the search returns as soon as a match state is
+/// entered. Otherwise it keeps scanning and returns the last match recorded
+/// once a dead state is entered or `start` is reached. After the main loop,
+/// `eoi_rev` performs one final transition (on the byte just before `start`
+/// if there is one, or on end-of-input otherwise) and its match, if any,
+/// takes precedence over the last match recorded by the loop.
+///
+/// All offsets in the returned match and in any returned error are offsets
+/// into `haystack`, not into the `start..` sub-slice used internally.
+///
+/// # Errors
+///
+/// Returns a "gave up" error when computing a transition fails, and a "quit"
+/// error when a quit state is entered before any match has been recorded.
+///
+/// # Panics
+///
+/// Panics if `start > end` or if either bound exceeds `haystack.len()`.
+#[inline(always)]
+fn find_rev(
+    earliest: bool,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    haystack: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= haystack.len());
+    assert!(end <= haystack.len());
+
+    // Why do this? Slicing at 'start' makes 'at' below an index relative to
+    // 'start', so the loop can simply count down to zero. Because of this,
+    // every offset reported to the caller must have 'start' added back.
+    // Why not hand the slice to 'eoi_rev' as well? Because 'eoi_rev' (below)
+    // might want to access the byte at 'start - 1' for resolving look-around,
+    // so it gets the full 'haystack'.
+    let bytes = &haystack[start..];
+
+    let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
+    let mut last_match = None;
+    let mut at = end - start;
+    while at > 0 {
+        if sid.is_tagged() {
+            at -= 1;
+            sid = dfa
+                .next_state(cache, sid, bytes[at])
+                .map_err(|_| gave_up(start + at))?;
+        } else {
+            // SAFETY: See comments in 'find_fwd' for both a safety argument
+            // and a justification from a performance perspective as to 1) why
+            // we elide bounds checks and 2) why we do a specialized version of
+            // unrolling below.
+            let mut prev_sid = sid;
+            while at > 0 && !sid.is_tagged() {
+                prev_sid = sid;
+                at -= 1;
+                while at > 3 {
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                }
+                sid = unsafe {
+                    dfa.next_state_untagged_unchecked(
+                        cache,
+                        sid,
+                        *bytes.get_unchecked(at),
+                    )
+                };
+            }
+            // The unchecked transition above can yield an unknown state. In
+            // that case, redo the last transition from the previous state
+            // with the checked 'next_state', which can report failure.
+            if sid.is_unknown() {
+                sid = dfa
+                    .next_state(cache, prev_sid, bytes[at])
+                    .map_err(|_| gave_up(start + at))?;
+            }
+        }
+        if sid.is_tagged() {
+            if sid.is_start() {
+                continue;
+            } else if sid.is_match() {
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset: start + at + MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+            } else if sid.is_dead() {
+                return Ok(last_match);
+            } else {
+                debug_assert!(sid.is_quit());
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                // 'at' indexes 'bytes', which begins at 'start', so convert
+                // it back into a haystack offset for the error.
+                return Err(MatchError::Quit {
+                    byte: bytes[at],
+                    offset: start + at,
+                });
+            }
+        }
+    }
+    Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
+}
+
+/// Overlapping forward search entry point.
+///
+/// The prefilter is only forwarded when no pattern ID is given, because a
+/// search for a specific pattern is always anchored.
+#[inline(never)]
+pub(crate) fn find_overlapping_fwd(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+    caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Two separate call sites are kept on purpose: the implementation is
+    // inlined into each of them, one with a prefilter and one without.
+    match (pre, pattern_id) {
+        (Some(scanner), None) => find_overlapping_fwd_imp(
+            Some(scanner),
+            dfa,
+            cache,
+            pattern_id,
+            bytes,
+            start,
+            end,
+            caller_state,
+        ),
+        _ => find_overlapping_fwd_imp(
+            None,
+            dfa,
+            cache,
+            pattern_id,
+            bytes,
+            start,
+            end,
+            caller_state,
+        ),
+    }
+}
+
+/// Implementation of the overlapping forward search.
+///
+/// `caller_state` carries the search position between calls:
+///
+/// * When it holds no state ID, the search begins from the start state
+///   computed by `init_fwd`.
+/// * When it holds a state ID and a previous match whose `match_index` is
+///   still below that state's match count, the next pattern matching at the
+///   same offset is returned immediately and the index is bumped.
+/// * Otherwise the search resumes from the saved state, `MATCH_OFFSET` bytes
+///   past `start`.
+///
+/// Whenever a tagged state is entered, and again after the end-of-input
+/// transition, the current state ID is stored back into `caller_state`.
+///
+/// # Errors
+///
+/// Returns a "gave up" error when computing a transition fails, and a "quit"
+/// error when a quit state is entered.
+///
+/// # Panics
+///
+/// Panics if `start > end` or if either bound exceeds `bytes.len()`.
+#[inline(always)]
+fn find_overlapping_fwd_imp(
+    mut pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    mut start: usize,
+    end: usize,
+    caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    let mut sid = match caller_state.id() {
+        None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
+        Some(sid) => {
+            // If the saved state still has unreported patterns at the last
+            // match offset, report the next one without scanning further.
+            if let Some(last) = caller_state.last_match() {
+                let match_count = dfa.match_count(cache, sid);
+                if last.match_index < match_count {
+                    let m = HalfMatch {
+                        pattern: dfa.match_pattern(
+                            cache,
+                            sid,
+                            last.match_index,
+                        ),
+                        offset: last.offset,
+                    };
+                    last.match_index += 1;
+                    return Ok(Some(m));
+                }
+            }
+
+            // This is a subtle but critical detail. If the caller provides a
+            // non-None state ID, then it must be the case that the state ID
+            // corresponds to one set by this function. The state ID therefore
+            // corresponds to a match state, a dead state or some other state.
+            // However, "some other" state _only_ occurs when the input has
+            // been exhausted because the only way to stop before then is to
+            // see a match or a dead/quit state.
+            //
+            // If the input is exhausted or if it's a dead state, then
+            // incrementing the starting position has no relevance on
+            // correctness, since the loop below will either not execute
+            // at all or will immediately stop due to being in a dead state.
+            // (Once in a dead state it is impossible to leave it.)
+            //
+            // Therefore, the only case we need to consider is when
+            // caller_state is a match state. In this case, since our machines
+            // support the ability to delay a match by a certain number of
+            // bytes (to support look-around), it follows that we actually
+            // consumed that many additional bytes on our previous search. When
+            // the caller resumes their search to find subsequent matches, they
+            // will use the ending location from the previous match as the next
+            // starting point, which is `match_offset` bytes PRIOR to where
+            // we scanned to on the previous search. Therefore, we need to
+            // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
+            //
+            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
+            // dealing with empty matches convenient. Namely, callers needn't
+            // special case them when implementing an iterator. Instead, this
+            // ensures that forward progress is always made.
+            start += MATCH_OFFSET;
+            sid
+        }
+    };
+
+    let mut at = start;
+    while at < end {
+        let byte = bytes[at];
+        sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
+        at += 1;
+        if sid.is_tagged() {
+            // Save the state so that a later call can resume from it.
+            caller_state.set_id(sid);
+            if sid.is_start() {
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                }
+            } else if sid.is_match() {
+                let offset = at - MATCH_OFFSET;
+                // Index '0' is reported now, so the next call should report
+                // index '1' (if the state has that many matches).
+                caller_state
+                    .set_last_match(StateMatch { match_index: 1, offset });
+                return Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset,
+                }));
+            } else if sid.is_dead() {
+                return Ok(None);
+            } else {
+                debug_assert!(sid.is_quit());
+                // 'at' was already advanced past 'byte', hence 'at - 1'.
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            }
+        }
+    }
+
+    let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
+    caller_state.set_id(sid);
+    if let Ok(Some(ref last_match)) = result {
+        caller_state.set_last_match(StateMatch {
+            // '1' is always correct here since if we get to this point, this
+            // always corresponds to the first (index '0') match discovered at
+            // this position. So the next match to report at this position (if
+            // it exists) is at index '1'.
+            match_index: 1,
+            offset: last_match.offset(),
+        });
+    }
+    result
+}
+
+/// Computes the state a forward search begins in.
+///
+/// A failure to compute the start state is reported as a "gave up" error at
+/// `start`.
+#[inline(always)]
+fn init_fwd(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<LazyStateID, MatchError> {
+    match dfa.start_state_forward(cache, pattern_id, bytes, start, end) {
+        Err(_) => Err(gave_up(start)),
+        Ok(sid) => {
+            // Since every match is delayed by 1 byte, a start state can
+            // never itself be a match state.
+            assert!(!sid.is_match());
+            Ok(sid)
+        }
+    }
+}
+
+/// Computes the state a reverse search begins in.
+///
+/// A failure to compute the start state is reported as a "gave up" error at
+/// `end`.
+#[inline(always)]
+fn init_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<LazyStateID, MatchError> {
+    match dfa.start_state_reverse(cache, pattern_id, bytes, start, end) {
+        Err(_) => Err(gave_up(end)),
+        Ok(sid) => {
+            // Since every match is delayed by 1 byte, a start state can
+            // never itself be a match state.
+            assert!(!sid.is_match());
+            Ok(sid)
+        }
+    }
+}
+
+/// Performs the final transition of a forward search.
+///
+/// If `bytes` has a byte at index `end`, the transition is taken on that
+/// byte and a resulting match is reported at `end`. Otherwise the
+/// end-of-input transition is taken and a resulting match is reported at
+/// `bytes.len()`. On success, `sid` is updated to the state reached; on
+/// failure it is left untouched and a "gave up" error is returned.
+#[inline(always)]
+fn eoi_fwd(
+    dfa: &DFA,
+    cache: &mut Cache,
+    bytes: &[u8],
+    end: usize,
+    sid: &mut LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    let (next, offset) = match bytes.get(end) {
+        Some(&b) => {
+            let next =
+                dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
+            (next, end)
+        }
+        None => {
+            let next = dfa
+                .next_eoi_state(cache, *sid)
+                .map_err(|_| gave_up(bytes.len()))?;
+            (next, bytes.len())
+        }
+    };
+    *sid = next;
+    if !sid.is_match() {
+        return Ok(None);
+    }
+    let pattern = dfa.match_pattern(cache, *sid, 0);
+    Ok(Some(HalfMatch { pattern, offset }))
+}
+
+/// Performs the final transition of a reverse search.
+///
+/// If `start` is greater than zero, the transition is taken on the byte at
+/// `start - 1` and a resulting match is reported at `start`. Otherwise the
+/// end-of-input transition is taken and a resulting match is reported at
+/// offset `0`. A failed transition is reported as a "gave up" error at
+/// `start`.
+#[inline(always)]
+fn eoi_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    bytes: &[u8],
+    start: usize,
+    state: LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    let (sid, offset) = if start > 0 {
+        let sid = dfa
+            .next_state(cache, state, bytes[start - 1])
+            .map_err(|_| gave_up(start))?;
+        (sid, start)
+    } else {
+        let sid =
+            dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?;
+        (sid, 0)
+    };
+    if !sid.is_match() {
+        return Ok(None);
+    }
+    let pattern = dfa.match_pattern(cache, sid, 0);
+    Ok(Some(HalfMatch { pattern, offset }))
+}
+
+/// A convenience routine for constructing a "gave up" match error.
+///
+/// `offset` is stored verbatim in the returned `MatchError::GaveUp`. The
+/// search routines in this module use it as the closure body of `map_err`
+/// when a state transition cannot be computed.
+#[inline(always)]
+fn gave_up(offset: usize) -> MatchError {
+    MatchError::GaveUp { offset }
+}
diff --git a/src/lib.rs b/src/lib.rs

new file mode 100644 (file)

index 0000000..d9d7ada
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,47 @@
+/*!
+This crate provides an "expert" API for executing regular expressions using
+finite automata.
+
+**WARNING**: This `0.2` release of `regex-automata` was published
+before it was ready, in order to unblock work elsewhere that needed
+some of the new APIs in this release. At the time of writing, it is
+strongly preferred that you continue using the
+[`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/)
+release. Since this release represents an unfinished state, please do not
+create issues for this release unless it's for a critical bug.
+*/
+
+#![allow(warnings)]
+// #![deny(missing_docs)]
+#![cfg_attr(not(feature = "std"), no_std)]
+
+// Refuse to build on any target whose pointer width is not 16, 32 or 64
+// bits, producing a readable error up front.
+#[cfg(not(any(
+    target_pointer_width = "16",
+    target_pointer_width = "32",
+    target_pointer_width = "64"
+)))]
+compile_error!("regex-automata currently not supported on non-{16,32,64}");
+
+#[cfg(feature = "alloc")]
+extern crate alloc;
+
+#[doc(inline)]
+pub use crate::util::id::PatternID;
+#[cfg(feature = "alloc")]
+pub use crate::util::syntax::SyntaxConfig;
+pub use crate::util::{
+    bytes::{DeserializeError, SerializeError},
+    matchtypes::{HalfMatch, Match, MatchError, MatchKind, MultiMatch},
+};
+
+#[macro_use]
+mod macros;
+
+pub mod dfa;
+#[cfg(feature = "alloc")]
+pub mod hybrid;
+#[doc(hidden)]
+#[cfg(feature = "alloc")]
+pub mod nfa;
+#[doc(hidden)]
+pub mod util;
diff --git a/src/macros.rs b/src/macros.rs

new file mode 100644 (file)

index 0000000..649ba17
--- /dev/null
+++ b/src/macros.rs
@@ -0,0 +1,30 @@
+/// Generates a getter/setter pair for one bit of a `bools` bitfield.
+///
+/// `$bit` is the zero-based bit position inside `self.bools`. The method
+/// named `$is_fn_name` reports whether that bit is set, and the method named
+/// `$set_fn_name` sets or clears it. The macro must be invoked inside an
+/// `impl` block of a type that has a `bools` field.
+#[cfg(feature = "alloc")]
+macro_rules! define_bool {
+    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
+        fn $is_fn_name(&self) -> bool {
+            (self.bools & (1 << $bit)) > 0
+        }
+
+        fn $set_fn_name(&mut self, yes: bool) {
+            match yes {
+                true => self.bools |= 1 << $bit,
+                false => self.bools &= !(1 << $bit),
+            }
+        }
+    };
+}
+
+// Expands to the given tokens wrapped in a block that is only compiled when
+// the 'logging' feature is enabled. Without that feature, the tokens are
+// removed by the 'cfg' attribute.
+macro_rules! log {
+    ($($tt:tt)*) => {
+        #[cfg(feature = "logging")]
+        {
+            $($tt)*
+        }
+    }
+}
+
+// Forwards its arguments to 'log::trace!' through the 'log!' macro above, so
+// the call is only compiled when the 'logging' feature is enabled.
+macro_rules! trace {
+    ($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
+}
diff --git a/src/nfa/mod.rs b/src/nfa/mod.rs

new file mode 100644 (file)

index 0000000..61ce5ef
--- /dev/null
+++ b/src/nfa/mod.rs
@@ -0,0 +1 @@
+pub mod thompson;
diff --git a/src/nfa/thompson/compiler.rs b/src/nfa/thompson/compiler.rs

new file mode 100644 (file)

index 0000000..3011940
--- /dev/null
+++ b/src/nfa/thompson/compiler.rs
@@ -0,0 +1,1713 @@
+/*
+This module provides an NFA compiler using Thompson's construction
+algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
+graph as output. The NFA graph is structured in a way that permits it to be
+executed by a virtual machine and also used to efficiently build a DFA.
+
+The compiler deals with a slightly expanded set of NFA states that notably
+includes an empty node that has exactly one epsilon transition to the next
+state. In other words, it's a "goto" instruction if one views Thompson's NFA
+as a set of bytecode instructions. These goto instructions are removed in
+a subsequent phase before returning the NFA to the caller. The purpose of
+these empty nodes is that they make the construction algorithm substantially
+simpler to implement. We remove them before returning to the caller because
+they can represent substantial overhead when traversing the NFA graph
+(either while searching using the NFA directly or while building a DFA).
+
+In the future, it would be nice to provide a Glushkov compiler as well,
+as it would work well as a bit-parallel NFA for smaller regexes. But
+the Thompson construction is one I'm more familiar with and seems more
+straight-forward to deal with when it comes to large Unicode character
+classes.
+
+Internally, the compiler uses interior mutability to improve composition
+in the face of the borrow checker. In particular, we'd really like to be
+able to write things like this:
+
+    self.c_concat(exprs.iter().map(|e| self.c(e)))
+
+Which elegantly uses iterators to build up a sequence of compiled regex
+sub-expressions and then hands it off to the concatenating compiler
+routine. Without interior mutability, the borrow checker won't let us
+borrow `self` mutably both inside and outside the closure at the same
+time.
+*/
+
+use core::{
+    borrow::Borrow,
+    cell::{Cell, RefCell},
+    mem,
+};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use regex_syntax::{
+    hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
+    utf8::{Utf8Range, Utf8Sequences},
+    ParserBuilder,
+};
+
+use crate::{
+    nfa::thompson::{
+        error::Error,
+        map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
+        range_trie::RangeTrie,
+        Look, SparseTransitions, State, Transition, NFA,
+    },
+    util::{
+        alphabet::ByteClassSet,
+        id::{IteratorIDExt, PatternID, StateID},
+    },
+};
+
+/// The configuration used for compiling a Thompson NFA from a regex pattern.
+///
+/// Each option is stored as an `Option`, where `None` means the option was
+/// never set explicitly. The `get_*` accessors substitute the default value
+/// for unset options, and `overwrite` relies on the distinction to merge two
+/// configurations.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    /// Whether to compile the NFA in reverse. Defaults to `false`.
+    reverse: Option<bool>,
+    /// Whether UTF-8 mode is enabled. Defaults to `true`.
+    utf8: Option<bool>,
+    /// The heap size limit for the NFA. The outer `Option` records whether
+    /// a limit was configured at all, while the inner one is the limit
+    /// itself, where `None` means no limit. Defaults to no limit.
+    nfa_size_limit: Option<Option<usize>>,
+    /// Whether to apply heuristics that shrink the NFA. Defaults to `true`.
+    shrink: Option<bool>,
+    /// Whether to include capture states in the NFA. Defaults to `true`.
+    captures: Option<bool>,
+    /// Whether to compile an unanchored prefix. Defaults to `true`. This
+    /// field only exists in test builds.
+    #[cfg(test)]
+    unanchored_prefix: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default Thompson NFA compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Reverse the NFA.
+    ///
+    /// A NFA reversal is performed by reversing all of the concatenated
+    /// sub-expressions in the original pattern, recursively. The resulting
+    /// NFA can be used to match the pattern starting from the end of a string
+    /// instead of the beginning of a string.
+    ///
+    /// Reversing the NFA is useful for building a reverse DFA, which is most
+    /// useful for finding the start of a match after its ending position has
+    /// been found.
+    ///
+    /// This is disabled by default.
+    pub fn reverse(mut self, yes: bool) -> Config {
+        self.reverse = Some(yes);
+        self
+    }
+
+    /// Whether to enable UTF-8 mode or not.
+    ///
+    /// When UTF-8 mode is enabled (which is the default), unanchored searches
+    /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
+    /// an unanchored search will stop at that point. This is equivalent to
+    /// putting a `(?s:.)*?` at the start of the regex.
+    ///
+    /// When UTF-8 mode is disabled, then unanchored searches will match
+    /// through any arbitrary byte. This is equivalent to putting a
+    /// `(?s-u:.)*?` at the start of the regex.
+    ///
+    /// Generally speaking, UTF-8 mode should only be used when you know you
+    /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used
+    /// on input that is not valid UTF-8, then the regex is not likely to work
+    /// as expected.
+    ///
+    /// This is enabled by default.
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    /// Sets an approximate size limit on the total heap used by the NFA being
+    /// compiled.
+    ///
+    /// This permits imposing constraints on the size of a compiled NFA. This
+    /// may be useful in contexts where the regex pattern is untrusted and one
+    /// wants to avoid using too much memory.
+    ///
+    /// This size limit does not apply to auxiliary heap used during
+    /// compilation that is not part of the built NFA.
+    ///
+    /// Note that this size limit is applied during compilation in order for
+    /// the limit to prevent too much heap from being used. However, the
+    /// implementation may use an intermediate NFA representation that is
+    /// otherwise slightly bigger than the final public form. Since the size
+    /// limit may be applied to an intermediate representation, there is not
+    /// necessarily a precise correspondence between the configured size limit
+    /// and the heap usage of the final NFA.
+    ///
+    /// There is no size limit by default.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates how Unicode mode can greatly increase the
+    /// size of the NFA.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// // 300KB isn't enough!
+    /// NFA::builder()
+    ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 400KB probably is.
+    /// let nfa = NFA::builder()
+    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
+    ///     .build(r"\w{20}")?;
+    ///
+    /// assert_eq!(nfa.pattern_len(), 1);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+        self.nfa_size_limit = Some(bytes);
+        self
+    }
+
+    /// Apply best effort heuristics to shrink the NFA at the expense of more
+    /// time/memory.
+    ///
+    /// This is enabled by default. Generally speaking, if one is using an NFA
+    /// to compile a DFA, then the extra time used to shrink the NFA will be
+    /// more than made up for during DFA construction (potentially by a lot).
+    /// In other words, enabling this can substantially decrease the overall
+    /// amount of time it takes to build a DFA.
+    ///
+    /// The only reason to disable this if you want to compile an NFA and start
+    /// using it as quickly as possible without needing to build a DFA. e.g.,
+    /// for an NFA simulation or for a lazy DFA.
+    ///
+    /// This is enabled by default.
+    pub fn shrink(mut self, yes: bool) -> Config {
+        self.shrink = Some(yes);
+        self
+    }
+
+    /// Whether to include 'Capture' states in the NFA.
+    ///
+    /// This can only be enabled when compiling a forward NFA. This is
+    /// always disabled---with no way to override it---when the `reverse`
+    /// configuration is enabled.
+    ///
+    /// This is enabled by default.
+    pub fn captures(mut self, yes: bool) -> Config {
+        self.captures = Some(yes);
+        self
+    }
+
+    /// Whether to compile an unanchored prefix into this NFA.
+    ///
+    /// This is enabled by default. It is made available for tests only to make
+    /// it easier to unit test the output of the compiler.
+    #[cfg(test)]
+    fn unanchored_prefix(mut self, yes: bool) -> Config {
+        self.unanchored_prefix = Some(yes);
+        self
+    }
+
+    pub fn get_reverse(&self) -> bool {
+        self.reverse.unwrap_or(false)
+    }
+
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    pub fn get_nfa_size_limit(&self) -> Option<usize> {
+        self.nfa_size_limit.unwrap_or(None)
+    }
+
+    pub fn get_shrink(&self) -> bool {
+        self.shrink.unwrap_or(true)
+    }
+
+    pub fn get_captures(&self) -> bool {
+        !self.get_reverse() && self.captures.unwrap_or(true)
+    }
+
+    fn get_unanchored_prefix(&self) -> bool {
+        #[cfg(test)]
+        {
+            self.unanchored_prefix.unwrap_or(true)
+        }
+        #[cfg(not(test))]
+        {
+            true
+        }
+    }
+
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            reverse: o.reverse.or(self.reverse),
+            utf8: o.utf8.or(self.utf8),
+            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
+            shrink: o.shrink.or(self.shrink),
+            captures: o.captures.or(self.captures),
+            #[cfg(test)]
+            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
+        }
+    }
+}
+
+/// A builder for compiling an NFA.
+///
+/// The builder holds two independent groups of settings: the NFA
+/// compilation options and the options of the regex parser that is used
+/// when an NFA is built directly from pattern strings.
+#[derive(Clone, Debug)]
+pub struct Builder {
+    /// The NFA compilation options, as accumulated through `configure`.
+    config: Config,
+    /// The parser settings, as applied through `syntax`. `build_many`
+    /// builds a parser from this for every pattern string it is given.
+    parser: ParserBuilder,
+}
+
+impl Builder {
+    /// Create a new NFA builder with its default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), parser: ParserBuilder::new() }
+    }
+
+    /// Compile the given regular expression into an NFA.
+    ///
+    /// If there was a problem parsing the regex, then that error is returned.
+    ///
+    /// Otherwise, if there was a problem building the NFA, then an error is
+    /// returned. The only error that can occur is if the compiled regex would
+    /// exceed the size limits configured on this builder.
+    pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
+        self.build_many(&[pattern])
+    }
+
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<NFA, Error> {
+        let mut hirs = vec![];
+        for p in patterns {
+            hirs.push(
+                self.parser
+                    .build()
+                    .parse(p.as_ref())
+                    .map_err(Error::syntax)?,
+            );
+            log!(log::trace!("parsed: {:?}", p.as_ref()));
+        }
+        self.build_many_from_hir(&hirs)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into an NFA.
+    ///
+    /// If there was a problem building the NFA, then an error is returned. The
+    /// only error that can occur is if the compiled regex would exceed the
+    /// size limits configured on this builder.
+    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
+        self.build_from_hir_with(&mut Compiler::new(), expr)
+    }
+
+    pub fn build_many_from_hir<H: Borrow<Hir>>(
+        &self,
+        exprs: &[H],
+    ) -> Result<NFA, Error> {
+        self.build_many_from_hir_with(&mut Compiler::new(), exprs)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into the NFA given using the given compiler. Callers may
+    /// prefer this over `build` if they would like to reuse allocations while
+    /// compiling many regular expressions.
+    ///
+    /// On success, the given NFA is completely overwritten with the NFA
+    /// produced by the compiler.
+    ///
+    /// If there was a problem building the NFA, then an error is returned.
+    /// The only error that can occur is if the compiled regex would exceed
+    /// the size limits configured on this builder. When an error is returned,
+    /// the contents of `nfa` are unspecified and should not be relied upon.
+    /// However, it can still be reused in subsequent calls to this method.
+    fn build_from_hir_with(
+        &self,
+        compiler: &mut Compiler,
+        expr: &Hir,
+    ) -> Result<NFA, Error> {
+        self.build_many_from_hir_with(compiler, &[expr])
+    }
+
+    fn build_many_from_hir_with<H: Borrow<Hir>>(
+        &self,
+        compiler: &mut Compiler,
+        exprs: &[H],
+    ) -> Result<NFA, Error> {
+        compiler.configure(self.config);
+        compiler.compile(exprs)
+    }
+
+    /// Apply the given NFA configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    ///
+    /// This syntax configuration generally only applies when an NFA is built
+    /// directly from a pattern string. If an NFA is built from an HIR, then
+    /// all syntax settings are ignored.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        config.apply(&mut self.parser);
+        self
+    }
+}
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+///
+/// All of the compilation state is kept behind `RefCell`/`Cell` so that the
+/// compilation routines can take `&self`.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+    /// The configuration from the builder. This is replaced every time the
+    /// compiler is reconfigured.
+    config: Config,
+    /// The final NFA that is built.
+    ///
+    /// Parts of this NFA are constructed during compilation, but the actual
+    /// states aren't added until a final "finish" step. This is because the
+    /// states constructed during compilation have unconditional epsilon
+    /// transitions, which makes the logic of compilation much simpler. The
+    /// "finish" step removes these unconditional epsilon transitions and must
+    /// therefore remap all of the transition state IDs.
+    ///
+    /// A successful compilation moves this NFA out to the caller and leaves
+    /// an empty NFA in its place.
+    nfa: RefCell<NFA>,
+    /// The set of compiled NFA states. Once a state is compiled, it is
+    /// assigned a state ID equivalent to its index in this list. Subsequent
+    /// compilation can modify previous states by adding new transitions.
+    states: RefCell<Vec<CState>>,
+    /// State used for compiling character classes to UTF-8 byte automata.
+    /// State is not retained between character class compilations. This just
+    /// serves to amortize allocation to the extent possible.
+    utf8_state: RefCell<Utf8State>,
+    /// State used for arranging character classes in reverse into a trie.
+    trie_state: RefCell<RangeTrie>,
+    /// State used for caching common suffixes when compiling reverse UTF-8
+    /// automata (for Unicode character classes).
+    utf8_suffix: RefCell<Utf8SuffixMap>,
+    /// A map used to re-map state IDs when translating the compiler's internal
+    /// NFA state representation to the external NFA representation. It is
+    /// indexed by the compiler's internal state ID.
+    remap: RefCell<Vec<StateID>>,
+    /// A set of compiler internal state IDs that correspond to states that are
+    /// exclusively epsilon transitions, i.e., goto instructions, combined with
+    /// the state that they point to. This is used to record said states while
+    /// transforming the compiler's internal NFA representation to the external
+    /// form.
+    empties: RefCell<Vec<(StateID, StateID)>>,
+    /// The total memory used by each of the 'CState's in 'states'. This only
+    /// includes heap usage by each state, and not the size of the state
+    /// itself. It is reset to zero when the compiler is reconfigured.
+    memory_cstates: Cell<usize>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted
+/// to `State`s (defined in the parent module), which have a much simpler
+/// representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+    /// An empty state whose only purpose is to forward the automaton to
+    /// another state via an epsilon transition. These are useful during
+    /// compilation but are otherwise removed at the end.
+    Empty {
+        next: StateID,
+    },
+    /// An empty state that records the start of a capture group.
+    ///
+    /// From the perspective of finite automata, this is precisely equivalent
+    /// to 'Empty', but serves the purpose of instructing NFA simulations to
+    /// record additional state when the finite state machine passes through
+    /// this epsilon transition.
+    ///
+    /// These transitions are treated as epsilon transitions with no additional
+    /// effects in DFAs.
+    ///
+    /// `capture_index` is the index of the capture group whose start is
+    /// being recorded, and `name` is the name of that group, if it has one.
+    /// Each capturing group is compiled to a pair of states: a
+    /// `CaptureStart` and a `CaptureEnd`.
+    CaptureStart {
+        next: StateID,
+        capture_index: u32,
+        name: Option<Arc<str>>,
+    },
+    /// An empty state that records the end of a capture group. This is the
+    /// counterpart of `CaptureStart` and is otherwise treated the same way.
+    ///
+    /// `capture_index` is the index of the capture group whose end is being
+    /// recorded.
+    CaptureEnd {
+        next: StateID,
+        capture_index: u32,
+    },
+    /// A state that only transitions to `next` if the current input byte is
+    /// in the range `[start, end]` (inclusive on both ends). The range and
+    /// the `next` state are both described by the `Transition` in `range`.
+    Range {
+        range: Transition,
+    },
+    /// A state with possibly many transitions, represented in a sparse
+    /// fashion. Transitions are ordered lexicographically by input range.
+    /// As such, this may only be used when every transition has equal
+    /// priority. (In practice, this is only used for encoding large UTF-8
+    /// automata.) In contrast, a `Union` state has each alternate in order
+    /// of priority. Priority is used to implement greedy matching and also
+    /// alternations themselves, e.g., `abc|a` where `abc` has priority over
+    /// `a`.
+    ///
+    /// To clarify, it is possible to remove `Sparse` and represent all things
+    /// that `Sparse` is used for via `Union`. But this creates a more bloated
+    /// NFA with more epsilon transitions than is necessary in the special case
+    /// of character classes.
+    Sparse {
+        ranges: Vec<Transition>,
+    },
+    /// A conditional epsilon transition satisfied via some sort of
+    /// look-around.
+    Look {
+        look: Look,
+        next: StateID,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union {
+        alternates: Vec<StateID>,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via later transitions are
+    /// preferred over earlier transitions.
+    ///
+    /// This "reverse" state exists for convenience during compilation that
+    /// permits easy construction of non-greedy combinations of NFA states. At
+    /// the end of compilation, Union and UnionReverse states are merged into
+    /// one Union type of state, where the latter has its epsilon transitions
+    /// reversed to reflect the priority inversion.
+    ///
+    /// The "convenience" here arises from the fact that as new states are
+    /// added to the list of `alternates`, we would like that add operation
+    /// to be amortized constant time. But if we used a `Union`, we'd need to
+    /// prepend the state, which takes O(n) time. There are other approaches we
+    /// could use to solve this, but this seems simple enough.
+    UnionReverse {
+        alternates: Vec<StateID>,
+    },
+    /// A match state. There is at most one such occurrence of this state in
+    /// an NFA for each pattern compiled into the NFA. At time of writing, a
+    /// match state is always produced for every pattern given, but in theory,
+    /// if a pattern can never lead to a match, then the match state could be
+    /// omitted.
+    ///
+    /// `pattern_id` refers to the ID of the pattern itself, which corresponds
+    /// to the pattern's index (starting at 0). `start_id` refers to the
+    /// anchored NFA starting state corresponding to this pattern.
+    Match {
+        pattern_id: PatternID,
+        start_id: StateID,
+    },
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+    /// The ID of the state through which this sub-graph is entered.
+    start: StateID,
+    /// The ID of the state through which this sub-graph is left. The
+    /// compiler joins sub-graphs by patching this state to point at
+    /// whatever comes next.
+    end: StateID,
+}
+
+impl Compiler {
+    /// Create a new compiler.
+    pub fn new() -> Compiler {
+        Compiler {
+            config: Config::default(),
+            nfa: RefCell::new(NFA::empty()),
+            states: RefCell::new(vec![]),
+            utf8_state: RefCell::new(Utf8State::new()),
+            trie_state: RefCell::new(RangeTrie::new()),
+            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+            remap: RefCell::new(vec![]),
+            empties: RefCell::new(vec![]),
+            memory_cstates: Cell::new(0),
+        }
+    }
+
+    /// Configure and prepare this compiler from the builder's knobs.
+    ///
+    /// The compiler is must always reconfigured by the builder before using it
+    /// to build an NFA. Namely, this will also clear any latent state in the
+    /// compiler used during previous compilations.
+    fn configure(&mut self, config: Config) {
+        self.config = config;
+        self.nfa.borrow_mut().clear();
+        self.states.borrow_mut().clear();
+        self.memory_cstates.set(0);
+        // We don't need to clear anything else since they are cleared on
+        // their own and only when they are used.
+    }
+
+    /// Compile the given HIR expressions into a single NFA, where each
+    /// expression becomes one pattern of the NFA.
+    ///
+    /// An empty slice produces an NFA that never matches. An error is
+    /// returned when more than `PatternID::LIMIT` expressions are given, or
+    /// when any step of the compilation fails.
+    ///
+    /// On success, the finished NFA is moved out of this compiler and an
+    /// empty NFA is left in its place.
+    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> {
+        if exprs.is_empty() {
+            return Ok(NFA::never_match());
+        }
+        if exprs.len() > PatternID::LIMIT {
+            return Err(Error::too_many_patterns(exprs.len()));
+        }
+
+        // We always add an unanchored prefix unless we were specifically told
+        // not to (for tests only), or if we know that the regex is anchored
+        // for all matches. When an unanchored prefix is not added, then the
+        // NFA's anchored and unanchored start states are equivalent.
+        let all_anchored =
+            exprs.iter().all(|e| e.borrow().is_anchored_start());
+        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
+        let unanchored_prefix = if anchored {
+            self.c_empty()?
+        } else {
+            if self.config.get_utf8() {
+                self.c_unanchored_prefix_valid_utf8()?
+            } else {
+                self.c_unanchored_prefix_invalid_utf8()?
+            }
+        };
+
+        // Every pattern is compiled as if it were wrapped in a capture group
+        // with index 0 and is terminated by its own match state. The
+        // patterns are then combined into one alternation.
+        let compiled = self.c_alternation(
+            exprs.iter().with_pattern_ids().map(|(pid, e)| {
+                let group_kind = hir::GroupKind::CaptureIndex(0);
+                let one = self.c_group(&group_kind, e.borrow())?;
+                let match_state_id = self.add_match(pid, one.start)?;
+                self.patch(one.end, match_state_id)?;
+                Ok(ThompsonRef { start: one.start, end: match_state_id })
+            }),
+        )?;
+        // The prefix (which is a lone empty state in the anchored case) leads
+        // into the alternation. The anchored start state skips the prefix,
+        // while the unanchored start state begins at it.
+        self.patch(unanchored_prefix.end, compiled.start)?;
+        self.finish(compiled.start, unanchored_prefix.start)?;
+        Ok(self.nfa.replace(NFA::empty()))
+    }
+
+    /// Finishes the compilation process and populates the NFA attached to this
+    /// compiler with the final graph.
+    ///
+    /// `start_anchored` and `start_unanchored` are the anchored and
+    /// unanchored start states, given as IDs of the compiler's intermediate
+    /// states.
+    ///
+    /// An error is returned if adding a state to the NFA, or finishing a
+    /// pattern, fails.
+    fn finish(
+        &self,
+        start_anchored: StateID,
+        start_unanchored: StateID,
+    ) -> Result<(), Error> {
+        trace!(
+            "intermediate NFA compilation complete, \
+             intermediate NFA size: {} states, {} bytes on heap",
+            self.states.borrow().len(),
+            self.nfa_memory_usage(),
+        );
+        let mut nfa = self.nfa.borrow_mut();
+        let mut bstates = self.states.borrow_mut();
+        let mut remap = self.remap.borrow_mut();
+        let mut empties = self.empties.borrow_mut();
+        // `remap` has one entry per intermediate state. Every entry is
+        // assigned below, either in the conversion loop or when the empty
+        // states are resolved.
+        remap.resize(bstates.len(), StateID::ZERO);
+        empties.clear();
+
+        // The idea here is to convert our intermediate states to their final
+        // form. The only real complexity here is the process of converting
+        // transitions, which are expressed in terms of state IDs. The new
+        // set of states will be smaller because of partial epsilon removal,
+        // so the state IDs will not be the same.
+        //
+        // Note that the states are added to the NFA with their transitions
+        // still expressed as intermediate state IDs.
+        for (sid, bstate) in bstates.iter_mut().with_state_ids() {
+            match *bstate {
+                CState::Empty { next } => {
+                    // Since we're removing empty states, we need to handle
+                    // them later since we don't yet know which new state this
+                    // empty state will be mapped to.
+                    empties.push((sid, next));
+                }
+                CState::CaptureStart { next, capture_index, ref name } => {
+                    // We can't remove this empty state because of the side
+                    // effect of capturing an offset for this capture slot.
+                    remap[sid] = nfa.add_capture_start(
+                        next,
+                        capture_index,
+                        name.clone(),
+                    )?;
+                }
+                CState::CaptureEnd { next, capture_index } => {
+                    // We can't remove this empty state because of the side
+                    // effect of capturing an offset for this capture slot.
+                    remap[sid] = nfa.add_capture_end(next, capture_index)?;
+                }
+                CState::Range { range } => {
+                    remap[sid] = nfa.add_range(range)?;
+                }
+                CState::Sparse { ref mut ranges } => {
+                    // Move the transitions out of the intermediate state,
+                    // leaving an empty list behind.
+                    let ranges =
+                        mem::replace(ranges, vec![]).into_boxed_slice();
+                    remap[sid] =
+                        nfa.add_sparse(SparseTransitions { ranges })?;
+                }
+                CState::Look { look, next } => {
+                    remap[sid] = nfa.add_look(next, look)?;
+                }
+                CState::Union { ref mut alternates } => {
+                    let alternates =
+                        mem::replace(alternates, vec![]).into_boxed_slice();
+                    remap[sid] = nfa.add_union(alternates)?;
+                }
+                CState::UnionReverse { ref mut alternates } => {
+                    // The alternates are stored with the preferred
+                    // transitions last, so they are reversed in order to
+                    // turn this into an ordinary union.
+                    let mut alternates =
+                        mem::replace(alternates, vec![]).into_boxed_slice();
+                    alternates.reverse();
+                    remap[sid] = nfa.add_union(alternates)?;
+                }
+                CState::Match { start_id, .. } => {
+                    remap[sid] = nfa.add_match()?;
+                    nfa.finish_pattern(start_id)?;
+                }
+            }
+        }
+        for &(empty_id, mut empty_next) in empties.iter() {
+            // empty states can point to other empty states, forming a chain.
+            // So we must follow the chain until the end, which must end at
+            // a non-empty state, and therefore, a state that is correctly
+            // remapped. We are guaranteed to terminate because our compiler
+            // never builds a loop among only empty states.
+            while let CState::Empty { next } = bstates[empty_next] {
+                empty_next = next;
+            }
+            remap[empty_id] = remap[empty_next];
+        }
+        // The start states are set using intermediate state IDs, before the
+        // remapping is applied to the NFA.
+        nfa.set_start_anchored(start_anchored);
+        nfa.set_start_unanchored(start_unanchored);
+        nfa.remap(&remap);
+        trace!(
+            "final NFA (reverse? {:?}) compilation complete, \
+             final NFA size: {} states, {} bytes on heap",
+            self.config.get_reverse(),
+            nfa.states().len(),
+            nfa.memory_usage(),
+        );
+        Ok(())
+    }
+
+    fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> {
+        match *expr.kind() {
+            HirKind::Empty => self.c_empty(),
+            HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch),
+            HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b),
+            HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c),
+            HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
+            HirKind::Anchor(ref anchor) => self.c_anchor(anchor),
+            HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb),
+            HirKind::Repetition(ref rep) => self.c_repetition(rep),
+            HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir),
+            HirKind::Concat(ref es) => {
+                self.c_concat(es.iter().map(|e| self.c(e)))
+            }
+            HirKind::Alternation(ref es) => {
+                self.c_alternation(es.iter().map(|e| self.c(e)))
+            }
+        }
+    }
+
+    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+    where
+        I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>,
+    {
+        let first = if self.is_reverse() { it.next_back() } else { it.next() };
+        let ThompsonRef { start, mut end } = match first {
+            Some(result) => result?,
+            None => return self.c_empty(),
+        };
+        loop {
+            let next =
+                if self.is_reverse() { it.next_back() } else { it.next() };
+            let compiled = match next {
+                Some(result) => result?,
+                None => break,
+            };
+            self.patch(end, compiled.start)?;
+            end = compiled.end;
+        }
+        Ok(ThompsonRef { start, end })
+    }
+
+    fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+    where
+        I: Iterator<Item = Result<ThompsonRef, Error>>,
+    {
+        let first = it.next().expect("alternations must be non-empty")?;
+        let second = match it.next() {
+            None => return Ok(first),
+            Some(result) => result?,
+        };
+
+        let union = self.add_union()?;
+        let end = self.add_empty()?;
+        self.patch(union, first.start)?;
+        self.patch(first.end, end)?;
+        self.patch(union, second.start)?;
+        self.patch(second.end, end)?;
+        for result in it {
+            let compiled = result?;
+            self.patch(union, compiled.start)?;
+            self.patch(compiled.end, end)?;
+        }
+        Ok(ThompsonRef { start: union, end })
+    }
+
+    fn c_group(
+        &self,
+        kind: &hir::GroupKind,
+        expr: &Hir,
+    ) -> Result<ThompsonRef, Error> {
+        if !self.config.get_captures() {
+            return self.c(expr);
+        }
+        let (capi, name) = match *kind {
+            hir::GroupKind::NonCapturing => return self.c(expr),
+            hir::GroupKind::CaptureIndex(index) => (index, None),
+            hir::GroupKind::CaptureName { ref name, index } => {
+                (index, Some(Arc::from(&**name)))
+            }
+        };
+
+        let start = self.add_capture_start(capi, name)?;
+        let inner = self.c(expr)?;
+        let end = self.add_capture_end(capi)?;
+
+        self.patch(start, inner.start)?;
+        self.patch(inner.end, end)?;
+        Ok(ThompsonRef { start, end })
+    }
+
+    fn c_repetition(
+        &self,
+        rep: &hir::Repetition,
+    ) -> Result<ThompsonRef, Error> {
+        match rep.kind {
+            hir::RepetitionKind::ZeroOrOne => {
+                self.c_zero_or_one(&rep.hir, rep.greedy)
+            }
+            hir::RepetitionKind::ZeroOrMore => {
+                self.c_at_least(&rep.hir, rep.greedy, 0)
+            }
+            hir::RepetitionKind::OneOrMore => {
+                self.c_at_least(&rep.hir, rep.greedy, 1)
+            }
+            hir::RepetitionKind::Range(ref rng) => match *rng {
+                hir::RepetitionRange::Exactly(count) => {
+                    self.c_exactly(&rep.hir, count)
+                }
+                hir::RepetitionRange::AtLeast(m) => {
+                    self.c_at_least(&rep.hir, rep.greedy, m)
+                }
+                hir::RepetitionRange::Bounded(min, max) => {
+                    self.c_bounded(&rep.hir, rep.greedy, min, max)
+                }
+            },
+        }
+    }
+
+    /// Compile the bounded repetition `expr{min,max}`.
+    ///
+    /// The result is `min` mandatory copies of `expr` (via `c_exactly`)
+    /// followed by `max - min` optional copies. Each optional copy is guarded
+    /// by a union state that can either enter the copy or jump directly to
+    /// the single shared end state. `greedy` selects whether those guards are
+    /// regular or reverse unions.
+    fn c_bounded(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+        max: u32,
+    ) -> Result<ThompsonRef, Error> {
+        let prefix = self.c_exactly(expr, min)?;
+        if min == max {
+            return Ok(prefix);
+        }
+
+        // It is tempting here to compile the rest here as a concatenation
+        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+        // were `aaa?a?a?`. The problem here is that it leads to this program:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: union(03, 04)
+        //      000003: 61 => 04
+        //      000004: union(05, 06)
+        //      000005: 61 => 06
+        //      000006: union(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // And effectively, once you hit state 2, the epsilon closure will
+        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
+        // to instead compile it like so:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: union(03, 08)
+        //      000003: 61 => 04
+        //      000004: union(05, 08)
+        //      000005: 61 => 06
+        //      000006: union(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // So that the epsilon closure of state 2 is now just 3 and 8.
+        let empty = self.add_empty()?;
+        let mut prev_end = prefix.end;
+        for _ in min..max {
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            let compiled = self.c(expr)?;
+            // Chain the previous piece into this guard, then give the guard
+            // its two alternatives: take another copy, or stop here.
+            self.patch(prev_end, union)?;
+            self.patch(union, compiled.start)?;
+            self.patch(union, empty)?;
+            prev_end = compiled.end;
+        }
+        // The final optional copy always flows into the shared end state.
+        self.patch(prev_end, empty)?;
+        Ok(ThompsonRef { start: prefix.start, end: empty })
+    }
+
+    /// Compile the open-ended repetition `expr{n,}`.
+    ///
+    /// Three shapes are produced depending on `n`:
+    ///
+    /// * `n == 0`: a single looping union when `expr` cannot match the empty
+    ///   string, otherwise the `(expr+)?` construction explained below.
+    /// * `n == 1`: `expr` followed by a union that loops back to its start.
+    /// * `n >= 2`: `n - 1` copies via `c_exactly`, then one more copy with a
+    ///   looping union after it.
+    ///
+    /// `greedy` selects whether the unions are regular or reverse unions.
+    fn c_at_least(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        n: u32,
+    ) -> Result<ThompsonRef, Error> {
+        if n == 0 {
+            // When the expression cannot match the empty string, then we
+            // can get away with something much simpler: just one 'alt'
+            // instruction that optionally repeats itself. But if the expr
+            // can match the empty string... see below.
+            if !expr.is_match_empty() {
+                let union = if greedy {
+                    self.add_union()
+                } else {
+                    self.add_reverse_union()
+                }?;
+                let compiled = self.c(expr)?;
+                self.patch(union, compiled.start)?;
+                self.patch(compiled.end, union)?;
+                // The union is both entry and exit; the caller patches the
+                // exit alternative onto it.
+                return Ok(ThompsonRef { start: union, end: union });
+            }
+
+            // What's going on here? Shouldn't x* be simpler than this? It
+            // turns out that when implementing leftmost-first (Perl-like)
+            // match semantics, x* results in an incorrect preference order
+            // when computing the transitive closure of states if and only if
+            // 'x' can match the empty string. So instead, we compile x* as
+            // (x+)?, which preserves the correct preference order.
+            //
+            // See: https://github.com/rust-lang/regex/issues/779
+            let compiled = self.c(expr)?;
+            let plus = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            self.patch(compiled.end, plus)?;
+            self.patch(plus, compiled.start)?;
+
+            let question = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            let empty = self.add_empty()?;
+            self.patch(question, compiled.start)?;
+            self.patch(question, empty)?;
+            self.patch(plus, empty)?;
+            Ok(ThompsonRef { start: question, end: empty })
+        } else if n == 1 {
+            let compiled = self.c(expr)?;
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            self.patch(compiled.end, union)?;
+            self.patch(union, compiled.start)?;
+            Ok(ThompsonRef { start: compiled.start, end: union })
+        } else {
+            // The first n-1 copies are mandatory; only the last one loops.
+            let prefix = self.c_exactly(expr, n - 1)?;
+            let last = self.c(expr)?;
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            self.patch(prefix.end, last.start)?;
+            self.patch(last.end, union)?;
+            self.patch(union, last.start)?;
+            Ok(ThompsonRef { start: prefix.start, end: union })
+        }
+    }
+
+    fn c_zero_or_one(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+    ) -> Result<ThompsonRef, Error> {
+        let union =
+            if greedy { self.add_union() } else { self.add_reverse_union() }?;
+        let compiled = self.c(expr)?;
+        let empty = self.add_empty()?;
+        self.patch(union, compiled.start)?;
+        self.patch(union, empty)?;
+        self.patch(compiled.end, empty)?;
+        Ok(ThompsonRef { start: union, end: empty })
+    }
+
+    fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
+        let it = (0..n).map(|_| self.c(expr));
+        self.c_concat(it)
+    }
+
+    fn c_byte_class(
+        &self,
+        cls: &hir::ClassBytes,
+    ) -> Result<ThompsonRef, Error> {
+        let end = self.add_empty()?;
+        let mut trans = Vec::with_capacity(cls.ranges().len());
+        for r in cls.iter() {
+            trans.push(Transition {
+                start: r.start(),
+                end: r.end(),
+                next: end,
+            });
+        }
+        Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+    }
+
+    /// Compile a Unicode class into a byte-oriented (UTF-8) automaton.
+    ///
+    /// One of three strategies is chosen:
+    ///
+    /// * An all-ASCII class becomes a single sparse state.
+    /// * A reverse NFA either uses the suffix-caching construction (when
+    ///   shrinking is disabled) or sorts the reversed UTF-8 sequences through
+    ///   the range trie before feeding them to `Utf8Compiler`.
+    /// * A forward NFA streams the UTF-8 sequences directly into
+    ///   `Utf8Compiler`.
+    fn c_unicode_class(
+        &self,
+        cls: &hir::ClassUnicode,
+    ) -> Result<ThompsonRef, Error> {
+        // If all we have are ASCII ranges wrapped in a Unicode package, then
+        // there is zero reason to bring out the big guns. We can fit all ASCII
+        // ranges within a single sparse state.
+        if cls.is_all_ascii() {
+            let end = self.add_empty()?;
+            let mut trans = Vec::with_capacity(cls.ranges().len());
+            for r in cls.iter() {
+                // Guard the `as u8` casts below against truncation.
+                assert!(r.start() <= '\x7F');
+                assert!(r.end() <= '\x7F');
+                trans.push(Transition {
+                    start: r.start() as u8,
+                    end: r.end() as u8,
+                    next: end,
+                });
+            }
+            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+        } else if self.is_reverse() {
+            if !self.config.get_shrink() {
+                // When we don't want to spend the extra time shrinking, we
+                // compile the UTF-8 automaton in reverse using something like
+                // the "naive" approach, but will attempt to re-use common
+                // suffixes.
+                self.c_unicode_class_reverse_with_suffix(cls)
+            } else {
+                // When we want to shrink our NFA for reverse UTF-8 automata,
+                // we cannot feed UTF-8 sequences directly to the UTF-8
+                // compiler, since the UTF-8 compiler requires all sequences
+                // to be lexicographically sorted. Instead, we organize our
+                // sequences into a range trie, which can then output our
+                // sequences in the correct order. Unfortunately, building the
+                // range trie is fairly expensive (but not nearly as expensive
+                // as building a DFA). Hence the reason why the 'shrink' option
+                // exists, so that this path can be toggled off. For example,
+                // we might want to turn this off if we know we won't be
+                // compiling a DFA.
+                let mut trie = self.trie_state.borrow_mut();
+                trie.clear();
+
+                for rng in cls.iter() {
+                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
+                        seq.reverse();
+                        trie.insert(seq.as_slice());
+                    }
+                }
+                let mut utf8_state = self.utf8_state.borrow_mut();
+                let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+                trie.iter(|seq| {
+                    utf8c.add(&seq)?;
+                    Ok(())
+                })?;
+                utf8c.finish()
+            }
+        } else {
+            // In the forward direction, we always shrink our UTF-8 automata
+            // because we can stream it right into the UTF-8 compiler. There
+            // is almost no downside (in either memory or time) to using this
+            // approach.
+            let mut utf8_state = self.utf8_state.borrow_mut();
+            let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+            for rng in cls.iter() {
+                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
+                    utf8c.add(seq.as_slice())?;
+                }
+            }
+            utf8c.finish()
+        }
+
+        // For reference, the code below is the "naive" version of compiling a
+        // UTF-8 automaton. It is deliciously simple (and works for both the
+        // forward and reverse cases), but will unfortunately produce very
+        // large NFAs. When compiling a forward automaton, the size difference
+        // can sometimes be an order of magnitude. For example, the '\w' regex
+        // will generate about ~3000 NFA states using the naive approach below,
+        // but only 283 states when using the approach above. This is because
+        // the approach above actually compiles a *minimal* (or near minimal,
+        // because of the bounded hashmap for reusing equivalent states) UTF-8
+        // automaton.
+        //
+        // The code below is kept as a reference point in order to make it
+        // easier to understand the higher level goal here. Although, it will
+        // almost certainly bit-rot, so keep that in mind.
+        /*
+        let it = cls
+            .iter()
+            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
+            .map(|seq| {
+                let it = seq
+                    .as_slice()
+                    .iter()
+                    .map(|rng| self.c_range(rng.start, rng.end));
+                self.c_concat(it)
+            });
+        self.c_alternation(it)
+        */
+    }
+
+    /// Compile a Unicode class for a reverse NFA without shrinking, sharing
+    /// common suffixes between UTF-8 sequences.
+    ///
+    /// Every UTF-8 sequence of every range becomes one alternative of a
+    /// single union state. Each sequence is built by walking its byte ranges
+    /// in order and linking every newly compiled range state to the state
+    /// built just before it (initially the shared end state). A range state
+    /// is looked up in the suffix cache, keyed on the state it leads to and
+    /// its byte range, before a new one is created.
+    fn c_unicode_class_reverse_with_suffix(
+        &self,
+        cls: &hir::ClassUnicode,
+    ) -> Result<ThompsonRef, Error> {
+        // N.B. It would likely be better to cache common *prefixes* in the
+        // reverse direction, but it's not quite clear how to do that. The
+        // advantage of caching suffixes is that it does give us a win, and
+        // has a very small additional overhead.
+        let mut cache = self.utf8_suffix.borrow_mut();
+        cache.clear();
+
+        let union = self.add_union()?;
+        let alt_end = self.add_empty()?;
+        for urng in cls.iter() {
+            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
+                // `end` is the state the next byte range must transition to.
+                let mut end = alt_end;
+                for brng in seq.as_slice() {
+                    let key = Utf8SuffixKey {
+                        from: end,
+                        start: brng.start,
+                        end: brng.end,
+                    };
+                    let hash = cache.hash(&key);
+                    if let Some(id) = cache.get(&key, hash) {
+                        // An identical range state already exists; reuse it.
+                        end = id;
+                        continue;
+                    }
+
+                    let compiled = self.c_range(brng.start, brng.end)?;
+                    self.patch(compiled.end, end)?;
+                    end = compiled.start;
+                    cache.set(key, hash, end);
+                }
+                // The last state built is this sequence's entry point.
+                self.patch(union, end)?;
+            }
+        }
+        Ok(ThompsonRef { start: union, end: alt_end })
+    }
+
+    fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
+        let look = match *anchor {
+            Anchor::StartLine => Look::StartLine,
+            Anchor::EndLine => Look::EndLine,
+            Anchor::StartText => Look::StartText,
+            Anchor::EndText => Look::EndText,
+        };
+        let id = self.add_look(look)?;
+        Ok(ThompsonRef { start: id, end: id })
+    }
+
+    fn c_word_boundary(
+        &self,
+        wb: &WordBoundary,
+    ) -> Result<ThompsonRef, Error> {
+        let look = match *wb {
+            WordBoundary::Unicode => Look::WordBoundaryUnicode,
+            WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
+            WordBoundary::Ascii => Look::WordBoundaryAscii,
+            WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
+        };
+        let id = self.add_look(look)?;
+        Ok(ThompsonRef { start: id, end: id })
+    }
+
+    fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
+        let mut buf = [0; 4];
+        let it = ch
+            .encode_utf8(&mut buf)
+            .as_bytes()
+            .iter()
+            .map(|&b| self.c_range(b, b));
+        self.c_concat(it)
+    }
+
+    fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
+        let id = self.add_range(start, end)?;
+        Ok(ThompsonRef { start: id, end: id })
+    }
+
+    fn c_empty(&self) -> Result<ThompsonRef, Error> {
+        let id = self.add_empty()?;
+        Ok(ThompsonRef { start: id, end: id })
+    }
+
+    fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> {
+        self.c_at_least(&Hir::any(false), false, 0)
+    }
+
+    fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> {
+        self.c_at_least(&Hir::any(true), false, 0)
+    }
+
+    fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
+        let old_memory_cstates = self.memory_cstates.get();
+        match self.states.borrow_mut()[from] {
+            CState::Empty { ref mut next } => {
+                *next = to;
+            }
+            CState::Range { ref mut range } => {
+                range.next = to;
+            }
+            CState::Sparse { .. } => {
+                panic!("cannot patch from a sparse NFA state")
+            }
+            CState::Look { ref mut next, .. } => {
+                *next = to;
+            }
+            CState::Union { ref mut alternates } => {
+                alternates.push(to);
+                self.memory_cstates
+                    .set(old_memory_cstates + mem::size_of::<StateID>());
+            }
+            CState::UnionReverse { ref mut alternates } => {
+                alternates.push(to);
+                self.memory_cstates
+                    .set(old_memory_cstates + mem::size_of::<StateID>());
+            }
+            CState::CaptureStart { ref mut next, .. } => {
+                *next = to;
+            }
+            CState::CaptureEnd { ref mut next, .. } => {
+                *next = to;
+            }
+            CState::Match { .. } => {}
+        }
+        if old_memory_cstates != self.memory_cstates.get() {
+            self.check_nfa_size_limit()?;
+        }
+        Ok(())
+    }
+
+    fn add_empty(&self) -> Result<StateID, Error> {
+        self.add_state(CState::Empty { next: StateID::ZERO })
+    }
+
+    fn add_capture_start(
+        &self,
+        capture_index: u32,
+        name: Option<Arc<str>>,
+    ) -> Result<StateID, Error> {
+        self.add_state(CState::CaptureStart {
+            next: StateID::ZERO,
+            capture_index,
+            name,
+        })
+    }
+
+    fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> {
+        self.add_state(CState::CaptureEnd {
+            next: StateID::ZERO,
+            capture_index,
+        })
+    }
+
+    fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> {
+        let trans = Transition { start, end, next: StateID::ZERO };
+        self.add_state(CState::Range { range: trans })
+    }
+
+    fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> {
+        if ranges.len() == 1 {
+            self.add_state(CState::Range { range: ranges[0] })
+        } else {
+            self.add_state(CState::Sparse { ranges })
+        }
+    }
+
+    fn add_look(&self, mut look: Look) -> Result<StateID, Error> {
+        if self.is_reverse() {
+            look = look.reversed();
+        }
+        self.add_state(CState::Look { look, next: StateID::ZERO })
+    }
+
+    fn add_union(&self) -> Result<StateID, Error> {
+        self.add_state(CState::Union { alternates: vec![] })
+    }
+
+    fn add_reverse_union(&self) -> Result<StateID, Error> {
+        self.add_state(CState::UnionReverse { alternates: vec![] })
+    }
+
+    fn add_match(
+        &self,
+        pattern_id: PatternID,
+        start_id: StateID,
+    ) -> Result<StateID, Error> {
+        self.add_state(CState::Match { pattern_id, start_id })
+    }
+
+    fn add_state(&self, state: CState) -> Result<StateID, Error> {
+        let mut states = self.states.borrow_mut();
+        let id = StateID::new(states.len())
+            .map_err(|_| Error::too_many_states(states.len()))?;
+        self.memory_cstates
+            .set(self.memory_cstates.get() + state.memory_usage());
+        states.push(state);
+        // If we don't explicitly drop this, then 'nfa_memory_usage' will also
+        // try to borrow it when we check the size limit and hit an error.
+        drop(states);
+        self.check_nfa_size_limit()?;
+        Ok(id)
+    }
+
+    fn is_reverse(&self) -> bool {
+        self.config.get_reverse()
+    }
+
+    /// If an NFA size limit was set, this checks that the NFA compiled so far
+    /// fits within that limit. If so, then nothing is returned. Otherwise, an
+    /// error is returned.
+    ///
+    /// This should be called after increasing the heap usage of the
+    /// intermediate NFA.
+    ///
+    /// Note that this borrows 'self.states', so callers should ensure there is
+    /// no mutable borrow of it outstanding.
+    fn check_nfa_size_limit(&self) -> Result<(), Error> {
+        if let Some(limit) = self.config.get_nfa_size_limit() {
+            if self.nfa_memory_usage() > limit {
+                return Err(Error::exceeded_size_limit(limit));
+            }
+        }
+        Ok(())
+    }
+
+    /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
+    ///
+    /// Note that this is an approximation of how big the final NFA will be.
+    /// In practice, the final NFA will likely be a bit smaller since it uses
+    /// things like `Box<[T]>` instead of `Vec<T>`.
+    fn nfa_memory_usage(&self) -> usize {
+        self.states.borrow().len() * mem::size_of::<CState>()
+            + self.memory_cstates.get()
+    }
+}
+
+impl CState {
+    fn memory_usage(&self) -> usize {
+        match *self {
+            CState::Empty { .. }
+            | CState::Range { .. }
+            | CState::Look { .. }
+            | CState::CaptureStart { .. }
+            | CState::CaptureEnd { .. }
+            | CState::Match { .. } => 0,
+            CState::Sparse { ref ranges } => {
+                ranges.len() * mem::size_of::<Transition>()
+            }
+            CState::Union { ref alternates } => {
+                alternates.len() * mem::size_of::<StateID>()
+            }
+            CState::UnionReverse { ref alternates } => {
+                alternates.len() * mem::size_of::<StateID>()
+            }
+        }
+    }
+}
+
+/// Incrementally builds the NFA states for a set of UTF-8 byte-range
+/// sequences, reusing previously compiled states where possible.
+#[derive(Debug)]
+struct Utf8Compiler<'a> {
+    /// The NFA compiler that new states are added to.
+    nfac: &'a Compiler,
+    /// Scratch state: the cache of compiled nodes and the stack of nodes
+    /// still under construction.
+    state: &'a mut Utf8State,
+    /// The empty state that the final transition of every sequence leads to;
+    /// it is the end of the fragment returned by `finish`.
+    target: StateID,
+}
+
+/// Reusable scratch space for `Utf8Compiler`.
+#[derive(Clone, Debug)]
+struct Utf8State {
+    /// Cache mapping an already compiled transition list to its state ID.
+    compiled: Utf8BoundedMap,
+    /// Stack of nodes that have not been compiled into NFA states yet.
+    uncompiled: Vec<Utf8Node>,
+}
+
+/// A node under construction in the `Utf8Compiler` stack.
+#[derive(Clone, Debug)]
+struct Utf8Node {
+    /// Transitions whose target state is already known.
+    trans: Vec<Transition>,
+    /// The most recently added byte range, whose target state is not known
+    /// yet; `set_last_transition` moves it into `trans`.
+    last: Option<Utf8LastTransition>,
+}
+
+/// A byte range that has been added to a `Utf8Node` but does not have a
+/// target state yet.
+#[derive(Clone, Debug)]
+struct Utf8LastTransition {
+    /// First byte of the range.
+    start: u8,
+    /// Last byte of the range.
+    end: u8,
+}
+
+impl Utf8State {
+    fn new() -> Utf8State {
+        Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
+    }
+
+    fn clear(&mut self) {
+        self.compiled.clear();
+        self.uncompiled.clear();
+    }
+}
+
+impl<'a> Utf8Compiler<'a> {
+    /// Start a new UTF-8 compilation.
+    ///
+    /// Adds the shared target (end) state to `nfac`, clears `state`, and
+    /// pushes an empty root node onto the uncompiled stack.
+    fn new(
+        nfac: &'a Compiler,
+        state: &'a mut Utf8State,
+    ) -> Result<Utf8Compiler<'a>, Error> {
+        let target = nfac.add_empty()?;
+        state.clear();
+        let mut utf8c = Utf8Compiler { nfac, state, target };
+        utf8c.add_empty();
+        Ok(utf8c)
+    }
+
+    /// Compile every node still on the stack, including the root, and
+    /// return the fragment running from the compiled root to the target.
+    fn finish(&mut self) -> Result<ThompsonRef, Error> {
+        self.compile_from(0)?;
+        let node = self.pop_root();
+        let start = self.compile(node)?;
+        Ok(ThompsonRef { start, end: self.target })
+    }
+
+    /// Add one sequence of byte ranges.
+    ///
+    /// The leading ranges that equal the pending transitions of the nodes
+    /// currently on the stack are shared. Nodes beyond that shared prefix
+    /// are compiled, and the remaining ranges are pushed as new nodes.
+    ///
+    /// Panics if the whole sequence is covered by the shared prefix.
+    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> {
+        let prefix_len = ranges
+            .iter()
+            .zip(&self.state.uncompiled)
+            .take_while(|&(range, node)| {
+                node.last.as_ref().map_or(false, |t| {
+                    (t.start, t.end) == (range.start, range.end)
+                })
+            })
+            .count();
+        assert!(prefix_len < ranges.len());
+        self.compile_from(prefix_len)?;
+        self.add_suffix(&ranges[prefix_len..]);
+        Ok(())
+    }
+
+    /// Compile nodes off the top of the stack until only `from + 1` remain.
+    ///
+    /// Each popped node's pending transition is pointed at the state
+    /// compiled just before it (initially the target). The pending
+    /// transition of the node left on top is then pointed at the last
+    /// compiled state.
+    fn compile_from(&mut self, from: usize) -> Result<(), Error> {
+        let mut next = self.target;
+        while from + 1 < self.state.uncompiled.len() {
+            let node = self.pop_freeze(next);
+            next = self.compile(node)?;
+        }
+        self.top_last_freeze(next);
+        Ok(())
+    }
+
+    /// Turn a finished transition list into an NFA state, returning the
+    /// cached state ID when an identical list was compiled before.
+    fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> {
+        let hash = self.state.compiled.hash(&node);
+        if let Some(id) = self.state.compiled.get(&node, hash) {
+            return Ok(id);
+        }
+        let id = self.nfac.add_sparse(node.clone())?;
+        self.state.compiled.set(node, hash, id);
+        Ok(id)
+    }
+
+    /// Record `ranges` on the stack: the first range becomes the pending
+    /// transition of the current top node, and each further range gets a
+    /// new node of its own.
+    ///
+    /// Panics if `ranges` is empty, the stack is empty, or the top node
+    /// already has a pending transition.
+    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
+        assert!(!ranges.is_empty());
+        let last = self
+            .state
+            .uncompiled
+            .len()
+            .checked_sub(1)
+            .expect("non-empty nodes");
+        assert!(self.state.uncompiled[last].last.is_none());
+        self.state.uncompiled[last].last = Some(Utf8LastTransition {
+            start: ranges[0].start,
+            end: ranges[0].end,
+        });
+        for r in &ranges[1..] {
+            self.state.uncompiled.push(Utf8Node {
+                trans: vec![],
+                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
+            });
+        }
+    }
+
+    /// Push a node with no transitions and no pending transition.
+    fn add_empty(&mut self) {
+        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
+    }
+
+    /// Pop the top node, point its pending transition (if any) at `next`,
+    /// and return its complete transition list.
+    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
+        let mut uncompiled = self.state.uncompiled.pop().unwrap();
+        uncompiled.set_last_transition(next);
+        uncompiled.trans
+    }
+
+    /// Pop the root node and return its transitions.
+    ///
+    /// Panics unless the root is the only node left and it has no pending
+    /// transition.
+    fn pop_root(&mut self) -> Vec<Transition> {
+        assert_eq!(self.state.uncompiled.len(), 1);
+        assert!(self.state.uncompiled[0].last.is_none());
+        self.state.uncompiled.pop().expect("non-empty nodes").trans
+    }
+
+    /// Point the pending transition (if any) of the top node at `next`,
+    /// leaving the node on the stack.
+    fn top_last_freeze(&mut self, next: StateID) {
+        let last = self
+            .state
+            .uncompiled
+            .len()
+            .checked_sub(1)
+            .expect("non-empty nodes");
+        self.state.uncompiled[last].set_last_transition(next);
+    }
+}
+
+impl Utf8Node {
+    fn set_last_transition(&mut self, next: StateID) {
+        if let Some(last) = self.last.take() {
+            self.trans.push(Transition {
+                start: last.start,
+                end: last.end,
+                next,
+            });
+        }
+    }
+}
+
+// Unit tests that compare the exact state list produced by the compiler
+// against hand-written expectations.
+#[cfg(test)]
+mod tests {
+    use alloc::vec::Vec;
+
+    use super::{
+        Builder, Config, PatternID, SparseTransitions, State, StateID,
+        Transition, NFA,
+    };
+
+    // Build an NFA for `pattern` with captures and the unanchored prefix
+    // both disabled. Panics if the build fails.
+    fn build(pattern: &str) -> NFA {
+        Builder::new()
+            .configure(Config::new().captures(false).unanchored_prefix(false))
+            .build(pattern)
+            .unwrap()
+    }
+
+    // Shorthand for constructing a pattern ID; panics if `id` is invalid.
+    fn pid(id: usize) -> PatternID {
+        PatternID::new(id).unwrap()
+    }
+
+    // Shorthand for constructing a state ID; panics if `id` is invalid.
+    fn sid(id: usize) -> StateID {
+        StateID::new(id).unwrap()
+    }
+
+    // A range state matching exactly `byte` and moving to state `next`.
+    fn s_byte(byte: u8, next: usize) -> State {
+        let next = sid(next);
+        let trans = Transition { start: byte, end: byte, next };
+        State::Range { range: trans }
+    }
+
+    // A range state matching `start..=end` and moving to state `next`.
+    fn s_range(start: u8, end: u8, next: usize) -> State {
+        let next = sid(next);
+        let trans = Transition { start, end, next };
+        State::Range { range: trans }
+    }
+
+    // A sparse state built from `(start, end, next)` triples.
+    fn s_sparse(ranges: &[(u8, u8, usize)]) -> State {
+        let ranges = ranges
+            .iter()
+            .map(|&(start, end, next)| Transition {
+                start,
+                end,
+                next: sid(next),
+            })
+            .collect();
+        State::Sparse(SparseTransitions { ranges })
+    }
+
+    // A union state whose alternates are the given state indices, in order.
+    fn s_union(alts: &[usize]) -> State {
+        State::Union {
+            alternates: alts
+                .iter()
+                .map(|&id| sid(id))
+                .collect::<Vec<StateID>>()
+                .into_boxed_slice(),
+        }
+    }
+
+    // A match state for the pattern with index `id`.
+    fn s_match(id: usize) -> State {
+        State::Match { id: pid(id) }
+    }
+
+    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
+    // prefix.
+    #[test]
+    fn compile_unanchored_prefix() {
+        // When the machine can only match valid UTF-8.
+        let nfa = Builder::new()
+            .configure(Config::new().captures(false))
+            .build(r"a")
+            .unwrap();
+        // There should be many states since the `.` in `(?s:.)*?` matches any
+        // Unicode scalar value.
+        assert_eq!(11, nfa.len());
+        assert_eq!(nfa.states[10], s_match(0));
+        assert_eq!(nfa.states[9], s_byte(b'a', 10));
+
+        // When the machine can match through invalid UTF-8.
+        let nfa = Builder::new()
+            .configure(Config::new().captures(false).utf8(false))
+            .build(r"a")
+            .unwrap();
+        assert_eq!(
+            nfa.states,
+            &[
+                s_union(&[2, 1]),
+                s_range(0, 255, 0),
+                s_byte(b'a', 3),
+                s_match(0),
+            ]
+        );
+    }
+
+    // The empty pattern compiles to just a match state.
+    #[test]
+    fn compile_empty() {
+        assert_eq!(build("").states, &[s_match(0),]);
+    }
+
+    // Literals compile to one byte state per encoded byte.
+    #[test]
+    fn compile_literal() {
+        assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]);
+        assert_eq!(
+            build("ab").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
+        );
+        assert_eq!(
+            build("☃").states,
+            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
+        );
+
+        // Check that non-UTF-8 literals work.
+        let nfa = Builder::new()
+            .configure(
+                Config::new()
+                    .captures(false)
+                    .utf8(false)
+                    .unanchored_prefix(false),
+            )
+            .syntax(crate::SyntaxConfig::new().utf8(false))
+            .build(r"(?-u)\xFF")
+            .unwrap();
+        assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
+    }
+
+    // Character classes, from a single ASCII range up to mixed ASCII and
+    // multi-byte UTF-8 ranges.
+    #[test]
+    fn compile_class() {
+        assert_eq!(
+            build(r"[a-z]").states,
+            &[s_range(b'a', b'z', 1), s_match(0),]
+        );
+        assert_eq!(
+            build(r"[x-za-c]").states,
+            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"[\u03B1-\u03B4]").states,
+            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
+        );
+        assert_eq!(
+            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
+            &[
+                s_range(0xB1, 0xB4, 5),
+                s_range(0x99, 0x9E, 5),
+                s_byte(0xA4, 1),
+                s_byte(0x9F, 2),
+                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+                s_match(0),
+            ]
+        );
+        assert_eq!(
+            build(r"[a-z☃]").states,
+            &[
+                s_byte(0x83, 3),
+                s_byte(0x98, 0),
+                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+                s_match(0),
+            ]
+        );
+    }
+
+    // Greedy and non-greedy `?` differ only in the order of the union's
+    // alternates.
+    #[test]
+    fn compile_repetition() {
+        assert_eq!(
+            build(r"a?").states,
+            &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
+        );
+        assert_eq!(
+            build(r"a??").states,
+            &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
+        );
+    }
+
+    // With captures disabled, a group adds no states of its own.
+    #[test]
+    fn compile_group() {
+        assert_eq!(
+            build(r"ab+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"(ab)").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
+        );
+        assert_eq!(
+            build(r"(ab)+").states,
+            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
+        );
+    }
+
+    // Alternations, including ones with an empty branch.
+    #[test]
+    fn compile_alternation() {
+        assert_eq!(
+            build(r"a|b").states,
+            &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"|b").states,
+            &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
+        );
+        assert_eq!(
+            build(r"a|").states,
+            &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
+        );
+    }
+
+    // A multi-pattern NFA has one start state per pattern plus a shared
+    // union start state.
+    #[test]
+    fn many_start_pattern() {
+        let nfa = Builder::new()
+            .configure(Config::new().captures(false).unanchored_prefix(false))
+            .build_many(&["a", "b"])
+            .unwrap();
+        assert_eq!(
+            nfa.states,
+            &[
+                s_byte(b'a', 1),
+                s_match(0),
+                s_byte(b'b', 3),
+                s_match(1),
+                s_union(&[0, 2]),
+            ]
+        );
+        assert_eq!(nfa.start_anchored().as_usize(), 4);
+        assert_eq!(nfa.start_unanchored().as_usize(), 4);
+        // Test that the start states for each individual pattern are correct.
+        assert_eq!(nfa.start_pattern(pid(0)), sid(0));
+        assert_eq!(nfa.start_pattern(pid(1)), sid(2));
+    }
+}
diff --git a/src/nfa/thompson/error.rs b/src/nfa/thompson/error.rs

new file mode 100644 (file)

index 0000000..52f02e8
--- /dev/null
+++ b/src/nfa/thompson/error.rs
@@ -0,0 +1,145 @@
+use crate::util::id::{PatternID, StateID};
+
+/// An error that can occur during the construction of a Thompson NFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`regex_syntax::Error`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building an NFA directly from a pattern string.
+///
+/// Otherwise, errors typically occur when a limit has been breached. For
+/// example, if the total heap usage of the compiled NFA exceeds the limit
+/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
+/// building the NFA will fail.
+#[derive(Clone, Debug)]
+pub struct Error {
+    /// The specific failure. The kind is private, so callers can only
+    /// observe it through the `Display` and `source` implementations.
+    kind: ErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a Thompson NFA.
+#[derive(Clone, Debug)]
+enum ErrorKind {
+    /// An error that occurred while parsing a regular expression. Note that
+    /// this error may be printed over multiple lines, and is generally
+    /// intended to be end user readable on its own.
+    Syntax(regex_syntax::Error),
+    /// An error that occurs if too many patterns were given to the NFA
+    /// compiler.
+    TooManyPatterns {
+        /// The number of patterns given, which exceeds the limit.
+        given: usize,
+        /// The limit on the number of patterns.
+        limit: usize,
+    },
+    /// An error that occurs if too many states are produced while building
+    /// an NFA.
+    TooManyStates {
+        /// The minimum number of states that are desired, which exceeds the
+        /// limit.
+        given: usize,
+        /// The limit on the number of states.
+        limit: usize,
+    },
+    /// An error that occurs when NFA compilation exceeds a configured heap
+    /// limit.
+    ExceededSizeLimit {
+        /// The configured limit, in bytes.
+        limit: usize,
+    },
+    /// An error that occurs when an invalid capture group index is added to
+    /// the NFA. An "invalid" index can be one that is too big (e.g., results
+    /// in an integer overflow) or one that is discontinuous from previous
+    /// capture group indices added.
+    InvalidCaptureIndex {
+        /// The invalid index that was given.
+        index: usize,
+    },
+    /// An error that occurs when an NFA contains a Unicode word boundary, but
+    /// where the crate was compiled without the necessary data for dealing
+    /// with Unicode word boundaries.
+    UnicodeWordUnavailable,
+}
+
+impl Error {
+    /// Returns the kind of this error.
+    fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    /// Wraps a `regex_syntax` error. The wrapped error is what the
+    /// `std::error::Error::source` impl for this type returns.
+    pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
+        Error { kind: ErrorKind::Syntax(err) }
+    }
+
+    /// Creates an error reporting that `given` patterns were requested. The
+    /// limit recorded alongside it is always `PatternID::LIMIT`.
+    pub(crate) fn too_many_patterns(given: usize) -> Error {
+        let limit = PatternID::LIMIT;
+        Error { kind: ErrorKind::TooManyPatterns { given, limit } }
+    }
+
+    /// Creates an error reporting that `given` states were requested. The
+    /// limit recorded alongside it is always `StateID::LIMIT`.
+    pub(crate) fn too_many_states(given: usize) -> Error {
+        let limit = StateID::LIMIT;
+        Error { kind: ErrorKind::TooManyStates { given, limit } }
+    }
+
+    /// Creates an error reporting that the configured heap limit, `limit`
+    /// bytes, was exceeded during compilation.
+    pub(crate) fn exceeded_size_limit(limit: usize) -> Error {
+        Error { kind: ErrorKind::ExceededSizeLimit { limit } }
+    }
+
+    /// Creates an error reporting that the capture group index `index` is
+    /// invalid.
+    pub(crate) fn invalid_capture_index(index: usize) -> Error {
+        Error { kind: ErrorKind::InvalidCaptureIndex { index } }
+    }
+
+    /// Creates an error reporting that Unicode word boundary support is
+    /// needed but was not compiled in.
+    pub(crate) fn unicode_word_unavailable() -> Error {
+        Error { kind: ErrorKind::UnicodeWordUnavailable }
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {
+    /// Returns the underlying syntax error, if that is what caused this
+    /// error. Every other kind of error has no underlying source.
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        // The sourceless kinds are spelled out instead of using a wildcard
+        // so that adding a new kind forces this match to be revisited.
+        match self.kind() {
+            ErrorKind::Syntax(ref err) => Some(err),
+            ErrorKind::TooManyPatterns { .. }
+            | ErrorKind::TooManyStates { .. }
+            | ErrorKind::ExceededSizeLimit { .. }
+            | ErrorKind::InvalidCaptureIndex { .. }
+            | ErrorKind::UnicodeWordUnavailable => None,
+        }
+    }
+}
+
+impl core::fmt::Display for Error {
+    /// Writes a human readable description of this error.
+    ///
+    /// For syntax errors only a short summary is written here. The detailed
+    /// parse error is the wrapped `regex_syntax::Error`, which is exposed
+    /// through `std::error::Error::source` when the `std` feature is enabled.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        match self.kind() {
+            ErrorKind::Syntax(_) => write!(f, "error parsing regex"),
+            ErrorKind::TooManyPatterns { given, limit } => write!(
+                f,
+                "attempted to compile {} patterns, \
+                 which exceeds the limit of {}",
+                given, limit,
+            ),
+            ErrorKind::TooManyStates { given, limit } => write!(
+                f,
+                "attempted to compile {} NFA states, \
+                 which exceeds the limit of {}",
+                given, limit,
+            ),
+            ErrorKind::ExceededSizeLimit { limit } => write!(
+                f,
+                "heap usage during NFA compilation exceeded limit of {}",
+                limit,
+            ),
+            ErrorKind::InvalidCaptureIndex { index } => write!(
+                f,
+                "capture group index {} is invalid (too big or discontinuous)",
+                index,
+            ),
+            ErrorKind::UnicodeWordUnavailable => write!(
+                f,
+                "crate has been compiled without Unicode word boundary \
+                 support, but the NFA contains Unicode word boundary \
+                 assertions",
+            ),
+        }
+    }
+}
diff --git a/src/nfa/thompson/map.rs b/src/nfa/thompson/map.rs

new file mode 100644 (file)

index 0000000..79ff63c
--- /dev/null
+++ b/src/nfa/thompson/map.rs
@@ -0,0 +1,290 @@
+// This module contains a couple simple and purpose built hash maps. The key
+// trade off they make is that they serve as caches rather than true maps. That
+// is, inserting a new entry may cause eviction of another entry. This gives
+// us two things. First, there's less overhead associated with inserts and
+// lookups. Second, it lets us control our memory usage.
+//
+// These maps are used in some fairly hot code when generating NFA states for
+// large Unicode character classes.
+//
+// Instead of exposing a rich hashmap entry API, we just permit the caller to
+// produce a hash of the key directly. The hash can then be reused for both
+// lookups and insertions at the cost of leaking abstraction a bit. But these
+// are for internal use only, so it's fine.
+//
+// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
+// (almost) minimal DFA for large Unicode character classes in linear time.
+// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
+// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
+// since there's a bit more expense in the reverse direction.)
+//
+// The Utf8SuffixMap is used when compiling large Unicode character classes for
+// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
+// construction of UTF-8 automata by caching common suffixes. This doesn't
+// get the same space savings as Daciuk's algorithm, but it's basically as
+// fast as the naive approach and typically winds up using less memory (since
+// it generates smaller NFAs) despite the presence of the cache.
+//
+// These maps effectively represent caching mechanisms for CState::Sparse and
+// CState::Range, respectively. The former represents a single NFA state with
+// many transitions of equivalent priority while the latter represents a single
+// NFA state with a single transition. (Neither state ever has or is an
+// epsilon transition.) Thus, they have different key types. It's likely we
+// could make one generic map, but the machinery didn't seem worth it. They
+// are simple enough.
+
+use alloc::{vec, vec::Vec};
+
+use crate::{nfa::thompson::Transition, util::id::StateID};
+
+// Basic FNV-1a hash constants as described in:
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+//
+// These are the values for the 64-bit variant of the hash: PRIME is the FNV
+// prime and INIT is the offset basis that a hash computation starts from.
+const PRIME: u64 = 1099511628211;
+const INIT: u64 = 14695981039346656037;
+
+/// A bounded hash map where the key is a sequence of NFA transitions and the
+/// value is a pre-existing NFA state ID.
+///
+/// std's hashmap can be used for this, however, this map has two important
+/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
+/// control our memory usage by limiting the number of slots. In general, the
+/// cost here is that this map acts as a cache. That is, inserting a new entry
+/// may remove an old entry. We are okay with this, since it does not impact
+/// correctness in the cases where it is used. The only effect that dropping
+/// states from the cache has is that the resulting NFA generated may be bigger
+/// than it otherwise would be.
+///
+/// This improves benchmarks that compile large Unicode character classes,
+/// since it makes the generation of (almost) minimal UTF-8 automata faster.
+/// Specifically, one could observe the difference with std's hashmap via
+/// something like the following benchmark:
+///
+///   hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+///
+/// But to observe that difference, you'd have to modify the code to use
+/// std's hashmap.
+///
+/// It is quite possible that there is a better way to approach this problem.
+/// For example, if there happens to be a very common state that collides with
+/// a lot of less frequent states, then we could wind up with very poor caching
+/// behavior. Alas, the effectiveness of this cache has not been measured.
+/// Instead, ad hoc experiments suggest that it is "good enough." Additional
+/// smarts (such as an LRU eviction policy) have to be weighed against the
+/// amount of extra time they cost.
+#[derive(Clone, Debug)]
+pub struct Utf8BoundedMap {
+    /// The current version of this map. Only entries with matching versions
+    /// are considered during lookups. If an entry is found with a mismatched
+    /// version, then the map behaves as if the entry does not exist.
+    ///
+    /// This makes it possible to clear the map by simply incrementing the
+    /// version number instead of actually deallocating any storage.
+    version: u16,
+    /// The total number of entries this map can store.
+    capacity: usize,
+    /// The actual entries, keyed by hash. Collisions between different states
+    /// result in the old state being dropped.
+    ///
+    /// This is empty until the first call to `clear`, which allocates
+    /// `capacity` slots.
+    map: Vec<Utf8BoundedEntry>,
+}
+
+/// An entry in a `Utf8BoundedMap`.
+///
+/// The `Default` value of this type is what `Utf8BoundedMap::clear` fills
+/// freshly allocated slots with.
+#[derive(Clone, Debug, Default)]
+struct Utf8BoundedEntry {
+    /// The version of the map used to produce this entry. If this entry's
+    /// version does not match the current version of the map, then the map
+    /// should behave as if this entry does not exist.
+    version: u16,
+    /// The key, which is a sorted sequence of non-overlapping NFA transitions.
+    key: Vec<Transition>,
+    /// The state ID corresponding to the state containing the transitions in
+    /// this entry.
+    val: StateID,
+}
+
+impl Utf8BoundedMap {
+    /// Create a new bounded map with the given capacity. The map will never
+    /// grow beyond the given size.
+    ///
+    /// Note that this does not allocate. Instead, callers must call `clear`
+    /// before using this map. `clear` will allocate space if necessary.
+    ///
+    /// This avoids the need to pay for the allocation of this map when
+    /// compiling regexes that lack large Unicode character classes.
+    ///
+    /// This panics if `capacity` is zero.
+    pub fn new(capacity: usize) -> Utf8BoundedMap {
+        assert!(capacity > 0);
+        Utf8BoundedMap { version: 0, capacity, map: vec![] }
+    }
+
+    /// Clear this map of all entries, but permit the reuse of allocation
+    /// if possible.
+    ///
+    /// This must be called before the map can be used.
+    pub fn clear(&mut self) {
+        if self.map.is_empty() {
+            self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+        } else {
+            self.version = self.version.wrapping_add(1);
+            // If we loop back to version 0, then we forcefully clear the
+            // entire map. Otherwise, it might be possible to incorrectly
+            // match entries used to generate other NFAs.
+            if self.version == 0 {
+                self.map = vec![Utf8BoundedEntry::default(); self.capacity];
+            }
+        }
+    }
+
+    /// Return a hash of the given transitions.
+    ///
+    /// The value returned is the index of the slot that the key maps to, and
+    /// is therefore always less than the number of slots in this map.
+    ///
+    /// This panics if `clear` has not been called yet, since the map has no
+    /// slots until then.
+    pub fn hash(&self, key: &[Transition]) -> usize {
+        let mut h = INIT;
+        for t in key {
+            h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
+            h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
+            h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME);
+        }
+        (h as usize) % self.map.len()
+    }
+
+    /// Retrieve the cached state ID corresponding to the given key. The hash
+    /// given must have been computed with `hash` using the same key value.
+    ///
+    /// If there is no cached state with the given transitions, then None is
+    /// returned.
+    ///
+    /// A lookup never modifies the map, so this only needs a shared borrow.
+    pub fn get(&self, key: &[Transition], hash: usize) -> Option<StateID> {
+        let entry = &self.map[hash];
+        if entry.version != self.version {
+            return None;
+        }
+        // There may be a hash collision, so we need to confirm real equality.
+        if entry.key != key {
+            return None;
+        }
+        Some(entry.val)
+    }
+
+    /// Add a cached state to this map with the given key. Callers should
+    /// ensure that `state_id` points to a state that contains precisely the
+    /// NFA transitions given.
+    ///
+    /// `hash` must have been computed using the `hash` method with the same
+    /// key.
+    ///
+    /// Whatever entry previously occupied the slot is overwritten.
+    pub fn set(
+        &mut self,
+        key: Vec<Transition>,
+        hash: usize,
+        state_id: StateID,
+    ) {
+        self.map[hash] =
+            Utf8BoundedEntry { version: self.version, key, val: state_id };
+    }
+}
+
+/// A cache of suffixes used to modestly compress UTF-8 automata for large
+/// Unicode character classes.
+///
+/// As with `Utf8BoundedMap`, this acts as a cache with a fixed number of
+/// slots: inserting an entry overwrites whatever occupied its slot.
+#[derive(Clone, Debug)]
+pub struct Utf8SuffixMap {
+    /// The current version of this map. Only entries with matching versions
+    /// are considered during lookups. If an entry is found with a mismatched
+    /// version, then the map behaves as if the entry does not exist.
+    ///
+    /// This makes it possible to clear the map by simply incrementing the
+    /// version number instead of actually deallocating any storage.
+    version: u16,
+    /// The total number of entries this map can store.
+    capacity: usize,
+    /// The actual entries, keyed by hash. Collisions between different states
+    /// result in the old state being dropped.
+    ///
+    /// This is empty until the first call to `clear`, which allocates
+    /// `capacity` slots.
+    map: Vec<Utf8SuffixEntry>,
+}
+
+/// A key that uniquely identifies an NFA state. It is a triple that represents
+/// a transition from one state for a particular byte range.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Utf8SuffixKey {
+    /// The state component of the triple.
+    pub from: StateID,
+    /// The first byte of the transition's byte range.
+    pub start: u8,
+    /// The last byte of the transition's byte range.
+    // NOTE(review): presumably inclusive, like other byte ranges in this
+    // crate. Confirm against the compiler that builds these keys.
+    pub end: u8,
+}
+
+/// An entry in a `Utf8SuffixMap`.
+///
+/// The `Default` value of this type is what `Utf8SuffixMap::clear` fills
+/// freshly allocated slots with.
+#[derive(Clone, Debug, Default)]
+struct Utf8SuffixEntry {
+    /// The version of the map used to produce this entry. If this entry's
+    /// version does not match the current version of the map, then the map
+    /// should behave as if this entry does not exist.
+    version: u16,
+    /// The key, which consists of a transition in a particular state.
+    key: Utf8SuffixKey,
+    /// The identifier that the transition in the key maps to.
+    val: StateID,
+}
+
+impl Utf8SuffixMap {
+    /// Create a new bounded map with the given capacity. The map will never
+    /// grow beyond the given size.
+    ///
+    /// Note that this does not allocate. Instead, callers must call `clear`
+    /// before using this map. `clear` will allocate space if necessary.
+    ///
+    /// This avoids the need to pay for the allocation of this map when
+    /// compiling regexes that lack large Unicode character classes.
+    ///
+    /// This panics if `capacity` is zero.
+    pub fn new(capacity: usize) -> Utf8SuffixMap {
+        assert!(capacity > 0);
+        Utf8SuffixMap { version: 0, capacity, map: vec![] }
+    }
+
+    /// Clear this map of all entries, but permit the reuse of allocation
+    /// if possible.
+    ///
+    /// This must be called before the map can be used.
+    pub fn clear(&mut self) {
+        if self.map.is_empty() {
+            self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+        } else {
+            self.version = self.version.wrapping_add(1);
+            // If we loop back to version 0, then entries written the last
+            // time the version was 0 would look current again, so we
+            // forcefully reset every slot.
+            if self.version == 0 {
+                self.map = vec![Utf8SuffixEntry::default(); self.capacity];
+            }
+        }
+    }
+
+    /// Return a hash of the given transition.
+    ///
+    /// The value returned is the index of the slot that the key maps to, and
+    /// is therefore always less than the number of slots in this map.
+    ///
+    /// This panics if `clear` has not been called yet, since the map has no
+    /// slots until then.
+    pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
+        // Basic FNV-1a hash, using the module-level `INIT` and `PRIME`
+        // constants.
+        let mut h = INIT;
+        h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME);
+        h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
+        h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
+        (h as usize) % self.map.len()
+    }
+
+    /// Retrieve the cached state ID corresponding to the given key. The hash
+    /// given must have been computed with `hash` using the same key value.
+    ///
+    /// If there is no cached state with the given key, then None is returned.
+    ///
+    /// A lookup never modifies the map, so this only needs a shared borrow.
+    pub fn get(&self, key: &Utf8SuffixKey, hash: usize) -> Option<StateID> {
+        let entry = &self.map[hash];
+        if entry.version != self.version {
+            return None;
+        }
+        // There may be a hash collision, so we need to confirm real equality.
+        if key != &entry.key {
+            return None;
+        }
+        Some(entry.val)
+    }
+
+    /// Add a cached state to this map with the given key. Callers should
+    /// ensure that `state_id` points to a state that contains precisely the
+    /// NFA transition given.
+    ///
+    /// `hash` must have been computed using the `hash` method with the same
+    /// key.
+    ///
+    /// Whatever entry previously occupied the slot is overwritten.
+    pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
+        self.map[hash] =
+            Utf8SuffixEntry { version: self.version, key, val: state_id };
+    }
+}
diff --git a/src/nfa/thompson/mod.rs b/src/nfa/thompson/mod.rs

new file mode 100644 (file)

index 0000000..88a438e
--- /dev/null
+++ b/src/nfa/thompson/mod.rs
@@ -0,0 +1,1555 @@
+use core::{convert::TryFrom, fmt, mem, ops::Range};
+
+use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
+
+use crate::util::{
+    alphabet::{self, ByteClassSet},
+    decode_last_utf8, decode_utf8,
+    id::{IteratorIDExt, PatternID, PatternIDIter, StateID},
+    is_word_byte, is_word_char_fwd, is_word_char_rev,
+};
+
+pub use self::{
+    compiler::{Builder, Config},
+    error::Error,
+};
+
+mod compiler;
+mod error;
+mod map;
+pub mod pikevm;
+mod range_trie;
+
+/// A map from capture group name to its corresponding capture index.
+///
+/// Since there are always two slots for each capture index, the pair of slots
+/// corresponding to the capture index for a pattern ID of 0 are indexed at
+/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`.
+///
+/// This type is actually wrapped inside a Vec indexed by pattern ID on the
+/// NFA, since multiple patterns may have the same capture group name.
+///
+/// Note that this is somewhat of a sub-optimal representation, since it
+/// requires a hashmap for each pattern. A better representation would be
+/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
+/// up a capture index by name without producing a `Arc<str>`, which requires
+/// an allocation. To fix this, I think we'd need to define our own unsized
+/// type or something?
+#[cfg(feature = "std")]
+type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>;
+// The `alloc` crate does not provide a `HashMap`, so a `BTreeMap` is used
+// instead when the `std` feature is disabled.
+#[cfg(not(feature = "std"))]
+type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>;
+
+// The NFA API below is not something I'm terribly proud of at the moment. In
+// particular, it supports both mutating the NFA and actually using the NFA to
+// perform a search. I think combining these two things muddies the waters a
+// bit too much.
+//
+// I think the issue is that I saw the compiler as the 'builder,' and where
+// the compiler had the ability to manipulate the internal state of the NFA.
+// However, one of my goals was to make it possible for others to build their
+// own NFAs in a way that is *not* coupled to the regex-syntax crate.
+//
+// So I think really, there should be an NFA, an NFABuilder and then the
+// internal compiler which uses the NFABuilder API to build an NFA. Alas, at
+// the time of writing, I kind of ran out of steam.
+
+/// A fully compiled Thompson NFA.
+///
+/// The states of the NFA are indexed by state IDs, which are how transitions
+/// are expressed.
+#[derive(Clone)]
+pub struct NFA {
+    /// The state list. This list is guaranteed to be indexable by all starting
+    /// state IDs, and it is also guaranteed to contain at most one `Match`
+    /// state for each pattern compiled into this NFA. (A pattern may not have
+    /// a corresponding `Match` state if a `Match` state is impossible to
+    /// reach.)
+    states: Vec<State>,
+    /// The anchored starting state of this NFA.
+    start_anchored: StateID,
+    /// The unanchored starting state of this NFA.
+    start_unanchored: StateID,
+    /// The starting states for each individual pattern. Starting at any
+    /// of these states will result in only an anchored search for the
+    /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+    /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+    /// are always equivalent.
+    start_pattern: Vec<StateID>,
+    /// A map from PatternID to its corresponding range of capture slots. Each
+    /// range is guaranteed to be contiguous with the previous range. The
+    /// end of the last range corresponds to the total number of slots needed
+    /// for this NFA.
+    patterns_to_slots: Vec<Range<usize>>,
+    /// A map from capture name to its corresponding index. So e.g., given
+    /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name
+    /// 'word' for pattern ID=0 would correspond to the index '3'. Its
+    /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'.
+    capture_name_to_index: Vec<CaptureNameMap>,
+    /// A map from pattern ID to capture group index to name, if one exists.
+    /// This is effectively the inverse of 'capture_name_to_index'. The outer
+    /// vec is indexed by pattern ID, while the inner vec is indexed by capture
+    /// index offset for the corresponding pattern.
+    ///
+    /// The first capture group for each pattern is always unnamed and is thus
+    /// always None.
+    capture_index_to_name: Vec<Vec<Option<Arc<str>>>>,
+    /// A representation of equivalence classes over the transitions in this
+    /// NFA. Two bytes in the same equivalence class must not discriminate
+    /// between a match or a non-match. This map can be used to shrink the
+    /// total size of a DFA's transition table with a small match-time cost.
+    ///
+    /// Note that the NFA's transitions are *not* defined in terms of these
+    /// equivalence classes. The NFA's transitions are defined on the original
+    /// byte values. For the most part, this is because they wouldn't really
+    /// help the NFA much since the NFA already uses a sparse representation
+    /// to represent transitions. Byte classes are most effective in a dense
+    /// representation.
+    byte_class_set: ByteClassSet,
+    /// Various facts about this NFA, which can be used to improve failure
+    /// modes (e.g., rejecting DFA construction if an NFA has Unicode word
+    /// boundaries) or for performing optimizations (avoiding an increase in
+    /// states if there are no look-around states).
+    facts: Facts,
+    /// Heap memory used indirectly by NFA states. Since each state might use a
+    /// different amount of heap, we need to keep track of this incrementally.
+    memory_states: usize,
+}
+
+impl NFA {
+    /// Return a new configuration for building a Thompson NFA.
+    ///
+    /// This is a convenience routine for `Config::new()`.
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    /// Return a new builder for constructing a Thompson NFA.
+    ///
+    /// This is a convenience routine for `Builder::new()`.
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+
+    /// Returns an NFA with no states. Its match semantics are unspecified.
+    ///
+    /// An empty NFA is useful as a starting point for building one. It is
+    /// itself not intended to be used for matching. For example, its starting
+    /// state identifiers are configured to be `0`, but since it has no states,
+    /// the identifiers are invalid.
+    ///
+    /// If you need an NFA that never matches anything and can be correctly
+    /// used for matching, use [`NFA::never_match`].
+    #[inline]
+    pub fn empty() -> NFA {
+        NFA {
+            states: vec![],
+            start_anchored: StateID::ZERO,
+            start_unanchored: StateID::ZERO,
+            start_pattern: vec![],
+            patterns_to_slots: vec![],
+            capture_name_to_index: vec![],
+            capture_index_to_name: vec![],
+            byte_class_set: ByteClassSet::empty(),
+            facts: Facts::default(),
+            memory_states: 0,
+        }
+    }
+
+    /// Returns an NFA with a single regex that always matches at every
+    /// position.
+    ///
+    /// The NFA is built by adding one match state to an empty NFA and then
+    /// finishing a pattern that starts at that state.
+    #[inline]
+    pub fn always_match() -> NFA {
+        let mut nfa = NFA::empty();
+        // Since we're only adding one pattern, these are guaranteed to work.
+        let start = nfa.add_match().unwrap();
+        // The first state added to an empty NFA must have been given ID 0.
+        assert_eq!(start.as_usize(), 0);
+        let pid = nfa.finish_pattern(start).unwrap();
+        // Likewise, the first pattern finished must have been given ID 0.
+        assert_eq!(pid.as_usize(), 0);
+        nfa
+    }
+
+    /// Returns an NFA that never matches at any position. It contains no
+    /// regexes.
+    ///
+    /// Unlike [`NFA::empty`], the NFA returned is not stateless: one state is
+    /// added to it via `add_fail`.
+    #[inline]
+    pub fn never_match() -> NFA {
+        let mut nfa = NFA::empty();
+        // Since we're only adding one state, this can never fail.
+        nfa.add_fail().unwrap();
+        nfa
+    }
+
+    /// Return the number of states in this NFA.
+    ///
+    /// This is the length of the slice returned by `states`.
+    ///
+    /// This is guaranteed to be no bigger than [`StateID::LIMIT`].
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.states.len()
+    }
+
+    /// Returns the total number of distinct match states in this NFA.
+    /// Stated differently, this returns the total number of regex patterns
+    /// used to build this NFA.
+    ///
+    /// This may return zero if the NFA was constructed with no patterns. In
+    /// this case, and only this case, the NFA can never produce a match for
+    /// any input.
+    ///
+    /// This is guaranteed to be no bigger than [`PatternID::LIMIT`].
+    #[inline]
+    pub fn pattern_len(&self) -> usize {
+        // One start state is recorded per pattern, so the number of start
+        // states is the number of patterns.
+        self.start_pattern.len()
+    }
+
+    /// Returns the ID of the pattern that is currently being compiled into
+    /// this NFA.
+    fn current_pattern_id(&self) -> PatternID {
+        // An entry is pushed to 'start_pattern' only once a pattern has been
+        // completely compiled, so the number of entries is exactly the ID of
+        // the pattern still in progress.
+        let in_progress = self.start_pattern.len();
+        // The conversion cannot fail, because 'start_pattern' is never
+        // permitted to hold more entries than PatternID can address.
+        PatternID::new(in_progress).unwrap()
+    }
+
+    /// Returns the total number of capturing groups in this NFA.
+    ///
+    /// The count includes the special 0th capture group, which is always
+    /// present and records the start and end offset of the entire match.
+    ///
+    /// This is a convenience routine for `nfa.capture_slot_len() / 2`.
+    #[inline]
+    pub fn capture_len(&self) -> usize {
+        let slot_count = self.capture_slot_len();
+        // NFA construction always allocates slots in pairs, so this assert
+        // is guaranteed to pass.
+        assert_eq!(slot_count % 2, 0, "capture slots must be divisible by 2");
+        slot_count / 2
+    }
+
+    /// Returns the total number of capturing slots in this NFA.
+    ///
+    /// Every capturing group has precisely two capturing slots in the NFA, so
+    /// this value is guaranteed to be a multiple of 2.
+    #[inline]
+    pub fn capture_slot_len(&self) -> usize {
+        // The slot ranges are contiguous and stored in pattern order, so the
+        // end of the last range is the total. No patterns means no slots.
+        match self.patterns_to_slots.last() {
+            Some(range) => range.end,
+            None => 0,
+        }
+    }
+
+    /// Return a range of capture slots for the given pattern.
+    ///
+    /// The range returned is half-open and is guaranteed to be contiguous
+    /// with ranges for adjacent patterns.
+    ///
+    /// This panics if the given pattern ID is greater than or equal to the
+    /// number of patterns in this NFA.
+    #[inline]
+    pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> {
+        self.patterns_to_slots[pid].clone()
+    }
+
+    /// Look up the capture group index for the given name in the given
+    /// pattern. If the pattern has no capture group with that name, then
+    /// this returns `None`.
+    ///
+    /// If the given pattern ID is invalid, then this panics.
+    #[inline]
+    pub fn capture_name_to_index(
+        &self,
+        pid: PatternID,
+        name: &str,
+    ) -> Option<usize> {
+        assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID");
+        // Each pattern has its own name map, since different patterns may
+        // reuse the same capture group name.
+        let names = &self.capture_name_to_index[pid];
+        names.get(name).copied()
+    }
+
+    // TODO: add iterators over capture group names.
+    // Do we also permit indexing?
+
+    /// Returns an iterator that yields every pattern ID in this NFA.
+    #[inline]
+    pub fn patterns(&self) -> PatternIter {
+        let it = PatternID::iter(self.pattern_len());
+        PatternIter { it, _marker: core::marker::PhantomData }
+    }
+
+    /// Return the ID of the initial anchored state of this NFA.
+    ///
+    /// This is whatever was last given to `set_start_anchored`, or `0` for
+    /// an NFA created by [`NFA::empty`] that has not had it set.
+    #[inline]
+    pub fn start_anchored(&self) -> StateID {
+        self.start_anchored
+    }
+
+    /// Set the anchored starting state ID for this NFA.
+    ///
+    /// The ID is stored as given. No check is made here that it refers to a
+    /// state that exists in this NFA.
+    #[inline]
+    pub fn set_start_anchored(&mut self, id: StateID) {
+        self.start_anchored = id;
+    }
+
+    /// Return the ID of the initial unanchored state of this NFA.
+    ///
+    /// This is whatever was last given to `set_start_unanchored`, or `0` for
+    /// an NFA created by [`NFA::empty`] that has not had it set.
+    #[inline]
+    pub fn start_unanchored(&self) -> StateID {
+        self.start_unanchored
+    }
+
+    /// Set the unanchored starting state ID for this NFA.
+    ///
+    /// The ID is stored as given. No check is made here that it refers to a
+    /// state that exists in this NFA.
+    #[inline]
+    pub fn set_start_unanchored(&mut self, id: StateID) {
+        self.start_unanchored = id;
+    }
+
+    /// Return the ID of the initial anchored state for the given pattern.
+    ///
+    /// Starting a search at the state returned results in an anchored search
+    /// for only the given pattern.
+    ///
+    /// If the pattern doesn't exist in this NFA, then this panics.
+    #[inline]
+    pub fn start_pattern(&self, pid: PatternID) -> StateID {
+        self.start_pattern[pid]
+    }
+
+    /// Get the byte class set for this NFA.
+    ///
+    /// The set describes equivalence classes of bytes. Note that the NFA's
+    /// own transitions are defined on the original byte values and not on
+    /// these classes.
+    #[inline]
+    pub fn byte_class_set(&self) -> &ByteClassSet {
+        &self.byte_class_set
+    }
+
+    /// Return a reference to the NFA state corresponding to the given ID.
+    ///
+    /// NOTE(review): presumably this panics if the ID does not refer to a
+    /// state in this NFA, like the other indexing accessors on this type.
+    /// Confirm against the `Index` impl used here.
+    #[inline]
+    pub fn state(&self, id: StateID) -> &State {
+        &self.states[id]
+    }
+
+    /// Returns a slice of all states in this NFA.
+    ///
+    /// The slice returned may be indexed by a `StateID` generated by one of
+    /// this NFA's `add_*` routines.
+    #[inline]
+    pub fn states(&self) -> &[State] {
+        &self.states
+    }
+
+    /// Returns true if and only if the anchored and unanchored starting
+    /// states of this NFA are the same state.
+    #[inline]
+    pub fn is_always_start_anchored(&self) -> bool {
+        self.start_anchored() == self.start_unanchored()
+    }
+
+    /// Reports the `has_any_look` fact recorded for this NFA.
+    ///
+    /// NOTE(review): presumably this is true when the NFA contains at least
+    /// one look-around state. Confirm against the definition of `Facts`.
+    #[inline]
+    pub fn has_any_look(&self) -> bool {
+        self.facts.has_any_look()
+    }
+
+    /// Reports the `has_any_anchor` fact recorded for this NFA.
+    ///
+    /// NOTE(review): presumably this is true when the NFA contains at least
+    /// one anchor assertion. Confirm against the definition of `Facts`.
+    #[inline]
+    pub fn has_any_anchor(&self) -> bool {
+        self.facts.has_any_anchor()
+    }
+
+    /// Returns true if either `has_word_boundary_unicode` or
+    /// `has_word_boundary_ascii` returns true for this NFA.
+    #[inline]
+    pub fn has_word_boundary(&self) -> bool {
+        self.has_word_boundary_unicode() || self.has_word_boundary_ascii()
+    }
+
+    /// Reports the `has_word_boundary_unicode` fact recorded for this NFA.
+    ///
+    /// NOTE(review): presumably this is true when the NFA contains a Unicode
+    /// word boundary assertion. Confirm against the definition of `Facts`.
+    #[inline]
+    pub fn has_word_boundary_unicode(&self) -> bool {
+        self.facts.has_word_boundary_unicode()
+    }
+
+    /// Reports the `has_word_boundary_ascii` fact recorded for this NFA.
+    ///
+    /// NOTE(review): presumably this is true when the NFA contains an ASCII
+    /// word boundary assertion. Confirm against the definition of `Facts`.
+    #[inline]
+    pub fn has_word_boundary_ascii(&self) -> bool {
+        self.facts.has_word_boundary_ascii()
+    }
+
+    /// Returns the memory usage, in bytes, of this NFA.
+    ///
+    /// The value is the sum of the size of the state list, the heap memory
+    /// tracked separately for individual states and the size of the list of
+    /// per-pattern start states.
+    ///
+    /// This does **not** include the stack size used up by this NFA. To
+    /// compute that, use `std::mem::size_of::<NFA>()`.
+    // NOTE(review): the capture slot ranges and the capture name tables are
+    // not counted here. Confirm whether that omission is intentional.
+    #[inline]
+    pub fn memory_usage(&self) -> usize {
+        self.states.len() * mem::size_of::<State>()
+            + self.memory_states
+            + self.start_pattern.len() * mem::size_of::<StateID>()
+    }
+
+    // Why do we define a bunch of 'add_*' routines below instead of just
+    // defining a single 'add' routine that accepts a 'State'? Indeed, for most
+    // of the 'add_*' routines below, such a simple API would be more than
+    // appropriate. Unfortunately, adding capture states and, to a lesser
+    // extent, match states, is a bit more complex. Namely, when we add a
+    // capture state, we *really* want to know the corresponding capture
+    // group's name and index and what not, so that we can update other state
+    // inside this NFA. But, e.g., the capture group name is not and should
+    // not be included in 'State::Capture'. So what are our choices?
+    //
+    // 1) Define one 'add' and require some additional optional parameters.
+    // This feels quite ugly, and adds unnecessary complexity to more common
+    // and simpler cases.
+    //
+    // 2) Do what we do below. The sad thing is that our API is bigger with
+    // more methods. But each method is very specific and hopefully simple.
+    //
+    // 3) Define a new enum, say, 'StateWithInfo', or something that permits
+    // providing both a State and some extra ancillary info in some cases. This
+    // doesn't seem too bad to me, but seems slightly worse than (2) because of
+    // the additional type required.
+    //
+    // 4) Abandon the idea that we have to specify things like the capture
+    // group name when we add the Capture state to the NFA. We would then need
+    // to add other methods that permit the caller to add this additional state
+    // "out of band." Other than it introducing some additional complexity, I
+    // decided against this because I wanted the NFA builder API to make it
+    // as hard as possible to build a bad or invalid NFA. Using the approach
+    // below, as you'll see, permits us to do a lot of strict checking of our
+    // inputs and return an error if we see something we don't expect.
+
+    pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> {
+        self.byte_class_set.set_range(range.start, range.end);
+        self.add_state(State::Range { range })
+    }
+
+    pub fn add_sparse(
+        &mut self,
+        sparse: SparseTransitions,
+    ) -> Result<StateID, Error> {
+        for range in sparse.ranges.iter() {
+            self.byte_class_set.set_range(range.start, range.end);
+        }
+        self.add_state(State::Sparse(sparse))
+    }
+
+    pub fn add_look(
+        &mut self,
+        next: StateID,
+        look: Look,
+    ) -> Result<StateID, Error> {
+        self.facts.set_has_any_look(true);
+        look.add_to_byteset(&mut self.byte_class_set);
+        match look {
+            Look::StartLine
+            | Look::EndLine
+            | Look::StartText
+            | Look::EndText => {
+                self.facts.set_has_any_anchor(true);
+            }
+            Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => {
+                self.facts.set_has_word_boundary_unicode(true);
+            }
+            Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => {
+                self.facts.set_has_word_boundary_ascii(true);
+            }
+        }
+        self.add_state(State::Look { look, next })
+    }
+
+    /// Add a union state with epsilon transitions to each of the given
+    /// alternates and return its ID.
+    ///
+    /// The alternates are stored in the order given; `State::Union` documents
+    /// that earlier alternates are preferred over later ones.
+    pub fn add_union(
+        &mut self,
+        alternates: Box<[StateID]>,
+    ) -> Result<StateID, Error> {
+        self.add_state(State::Union { alternates })
+    }
+
+    pub fn add_capture_start(
+        &mut self,
+        next_id: StateID,
+        capture_index: u32,
+        name: Option<Arc<str>>,
+    ) -> Result<StateID, Error> {
+        let pid = self.current_pattern_id();
+        let capture_index = match usize::try_from(capture_index) {
+            Err(_) => {
+                return Err(Error::invalid_capture_index(core::usize::MAX))
+            }
+            Ok(capture_index) => capture_index,
+        };
+        // Do arithmetic to find our absolute slot index first, to make sure
+        // the index is at least possibly valid (doesn't overflow).
+        let relative_slot = match capture_index.checked_mul(2) {
+            Some(relative_slot) => relative_slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        let slot = match relative_slot.checked_add(self.capture_slot_len()) {
+            Some(slot) => slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        // Make sure we have space to insert our (pid,index)|-->name mapping.
+        if pid.as_usize() >= self.capture_index_to_name.len() {
+            // Note that we require that if you're adding capturing groups,
+            // then there must be at least one capturing group per pattern.
+            // Moreover, whenever we expand our space here, it should always
+            // first be for the first capture group (at index==0).
+            if pid.as_usize() > self.capture_index_to_name.len()
+                || capture_index > 0
+            {
+                return Err(Error::invalid_capture_index(capture_index));
+            }
+            self.capture_name_to_index.push(CaptureNameMap::new());
+            self.capture_index_to_name.push(vec![]);
+        }
+        if capture_index >= self.capture_index_to_name[pid].len() {
+            // We require that capturing groups are added in correspondence
+            // to their index. So no discontinuous indices. This is likely
+            // overly strict, but also makes it simpler to provide guarantees
+            // about our capturing group data.
+            if capture_index > self.capture_index_to_name[pid].len() {
+                return Err(Error::invalid_capture_index(capture_index));
+            }
+            self.capture_index_to_name[pid].push(None);
+        }
+        if let Some(ref name) = name {
+            self.capture_name_to_index[pid]
+                .insert(Arc::clone(name), capture_index);
+        }
+        self.capture_index_to_name[pid][capture_index] = name;
+        self.add_state(State::Capture { next: next_id, slot })
+    }
+
+    pub fn add_capture_end(
+        &mut self,
+        next_id: StateID,
+        capture_index: u32,
+    ) -> Result<StateID, Error> {
+        let pid = self.current_pattern_id();
+        let capture_index = match usize::try_from(capture_index) {
+            Err(_) => {
+                return Err(Error::invalid_capture_index(core::usize::MAX))
+            }
+            Ok(capture_index) => capture_index,
+        };
+        // If we haven't already added this capture group via a corresponding
+        // 'add_capture_start' call, then we consider the index given to be
+        // invalid.
+        if pid.as_usize() >= self.capture_index_to_name.len()
+            || capture_index >= self.capture_index_to_name[pid].len()
+        {
+            return Err(Error::invalid_capture_index(capture_index));
+        }
+        // Since we've already confirmed that this capture index is invalid
+        // and has a corresponding starting slot, we know the multiplcation
+        // has already been done and succeeded.
+        let relative_slot_start = capture_index.checked_mul(2).unwrap();
+        let relative_slot = match relative_slot_start.checked_add(1) {
+            Some(relative_slot) => relative_slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        let slot = match relative_slot.checked_add(self.capture_slot_len()) {
+            Some(slot) => slot,
+            None => return Err(Error::invalid_capture_index(capture_index)),
+        };
+        self.add_state(State::Capture { next: next_id, slot })
+    }
+
+    /// Add a new fail state to this NFA and return its state ID.
+    pub fn add_fail(&mut self) -> Result<StateID, Error> {
+        self.add_state(State::Fail)
+    }
+
+    /// Add a new match state to this NFA and return its state ID.
+    pub fn add_match(&mut self) -> Result<StateID, Error> {
+        let pattern_id = self.current_pattern_id();
+        let sid = self.add_state(State::Match { id: pattern_id })?;
+        Ok(sid)
+    }
+
+    /// Finish compiling the current pattern and return its identifier. The
+    /// given ID should be the state ID corresponding to the anchored starting
+    /// state for matching this pattern.
+    pub fn finish_pattern(
+        &mut self,
+        start_id: StateID,
+    ) -> Result<PatternID, Error> {
+        // We've gotta make sure that we never permit the user to add more
+        // patterns than we can identify. So if we're already at the limit,
+        // then return an error. This is somewhat non-ideal since this won't
+        // result in an error until trying to complete the compilation of a
+        // pattern instead of starting it.
+        if self.start_pattern.len() >= PatternID::LIMIT {
+            return Err(Error::too_many_patterns(
+                self.start_pattern.len().saturating_add(1),
+            ));
+        }
+        let pid = self.current_pattern_id();
+        self.start_pattern.push(start_id);
+        // Add the number of new slots created by this pattern. This is always
+        // equivalent to '2 * caps.len()', where 'caps.len()' is the number of
+        // new capturing groups introduced by the pattern we're finishing.
+        let new_cap_groups = self
+            .capture_index_to_name
+            .get(pid.as_usize())
+            .map_or(0, |caps| caps.len());
+        let new_slots = match new_cap_groups.checked_mul(2) {
+            Some(new_slots) => new_slots,
+            None => {
+                // Just return the biggest index that we know exists.
+                let index = new_cap_groups.saturating_sub(1);
+                return Err(Error::invalid_capture_index(index));
+            }
+        };
+        let slot_start = self.capture_slot_len();
+        self.patterns_to_slots.push(slot_start..(slot_start + new_slots));
+        Ok(pid)
+    }
+
+    /// Append the given state to this NFA and return its ID.
+    ///
+    /// The ID of the new state is its position in the state list. If that
+    /// position cannot be represented as a `StateID`, then a "too many
+    /// states" error is returned and the state is not added.
+    fn add_state(&mut self, state: State) -> Result<StateID, Error> {
+        let id = StateID::new(self.states.len())
+            .map_err(|_| Error::too_many_states(self.states.len()))?;
+        // Keep a running total of per-state heap usage so that 'memory_usage'
+        // doesn't need to visit every state.
+        self.memory_states += state.memory_usage();
+        self.states.push(state);
+        Ok(id)
+    }
+
+    /// Remap the transitions in every state of this NFA using the given map.
+    /// The given map should be indexed according to state ID namespace used by
+    /// the transitions of the states currently in this NFA.
+    ///
+    /// This may be used during the final phases of an NFA compiler, which
+    /// turns its intermediate NFA into the final NFA. Remapping may be
+    /// required to bring the state pointers from the intermediate NFA to the
+    /// final NFA.
+    pub fn remap(&mut self, old_to_new: &[StateID]) {
+        for state in &mut self.states {
+            state.remap(old_to_new);
+        }
+        self.start_anchored = old_to_new[self.start_anchored];
+        self.start_unanchored = old_to_new[self.start_unanchored];
+        for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() {
+            *id = old_to_new[*id];
+        }
+    }
+
+    /// Clear this NFA such that it has zero states and is otherwise "empty."
+    ///
+    /// An empty NFA is useful as a starting point for building one. It is
+    /// itself not intended to be used for matching. For example, its starting
+    /// state identifiers are configured to be `0`, but since it has no states,
+    /// the identifiers are invalid.
+    pub fn clear(&mut self) {
+        self.states.clear();
+        self.start_anchored = StateID::ZERO;
+        self.start_unanchored = StateID::ZERO;
+        self.start_pattern.clear();
+        self.patterns_to_slots.clear();
+        self.capture_name_to_index.clear();
+        self.capture_index_to_name.clear();
+        self.byte_class_set = ByteClassSet::empty();
+        self.facts = Facts::default();
+        self.memory_states = 0;
+    }
+}
+
+impl fmt::Debug for NFA {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "thompson::NFA(")?;
+        for (sid, state) in self.states.iter().with_state_ids() {
+            let status = if sid == self.start_anchored {
+                '^'
+            } else if sid == self.start_unanchored {
+                '>'
+            } else {
+                ' '
+            };
+            writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
+        }
+        if self.pattern_len() > 1 {
+            writeln!(f, "")?;
+            for pid in self.patterns() {
+                let sid = self.start_pattern(pid);
+                writeln!(
+                    f,
+                    "START({:06?}): {:?}",
+                    pid.as_usize(),
+                    sid.as_usize()
+                )?;
+            }
+        }
+        writeln!(f, "")?;
+        writeln!(
+            f,
+            "transition equivalence classes: {:?}",
+            self.byte_class_set().byte_classes()
+        )?;
+        writeln!(f, ")")?;
+        Ok(())
+    }
+}
+
+/// A state in a final compiled NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+    /// A state that transitions to `next` if and only if the current input
+    /// byte is in the range `[start, end]` (inclusive).
+    ///
+    /// This is a special case of Sparse in that it encodes only one transition
+    /// (and therefore avoids the allocation).
+    Range { range: Transition },
+    /// A state with possibly many transitions, represented in a sparse
+    /// fashion. Transitions are ordered lexicographically by input range. As
+    /// such, this may only be used when every transition has equal priority.
+    /// (In practice, this is only used for encoding UTF-8 automata.)
+    Sparse(SparseTransitions),
+    /// A conditional epsilon transition satisfied via some sort of
+    /// look-around.
+    Look { look: Look, next: StateID },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union { alternates: Box<[StateID]> },
+    /// An empty state that records a capture location.
+    ///
+    /// From the perspective of finite automata, this is precisely equivalent
+    /// to an epsilon transition, but serves the purpose of instructing NFA
+    /// simulations to record additional state when the finite state machine
+    /// passes through this epsilon transition.
+    ///
+    /// These transitions are treated as epsilon transitions with no additional
+    /// effects in DFAs.
+    ///
+    /// 'slot' in this context refers to the specific capture group offset that
+    /// is being recorded. Each capturing group has two slots corresponding to
+    /// the start and end of the matching portion of that group.
+    Capture { next: StateID, slot: usize },
+    /// A state that cannot be transitioned out of. If a search reaches this
+    /// state, then no match is possible and the search should terminate.
+    Fail,
+    /// A match state. There is exactly one such occurrence of this state for
+    /// each regex compiled into the NFA.
+    Match { id: PatternID },
+}
+
+impl State {
+    /// Returns true if and only if this state contains one or more epsilon
+    /// transitions.
+    #[inline]
+    pub fn is_epsilon(&self) -> bool {
+        match *self {
+            State::Range { .. }
+            | State::Sparse { .. }
+            | State::Fail
+            | State::Match { .. } => false,
+            State::Look { .. }
+            | State::Union { .. }
+            | State::Capture { .. } => true,
+        }
+    }
+
+    /// Returns the heap memory usage of this NFA state in bytes.
+    fn memory_usage(&self) -> usize {
+        match *self {
+            State::Range { .. }
+            | State::Look { .. }
+            | State::Capture { .. }
+            | State::Match { .. }
+            | State::Fail => 0,
+            State::Sparse(SparseTransitions { ref ranges }) => {
+                ranges.len() * mem::size_of::<Transition>()
+            }
+            State::Union { ref alternates } => {
+                alternates.len() * mem::size_of::<StateID>()
+            }
+        }
+    }
+
+    /// Remap the transitions in this state using the given map. Namely, the
+    /// given map should be indexed according to the transitions currently
+    /// in this state.
+    ///
+    /// This is used during the final phase of the NFA compiler, which turns
+    /// its intermediate NFA into the final NFA.
+    fn remap(&mut self, remap: &[StateID]) {
+        match *self {
+            State::Range { ref mut range } => range.next = remap[range.next],
+            State::Sparse(SparseTransitions { ref mut ranges }) => {
+                for r in ranges.iter_mut() {
+                    r.next = remap[r.next];
+                }
+            }
+            State::Look { ref mut next, .. } => *next = remap[*next],
+            State::Union { ref mut alternates } => {
+                for alt in alternates.iter_mut() {
+                    *alt = remap[*alt];
+                }
+            }
+            State::Capture { ref mut next, .. } => *next = remap[*next],
+            State::Fail => {}
+            State::Match { .. } => {}
+        }
+    }
+}
+
+impl fmt::Debug for State {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            State::Range { ref range } => range.fmt(f),
+            State::Sparse(SparseTransitions { ref ranges }) => {
+                let rs = ranges
+                    .iter()
+                    .map(|t| format!("{:?}", t))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                write!(f, "sparse({})", rs)
+            }
+            State::Look { ref look, next } => {
+                write!(f, "{:?} => {:?}", look, next.as_usize())
+            }
+            State::Union { ref alternates } => {
+                let alts = alternates
+                    .iter()
+                    .map(|id| format!("{:?}", id.as_usize()))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                write!(f, "alt({})", alts)
+            }
+            State::Capture { next, slot } => {
+                write!(f, "capture({:?}) => {:?}", slot, next.as_usize())
+            }
+            State::Fail => write!(f, "FAIL"),
+            State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()),
+        }
+    }
+}
+
+/// A collection of facts about an NFA.
+///
+/// There are no real cohesive principles behind what gets put in here. For
+/// the most part, it is implementation driven.
+#[derive(Clone, Copy, Debug, Default)]
+struct Facts {
+    /// Various yes/no facts about this NFA.
+    ///
+    /// NOTE(review): each fact presumably occupies the bit whose index is
+    /// given as the first argument to 'define_bool!' below; confirm against
+    /// the macro's definition.
+    bools: u16,
+}
+
+impl Facts {
+    // Each line declares one fact along with the names of its getter and
+    // setter. The setters are called by 'NFA::add_look' as look-around states
+    // are added.
+    define_bool!(0, has_any_look, set_has_any_look);
+    define_bool!(1, has_any_anchor, set_has_any_anchor);
+    define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode);
+    define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii);
+}
+
+/// A sequence of transitions used to represent a sparse state.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SparseTransitions {
+    /// The transitions of this state.
+    ///
+    /// `matches_byte` stops scanning at the first transition whose `start`
+    /// is greater than the byte being looked up, so the transitions are
+    /// expected to be sorted in ascending order.
+    pub ranges: Box<[Transition]>,
+}
+
+impl SparseTransitions {
+    pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+        haystack.get(at).and_then(|&b| self.matches_byte(b))
+    }
+
+    pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> {
+        unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+    }
+
+    pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+        for t in self.ranges.iter() {
+            if t.start > byte {
+                break;
+            } else if t.matches_byte(byte) {
+                return Some(t.next);
+            }
+        }
+        None
+
+        /*
+        // This is an alternative implementation that uses binary search. In
+        // some ad hoc experiments, like
+        //
+        //   smallishru=OpenSubtitles2018.raw.sample.smallish.ru
+        //   regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+        //
+        // I could not observe any improvement, and in fact, things seemed to
+        // be a bit slower.
+        self.ranges
+            .binary_search_by(|t| {
+                if t.end < byte {
+                    core::cmp::Ordering::Less
+                } else if t.start > byte {
+                    core::cmp::Ordering::Greater
+                } else {
+                    core::cmp::Ordering::Equal
+                }
+            })
+            .ok()
+            .map(|i| self.ranges[i].next)
+        */
+    }
+}
+
+/// A transition to another state, only if the given byte falls in the
+/// inclusive range specified.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+    /// The first byte of the range (inclusive).
+    pub start: u8,
+    /// The last byte of the range (inclusive).
+    pub end: u8,
+    /// The state to move to when the input byte is in `[start, end]`.
+    pub next: StateID,
+}
+
+impl Transition {
+    pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
+        haystack.get(at).map_or(false, |&b| self.matches_byte(b))
+    }
+
+    pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
+        unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
+    }
+
+    pub fn matches_byte(&self, byte: u8) -> bool {
+        self.start <= byte && byte <= self.end
+    }
+}
+
+impl fmt::Debug for Transition {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use crate::util::DebugByte;
+
+        let Transition { start, end, next } = *self;
+        if self.start == self.end {
+            write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
+        } else {
+            write!(
+                f,
+                "{:?}-{:?} => {:?}",
+                DebugByte(start),
+                DebugByte(end),
+                next.as_usize(),
+            )
+        }
+    }
+}
+
+/// A conditional NFA epsilon transition.
+///
+/// A simulation of the NFA can only move through this epsilon transition if
+/// the current position satisfies some look-around property. Some assertions
+/// are look-behind (StartLine, StartText), some assertions are look-ahead
+/// (EndLine, EndText) while other assertions are both look-behind and
+/// look-ahead (WordBoundary*).
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum Look {
+    /// The previous position is either `\n` or the current position is the
+    /// beginning of the haystack (i.e., at position `0`).
+    StartLine = 1 << 0,
+    /// The next position is either `\n` or the current position is the end of
+    /// the haystack (i.e., at position `haystack.len()`).
+    EndLine = 1 << 1,
+    /// The current position is the beginning of the haystack (i.e., at
+    /// position `0`).
+    StartText = 1 << 2,
+    /// The current position is the end of the haystack (i.e., at position
+    /// `haystack.len()`).
+    EndText = 1 << 3,
+    /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])`
+    /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only
+    /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
+    /// `i=haystack.len()`, then `is_word(n)=false`.
+    WordBoundaryUnicode = 1 << 4,
+    /// Same as for `WordBoundaryUnicode`, but requires that
+    /// `is_word(p) == is_word(n)`.
+    WordBoundaryUnicodeNegate = 1 << 5,
+    /// When tested at position `i`, where `p=haystack[i-1]` and
+    /// `n=haystack[i]`, this assertion passes if and only if `is_word(p)
+    /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if
+    /// `i=haystack.len()`, then `is_word(n)=false`.
+    WordBoundaryAscii = 1 << 6,
+    /// Same as for `WordBoundaryAscii`, but requires that
+    /// `is_word(p) == is_word(n)`.
+    ///
+    /// Note that it is possible for this assertion to match at positions that
+    /// split the UTF-8 encoding of a codepoint. For this reason, this may only
+    /// be used when UTF-8 mode is disabled in the regex syntax.
+    WordBoundaryAsciiNegate = 1 << 7,
+}
+
+impl Look {
+    /// Returns true if and only if this assertion is satisfied at position
+    /// `at` in `bytes`.
+    ///
+    /// `at` is expected to be in the range `0..=bytes.len()`. The line
+    /// assertions and the ASCII word boundary assertions index `bytes`
+    /// directly, and the negated Unicode word boundary assertion slices it,
+    /// so a bigger `at` causes a panic in those cases.
+    #[inline(always)]
+    pub fn matches(&self, bytes: &[u8], at: usize) -> bool {
+        match *self {
+            Look::StartLine => at == 0 || bytes[at - 1] == b'\n',
+            Look::EndLine => at == bytes.len() || bytes[at] == b'\n',
+            Look::StartText => at == 0,
+            Look::EndText => at == bytes.len(),
+            Look::WordBoundaryUnicode => {
+                let word_before = is_word_char_rev(bytes, at);
+                let word_after = is_word_char_fwd(bytes, at);
+                word_before != word_after
+            }
+            Look::WordBoundaryUnicodeNegate => {
+                // This is pretty subtle. Why do we need to do UTF-8 decoding
+                // here? Well... at time of writing, the is_word_char_{fwd,rev}
+                // routines will only return true if there is a valid UTF-8
+                // encoding of a "word" codepoint, and false in every other
+                // case (including invalid UTF-8). This means that in regions
+                // of invalid UTF-8 (which might be a subset of valid UTF-8!),
+                // it would result in \B matching. While this would be
+                // questionable in the context of truly invalid UTF-8, it is
+                // *certainly* wrong to report match boundaries that split the
+                // encoding of a codepoint. So to work around this, we ensure
+                // that we can decode a codepoint on either side of `at`. If
+                // either direction fails, then we don't permit \B to match at
+                // all.
+                //
+                // Now, this isn't exactly optimal from a perf perspective. We
+                // could try and detect this in is_word_char_{fwd,rev}, but
+                // it's not clear if it's worth it. \B is, after all, rarely
+                // used.
+                //
+                // And in particular, we do *not* have to do this with \b,
+                // because \b *requires* that at least one side of `at` be a
+                // "word" codepoint, which in turn implies one side of `at`
+                // must be valid UTF-8. This in turn implies that \b can never
+                // split a valid UTF-8 encoding of a codepoint. In the case
+                // where one side of `at` is truly invalid UTF-8 and the other
+                // side IS a word codepoint, then we want \b to match since it
+                // represents a valid UTF-8 boundary. It also makes sense. For
+                // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
+                let word_before = at > 0
+                    && match decode_last_utf8(&bytes[..at]) {
+                        None | Some(Err(_)) => return false,
+                        Some(Ok(_)) => is_word_char_rev(bytes, at),
+                    };
+                let word_after = at < bytes.len()
+                    && match decode_utf8(&bytes[at..]) {
+                        None | Some(Err(_)) => return false,
+                        Some(Ok(_)) => is_word_char_fwd(bytes, at),
+                    };
+                word_before == word_after
+            }
+            Look::WordBoundaryAscii => {
+                let word_before = at > 0 && is_word_byte(bytes[at - 1]);
+                let word_after = at < bytes.len() && is_word_byte(bytes[at]);
+                word_before != word_after
+            }
+            Look::WordBoundaryAsciiNegate => {
+                let word_before = at > 0 && is_word_byte(bytes[at - 1]);
+                let word_after = at < bytes.len() && is_word_byte(bytes[at]);
+                word_before == word_after
+            }
+        }
+    }
+
+    /// Create a look-around assertion from its corresponding integer (as
+    /// defined in `Look`). If the given integer does not correspond to any
+    /// assertion, then None is returned.
+    fn from_int(n: u8) -> Option<Look> {
+        match n {
+            0b0000_0001 => Some(Look::StartLine),
+            0b0000_0010 => Some(Look::EndLine),
+            0b0000_0100 => Some(Look::StartText),
+            0b0000_1000 => Some(Look::EndText),
+            0b0001_0000 => Some(Look::WordBoundaryUnicode),
+            0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate),
+            0b0100_0000 => Some(Look::WordBoundaryAscii),
+            0b1000_0000 => Some(Look::WordBoundaryAsciiNegate),
+            _ => None,
+        }
+    }
+
+    /// Flip the look-around assertion to its equivalent for reverse searches.
+    fn reversed(&self) -> Look {
+        match *self {
+            Look::StartLine => Look::EndLine,
+            Look::EndLine => Look::StartLine,
+            Look::StartText => Look::EndText,
+            Look::EndText => Look::StartText,
+            Look::WordBoundaryUnicode => Look::WordBoundaryUnicode,
+            Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate,
+            Look::WordBoundaryAscii => Look::WordBoundaryAscii,
+            Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate,
+        }
+    }
+
+    /// Split up the given byte classes into equivalence classes in a way that
+    /// is consistent with this look-around assertion.
+    fn add_to_byteset(&self, set: &mut ByteClassSet) {
+        match *self {
+            Look::StartText | Look::EndText => {}
+            Look::StartLine | Look::EndLine => {
+                set.set_range(b'\n', b'\n');
+            }
+            Look::WordBoundaryUnicode
+            | Look::WordBoundaryUnicodeNegate
+            | Look::WordBoundaryAscii
+            | Look::WordBoundaryAsciiNegate => {
+                // We need to mark all ranges of bytes whose pairs result in
+                // evaluating \b differently. This isn't technically correct
+                // for Unicode word boundaries, but DFAs can't handle those
+                // anyway, and thus, the byte classes don't need to either
+                // since they are themselves only used in DFAs.
+                let iswb = regex_syntax::is_word_byte;
+                let mut b1: u16 = 0;
+                let mut b2: u16;
+                while b1 <= 255 {
+                    b2 = b1 + 1;
+                    while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
+                        b2 += 1;
+                    }
+                    set.set_range(b1 as u8, (b2 - 1) as u8);
+                    b1 = b2;
+                }
+            }
+        }
+    }
+}
+
+/// LookSet is a memory-efficient set of look-around assertions. Callers may
+/// idempotently insert or remove any look-around assertion from a set.
+#[repr(transparent)]
+#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct LookSet {
+    /// The members of this set, stored as the bitwise OR of the integer
+    /// value (`look as u8`) of every assertion that has been inserted.
+    set: u8,
+}
+
+impl LookSet {
+    /// Return a LookSet from its representation.
+    pub(crate) fn from_repr(repr: u8) -> LookSet {
+        LookSet { set: repr }
+    }
+
+    /// Return a mutable LookSet from a mutable reference to its representation.
+    pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet {
+        // SAFETY: LookSet is repr(transparent) over its single u8 field, so
+        // `&mut u8` and `&mut LookSet` share the same layout.
+        unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) }
+    }
+
+    /// Return true if and only if this set is empty.
+    pub(crate) fn is_empty(&self) -> bool {
+        self.set == 0
+    }
+
+    /// Clears this set such that it has no assertions in it.
+    pub(crate) fn clear(&mut self) {
+        self.set = 0;
+    }
+
+    /// Insert the given look-around assertion into this set. If the assertion
+    /// already exists, then this is a no-op.
+    pub(crate) fn insert(&mut self, look: Look) {
+        self.set |= look as u8;
+    }
+
+    /// Remove the given look-around assertion from this set. If the assertion
+    /// is not in this set, then this is a no-op.
+    #[cfg(test)]
+    pub(crate) fn remove(&mut self, look: Look) {
+        self.set &= !(look as u8);
+    }
+
+    /// Return true if and only if the given assertion is in this set.
+    pub(crate) fn contains(&self, look: Look) -> bool {
+        (look as u8) & self.set != 0
+    }
+
+    /// Subtract the given `other` set from the `self` set and return a new
+    /// set.
+    pub(crate) fn subtract(&self, other: LookSet) -> LookSet {
+        LookSet { set: self.set & !other.set }
+    }
+
+    /// Intersect the given `other` set with the `self` set and return the
+    /// resulting set.
+    pub(crate) fn intersect(&self, other: LookSet) -> LookSet {
+        LookSet { set: self.set & other.set }
+    }
+}
+
+impl core::fmt::Debug for LookSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let mut members = vec![];
+        for i in 0..8 {
+            let look = match Look::from_int(1 << i) {
+                None => continue,
+                Some(look) => look,
+            };
+            if self.contains(look) {
+                members.push(look);
+            }
+        }
+        f.debug_tuple("LookSet").field(&members).finish()
+    }
+}
+
+/// An iterator over all pattern IDs in an NFA.
+pub struct PatternIter<'a> {
+    it: PatternIDIter,
+    /// We explicitly associate a lifetime with this iterator even though we
+    /// don't actually borrow anything from the NFA. We do this for backward
+    /// compatibility purposes. If we ever do need to borrow something from
+    /// the NFA, then we can do so and drop this marker without breaking
+    /// the public API.
+    _marker: core::marker::PhantomData<&'a ()>,
+}
+
+impl<'a> Iterator for PatternIter<'a> {
+    type Item = PatternID;
+
+    fn next(&mut self) -> Option<PatternID> {
+        self.it.next()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // TODO: Replace tests using DFA with NFA matching engine once implemented.
+    use crate::dfa::{dense, Automaton};
+
+    #[test]
+    fn always_match() {
+        let nfa = NFA::always_match();
+        let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap();
+        let find = |input, start, end| {
+            dfa.find_leftmost_fwd_at(None, None, input, start, end)
+                .unwrap()
+                .map(|m| m.offset())
+        };
+
+        assert_eq!(Some(0), find(b"", 0, 0));
+        assert_eq!(Some(0), find(b"a", 0, 1));
+        assert_eq!(Some(1), find(b"a", 1, 1));
+        assert_eq!(Some(0), find(b"ab", 0, 2));
+        assert_eq!(Some(1), find(b"ab", 1, 2));
+        assert_eq!(Some(2), find(b"ab", 2, 2));
+    }
+
+    #[test]
+    fn never_match() {
+        let nfa = NFA::never_match();
+        let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap();
+        let find = |input, start, end| {
+            dfa.find_leftmost_fwd_at(None, None, input, start, end)
+                .unwrap()
+                .map(|m| m.offset())
+        };
+
+        assert_eq!(None, find(b"", 0, 0));
+        assert_eq!(None, find(b"a", 0, 1));
+        assert_eq!(None, find(b"a", 1, 1));
+        assert_eq!(None, find(b"ab", 0, 2));
+        assert_eq!(None, find(b"ab", 1, 2));
+        assert_eq!(None, find(b"ab", 2, 2));
+    }
+
+    #[test]
+    fn look_set() {
+        let mut f = LookSet::default();
+        assert!(!f.contains(Look::StartText));
+        assert!(!f.contains(Look::EndText));
+        assert!(!f.contains(Look::StartLine));
+        assert!(!f.contains(Look::EndLine));
+        assert!(!f.contains(Look::WordBoundaryUnicode));
+        assert!(!f.contains(Look::WordBoundaryUnicodeNegate));
+        assert!(!f.contains(Look::WordBoundaryAscii));
+        assert!(!f.contains(Look::WordBoundaryAsciiNegate));
+
+        f.insert(Look::StartText);
+        assert!(f.contains(Look::StartText));
+        f.remove(Look::StartText);
+        assert!(!f.contains(Look::StartText));
+
+        f.insert(Look::EndText);
+        assert!(f.contains(Look::EndText));
+        f.remove(Look::EndText);
+        assert!(!f.contains(Look::EndText));
+
+        f.insert(Look::StartLine);
+        assert!(f.contains(Look::StartLine));
+        f.remove(Look::StartLine);
+        assert!(!f.contains(Look::StartLine));
+
+        f.insert(Look::EndLine);
+        assert!(f.contains(Look::EndLine));
+        f.remove(Look::EndLine);
+        assert!(!f.contains(Look::EndLine));
+
+        f.insert(Look::WordBoundaryUnicode);
+        assert!(f.contains(Look::WordBoundaryUnicode));
+        f.remove(Look::WordBoundaryUnicode);
+        assert!(!f.contains(Look::WordBoundaryUnicode));
+
+        f.insert(Look::WordBoundaryUnicodeNegate);
+        assert!(f.contains(Look::WordBoundaryUnicodeNegate));
+        f.remove(Look::WordBoundaryUnicodeNegate);
+        assert!(!f.contains(Look::WordBoundaryUnicodeNegate));
+
+        f.insert(Look::WordBoundaryAscii);
+        assert!(f.contains(Look::WordBoundaryAscii));
+        f.remove(Look::WordBoundaryAscii);
+        assert!(!f.contains(Look::WordBoundaryAscii));
+
+        f.insert(Look::WordBoundaryAsciiNegate);
+        assert!(f.contains(Look::WordBoundaryAsciiNegate));
+        f.remove(Look::WordBoundaryAsciiNegate);
+        assert!(!f.contains(Look::WordBoundaryAsciiNegate));
+    }
+
+    #[test]
+    fn look_matches_start_line() {
+        let look = Look::StartLine;
+
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("\n"), 0));
+        assert!(look.matches(B("\n"), 1));
+        assert!(look.matches(B("a"), 0));
+        assert!(look.matches(B("\na"), 1));
+
+        assert!(!look.matches(B("a"), 1));
+        assert!(!look.matches(B("a\na"), 1));
+    }
+
+    #[test]
+    fn look_matches_end_line() {
+        let look = Look::EndLine;
+
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("\n"), 1));
+        assert!(look.matches(B("\na"), 0));
+        assert!(look.matches(B("\na"), 2));
+        assert!(look.matches(B("a\na"), 1));
+
+        assert!(!look.matches(B("a"), 0));
+        assert!(!look.matches(B("\na"), 1));
+        assert!(!look.matches(B("a\na"), 0));
+        assert!(!look.matches(B("a\na"), 2));
+    }
+
+    #[test]
+    fn look_matches_start_text() {
+        let look = Look::StartText;
+
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("\n"), 0));
+        assert!(look.matches(B("a"), 0));
+
+        assert!(!look.matches(B("\n"), 1));
+        assert!(!look.matches(B("\na"), 1));
+        assert!(!look.matches(B("a"), 1));
+        assert!(!look.matches(B("a\na"), 1));
+    }
+
+    #[test]
+    fn look_matches_end_text() {
+        let look = Look::EndText;
+
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("\n"), 1));
+        assert!(look.matches(B("\na"), 2));
+
+        assert!(!look.matches(B("\na"), 0));
+        assert!(!look.matches(B("a\na"), 1));
+        assert!(!look.matches(B("a"), 0));
+        assert!(!look.matches(B("\na"), 1));
+        assert!(!look.matches(B("a\na"), 0));
+        assert!(!look.matches(B("a\na"), 2));
+    }
+
+    #[test]
+    fn look_matches_word_unicode() {
+        let look = Look::WordBoundaryUnicode;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(look.matches(B("a"), 0));
+        assert!(look.matches(B("a"), 1));
+        assert!(look.matches(B("a "), 1));
+        assert!(look.matches(B(" a "), 1));
+        assert!(look.matches(B(" a "), 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint.
+        assert!(look.matches(B("𝛃"), 0));
+        assert!(look.matches(B("𝛃"), 4));
+        assert!(look.matches(B("𝛃 "), 4));
+        assert!(look.matches(B(" 𝛃 "), 1));
+        assert!(look.matches(B(" 𝛃 "), 5));
+
+        // Unicode word boundaries between non-ASCII codepoints.
+        assert!(look.matches(B("𝛃𐆀"), 0));
+        assert!(look.matches(B("𝛃𐆀"), 4));
+
+        // Non word boundaries for ASCII.
+        assert!(!look.matches(B(""), 0));
+        assert!(!look.matches(B("ab"), 1));
+        assert!(!look.matches(B("a "), 2));
+        assert!(!look.matches(B(" a "), 0));
+        assert!(!look.matches(B(" a "), 3));
+
+        // Non word boundaries with a non-ASCII codepoint.
+        assert!(!look.matches(B("𝛃b"), 4));
+        assert!(!look.matches(B("𝛃 "), 5));
+        assert!(!look.matches(B(" 𝛃 "), 0));
+        assert!(!look.matches(B(" 𝛃 "), 6));
+        assert!(!look.matches(B("𝛃"), 1));
+        assert!(!look.matches(B("𝛃"), 2));
+        assert!(!look.matches(B("𝛃"), 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(!look.matches(B("𝛃𐆀"), 1));
+        assert!(!look.matches(B("𝛃𐆀"), 2));
+        assert!(!look.matches(B("𝛃𐆀"), 3));
+        assert!(!look.matches(B("𝛃𐆀"), 5));
+        assert!(!look.matches(B("𝛃𐆀"), 6));
+        assert!(!look.matches(B("𝛃𐆀"), 7));
+        assert!(!look.matches(B("𝛃𐆀"), 8));
+    }
+
+    #[test]
+    fn look_matches_word_ascii() {
+        let look = Look::WordBoundaryAscii;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(look.matches(B("a"), 0));
+        assert!(look.matches(B("a"), 1));
+        assert!(look.matches(B("a "), 1));
+        assert!(look.matches(B(" a "), 1));
+        assert!(look.matches(B(" a "), 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. Since this is
+        // an ASCII word boundary, none of these match.
+        assert!(!look.matches(B("𝛃"), 0));
+        assert!(!look.matches(B("𝛃"), 4));
+        assert!(!look.matches(B("𝛃 "), 4));
+        assert!(!look.matches(B(" 𝛃 "), 1));
+        assert!(!look.matches(B(" 𝛃 "), 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. Again, since
+        // this is an ASCII word boundary, none of these match.
+        assert!(!look.matches(B("𝛃𐆀"), 0));
+        assert!(!look.matches(B("𝛃𐆀"), 4));
+
+        // Non word boundaries for ASCII.
+        assert!(!look.matches(B(""), 0));
+        assert!(!look.matches(B("ab"), 1));
+        assert!(!look.matches(B("a "), 2));
+        assert!(!look.matches(B(" a "), 0));
+        assert!(!look.matches(B(" a "), 3));
+
+        // With a non-ASCII codepoint. Only 𝛃b at 4 is an ASCII word boundary.
+        assert!(look.matches(B("𝛃b"), 4));
+        assert!(!look.matches(B("𝛃 "), 5));
+        assert!(!look.matches(B(" 𝛃 "), 0));
+        assert!(!look.matches(B(" 𝛃 "), 6));
+        assert!(!look.matches(B("𝛃"), 1));
+        assert!(!look.matches(B("𝛃"), 2));
+        assert!(!look.matches(B("𝛃"), 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(!look.matches(B("𝛃𐆀"), 1));
+        assert!(!look.matches(B("𝛃𐆀"), 2));
+        assert!(!look.matches(B("𝛃𐆀"), 3));
+        assert!(!look.matches(B("𝛃𐆀"), 5));
+        assert!(!look.matches(B("𝛃𐆀"), 6));
+        assert!(!look.matches(B("𝛃𐆀"), 7));
+        assert!(!look.matches(B("𝛃𐆀"), 8));
+    }
+
+    #[test]
+    fn look_matches_word_unicode_negate() {
+        let look = Look::WordBoundaryUnicodeNegate;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(!look.matches(B("a"), 0));
+        assert!(!look.matches(B("a"), 1));
+        assert!(!look.matches(B("a "), 1));
+        assert!(!look.matches(B(" a "), 1));
+        assert!(!look.matches(B(" a "), 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint.
+        assert!(!look.matches(B("𝛃"), 0));
+        assert!(!look.matches(B("𝛃"), 4));
+        assert!(!look.matches(B("𝛃 "), 4));
+        assert!(!look.matches(B(" 𝛃 "), 1));
+        assert!(!look.matches(B(" 𝛃 "), 5));
+
+        // Unicode word boundaries between non-ASCII codepoints.
+        assert!(!look.matches(B("𝛃𐆀"), 0));
+        assert!(!look.matches(B("𝛃𐆀"), 4));
+
+        // Non word boundaries for ASCII.
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("ab"), 1));
+        assert!(look.matches(B("a "), 2));
+        assert!(look.matches(B(" a "), 0));
+        assert!(look.matches(B(" a "), 3));
+
+        // Non word boundaries with a non-ASCII codepoint.
+        assert!(look.matches(B("𝛃b"), 4));
+        assert!(look.matches(B("𝛃 "), 5));
+        assert!(look.matches(B(" 𝛃 "), 0));
+        assert!(look.matches(B(" 𝛃 "), 6));
+        // These don't match because they could otherwise return an offset that
+        // splits the UTF-8 encoding of a codepoint.
+        assert!(!look.matches(B("𝛃"), 1));
+        assert!(!look.matches(B("𝛃"), 2));
+        assert!(!look.matches(B("𝛃"), 3));
+
+        // Non word boundaries with non-ASCII codepoints. These also don't
+        // match because they could otherwise return an offset that splits the
+        // UTF-8 encoding of a codepoint.
+        assert!(!look.matches(B("𝛃𐆀"), 1));
+        assert!(!look.matches(B("𝛃𐆀"), 2));
+        assert!(!look.matches(B("𝛃𐆀"), 3));
+        assert!(!look.matches(B("𝛃𐆀"), 5));
+        assert!(!look.matches(B("𝛃𐆀"), 6));
+        assert!(!look.matches(B("𝛃𐆀"), 7));
+        // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
+        // of the haystack. So the "end" of the haystack isn't a word and 𐆀
+        // isn't a word, thus, \B matches.
+        assert!(look.matches(B("𝛃𐆀"), 8));
+    }
+
+    #[test]
+    fn look_matches_word_ascii_negate() {
+        let look = Look::WordBoundaryAsciiNegate;
+
+        // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
+        // \xF0\x90\x86\x80 = 𐆀 (not in \w)
+
+        // Simple ASCII word boundaries.
+        assert!(!look.matches(B("a"), 0));
+        assert!(!look.matches(B("a"), 1));
+        assert!(!look.matches(B("a "), 1));
+        assert!(!look.matches(B(" a "), 1));
+        assert!(!look.matches(B(" a "), 2));
+
+        // Unicode word boundaries with a non-ASCII codepoint. Since this is
+        // a negated ASCII word boundary, all of these match.
+        assert!(look.matches(B("𝛃"), 0));
+        assert!(look.matches(B("𝛃"), 4));
+        assert!(look.matches(B("𝛃 "), 4));
+        assert!(look.matches(B(" 𝛃 "), 1));
+        assert!(look.matches(B(" 𝛃 "), 5));
+
+        // Unicode word boundaries between non-ASCII codepoints. Again, since
+        // this is a negated ASCII word boundary, all of these match.
+        assert!(look.matches(B("𝛃𐆀"), 0));
+        assert!(look.matches(B("𝛃𐆀"), 4));
+
+        // Non word boundaries for ASCII.
+        assert!(look.matches(B(""), 0));
+        assert!(look.matches(B("ab"), 1));
+        assert!(look.matches(B("a "), 2));
+        assert!(look.matches(B(" a "), 0));
+        assert!(look.matches(B(" a "), 3));
+
+        // With a non-ASCII codepoint. Only 𝛃b at 4 fails: it is an ASCII \b.
+        assert!(!look.matches(B("𝛃b"), 4));
+        assert!(look.matches(B("𝛃 "), 5));
+        assert!(look.matches(B(" 𝛃 "), 0));
+        assert!(look.matches(B(" 𝛃 "), 6));
+        assert!(look.matches(B("𝛃"), 1));
+        assert!(look.matches(B("𝛃"), 2));
+        assert!(look.matches(B("𝛃"), 3));
+
+        // Non word boundaries with non-ASCII codepoints.
+        assert!(look.matches(B("𝛃𐆀"), 1));
+        assert!(look.matches(B("𝛃𐆀"), 2));
+        assert!(look.matches(B("𝛃𐆀"), 3));
+        assert!(look.matches(B("𝛃𐆀"), 5));
+        assert!(look.matches(B("𝛃𐆀"), 6));
+        assert!(look.matches(B("𝛃𐆀"), 7));
+        assert!(look.matches(B("𝛃𐆀"), 8));
+    }
+
+    fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] {
+        string.as_ref()
+    }
+}
diff --git a/src/nfa/thompson/pikevm.rs b/src/nfa/thompson/pikevm.rs

new file mode 100644 (file)

index 0000000..7572f9f
--- /dev/null
+++ b/src/nfa/thompson/pikevm.rs
@@ -0,0 +1,554 @@
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+    nfa::thompson::{self, Error, State, NFA},
+    util::{
+        id::{PatternID, StateID},
+        matchtypes::MultiMatch,
+        sparse_set::SparseSet,
+    },
+};
+
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    anchored: Option<bool>,
+    utf8: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default PikeVM configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    pub fn anchored(mut self, yes: bool) -> Config {
+        self.anchored = Some(yes);
+        self
+    }
+
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    pub fn get_anchored(&self) -> bool {
+        self.anchored.unwrap_or(false)
+    }
+
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            anchored: o.anchored.or(self.anchored),
+            utf8: o.utf8.or(self.utf8),
+        }
+    }
+}
+
+/// A builder for a PikeVM.
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+    thompson: thompson::Builder,
+}
+
+impl Builder {
+    /// Create a new PikeVM builder with its default configuration.
+    pub fn new() -> Builder {
+        Builder {
+            config: Config::default(),
+            thompson: thompson::Builder::new(),
+        }
+    }
+
+    pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> {
+        self.build_many(&[pattern])
+    }
+
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<PikeVM, Error> {
+        let nfa = self.thompson.build_many(patterns)?;
+        self.build_from_nfa(Arc::new(nfa))
+    }
+
+    pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> {
+        // TODO: Check that this is correct.
+        // if !cfg!(all(
+        // feature = "dfa",
+        // feature = "syntax",
+        // feature = "unicode-perl"
+        // )) {
+        if !cfg!(feature = "syntax") {
+            if nfa.has_word_boundary_unicode() {
+                return Err(Error::unicode_word_unavailable());
+            }
+        }
+        Ok(PikeVM { config: self.config, nfa })
+    }
+
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and
+    /// multi-line mode.
+    ///
+    /// These settings only apply when constructing a PikeVM directly from a
+    /// pattern.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.thompson.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+    ///
+    /// This permits setting things like whether additional time should be
+    /// spent shrinking the size of the NFA.
+    ///
+    /// These settings only apply when constructing a PikeVM directly from a
+    /// pattern.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.thompson.configure(config);
+        self
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct PikeVM {
+    config: Config,
+    nfa: Arc<NFA>,
+}
+
+impl PikeVM {
+    pub fn new(pattern: &str) -> Result<PikeVM, Error> {
+        PikeVM::builder().build(pattern)
+    }
+
+    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> {
+        PikeVM::builder().build_many(patterns)
+    }
+
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+
+    pub fn create_cache(&self) -> Cache {
+        Cache::new(self.nfa())
+    }
+
+    pub fn create_captures(&self) -> Captures {
+        Captures::new(self.nfa())
+    }
+
+    pub fn nfa(&self) -> &Arc<NFA> {
+        &self.nfa
+    }
+
+    pub fn find_leftmost_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 'c, 't> {
+        FindLeftmostMatches::new(self, cache, haystack)
+    }
+
+    // BREADCRUMBS:
+    //
+    // 1) Don't forget about prefilters.
+    //
+    // 2) Consider the case of using a PikeVM with an NFA that has Capture
+    // states, but where we don't want to track capturing groups (other than
+    // group 0). This potentially saves a lot of copying around and what not. I
+    // believe the current regex crate does this, for example. The interesting
+    // bit here is how to handle the case of multiple patterns...
+    //
+    // 3) Permit the caller to specify a pattern ID to run an anchored-only
+    // search on.
+    //
+    // 4) How to do overlapping? The way multi-regex support works in the regex
+    // crate currently is to run the PikeVM until either we reach the end of
+    // the haystack or when we know all regexes have matched. The latter case
+    // is probably quite rare, so the common case is likely that we're always
+    // searching the entire input. The question is: can we emulate that with
+    // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then
+    // all we need to do is provide an overlapping API on the PikeVM that
+    // roughly matches the ones we provide on DFAs. For those APIs, the only
+    // thing they need over non-overlapping APIs is "caller state." For DFAs,
+    // the caller state is simple: it contains the last state visited and the
+    // last match reported. For the PikeVM (and NFAs in general), the "last
+    // state" is actually a *set* of NFA states. So I think what happens here
+    // is that we can just force the `Cache` to subsume this role. We'll still
+    // need some additional state to track the last match reported though.
+    // Because when two or more patterns match at the same location, we need a
+    // way to know to iterate over them. Although maybe it's not match index we
+    // need, but the state index of the last NFA state processed in the cache.
+    // Then we just pick up where we left off. There might be another match
+    // state, in which case, we report it.
+
+    pub fn find_leftmost_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        caps: &mut Captures,
+    ) -> Option<MultiMatch> {
+        let anchored =
+            self.config.get_anchored() || self.nfa.is_always_start_anchored();
+        let mut at = start;
+        let mut matched_pid = None;
+        cache.clear();
+        'LOOP: loop {
+            if cache.clist.set.is_empty() {
+                if matched_pid.is_some() || (anchored && at > start) {
+                    break 'LOOP;
+                }
+                // TODO: prefilter
+            }
+            if (!anchored && matched_pid.is_none())
+                || cache.clist.set.is_empty()
+            {
+                self.epsilon_closure(
+                    &mut cache.clist,
+                    &mut caps.slots,
+                    &mut cache.stack,
+                    self.nfa.start_anchored(),
+                    haystack,
+                    at,
+                );
+            }
+            for i in 0..cache.clist.set.len() {
+                let sid = cache.clist.set.get(i);
+                let pid = match self.step(
+                    &mut cache.nlist,
+                    &mut caps.slots,
+                    cache.clist.caps(sid),
+                    &mut cache.stack,
+                    sid,
+                    haystack,
+                    at,
+                ) {
+                    None => continue,
+                    Some(pid) => pid,
+                };
+                matched_pid = Some(pid);
+                break;
+            }
+            if at >= end {
+                break;
+            }
+            at += 1;
+            cache.swap();
+            cache.nlist.set.clear();
+        }
+        matched_pid.map(|pid| {
+            let slots = self.nfa.pattern_slots(pid);
+            let (start, end) = (slots.start, slots.start + 1);
+            MultiMatch::new(
+                pid,
+                caps.slots[start].unwrap(),
+                caps.slots[end].unwrap(),
+            )
+        })
+    }
+
+    #[inline(always)]
+    fn step(
+        &self,
+        nlist: &mut Threads,
+        slots: &mut [Slot],
+        thread_caps: &mut [Slot],
+        stack: &mut Vec<FollowEpsilon>,
+        sid: StateID,
+        haystack: &[u8],
+        at: usize,
+    ) -> Option<PatternID> {
+        match *self.nfa.state(sid) {
+            State::Fail
+            | State::Look { .. }
+            | State::Union { .. }
+            | State::Capture { .. } => None,
+            State::Range { ref range } => {
+                if range.matches(haystack, at) {
+                    self.epsilon_closure(
+                        nlist,
+                        thread_caps,
+                        stack,
+                        range.next,
+                        haystack,
+                        at + 1,
+                    );
+                }
+                None
+            }
+            State::Sparse(ref sparse) => {
+                if let Some(next) = sparse.matches(haystack, at) {
+                    self.epsilon_closure(
+                        nlist,
+                        thread_caps,
+                        stack,
+                        next,
+                        haystack,
+                        at + 1,
+                    );
+                }
+                None
+            }
+            State::Match { id } => {
+                slots.copy_from_slice(thread_caps);
+                Some(id)
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn epsilon_closure(
+        &self,
+        nlist: &mut Threads,
+        thread_caps: &mut [Slot],
+        stack: &mut Vec<FollowEpsilon>,
+        sid: StateID,
+        haystack: &[u8],
+        at: usize,
+    ) {
+        stack.push(FollowEpsilon::StateID(sid));
+        while let Some(frame) = stack.pop() {
+            match frame {
+                FollowEpsilon::StateID(sid) => {
+                    self.epsilon_closure_step(
+                        nlist,
+                        thread_caps,
+                        stack,
+                        sid,
+                        haystack,
+                        at,
+                    );
+                }
+                FollowEpsilon::Capture { slot, pos } => {
+                    thread_caps[slot] = pos;
+                }
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn epsilon_closure_step(
+        &self,
+        nlist: &mut Threads,
+        thread_caps: &mut [Slot],
+        stack: &mut Vec<FollowEpsilon>,
+        mut sid: StateID,
+        haystack: &[u8],
+        at: usize,
+    ) {
+        loop {
+            if !nlist.set.insert(sid) {
+                return;
+            }
+            match *self.nfa.state(sid) {
+                State::Fail
+                | State::Range { .. }
+                | State::Sparse { .. }
+                | State::Match { .. } => {
+                    let t = &mut nlist.caps(sid);
+                    t.copy_from_slice(thread_caps);
+                    return;
+                }
+                State::Look { look, next } => {
+                    if !look.matches(haystack, at) {
+                        return;
+                    }
+                    sid = next;
+                }
+                State::Union { ref alternates } => {
+                    sid = match alternates.get(0) {
+                        None => return,
+                        Some(&sid) => sid,
+                    };
+                    stack.extend(
+                        alternates[1..]
+                            .iter()
+                            .copied()
+                            .rev()
+                            .map(FollowEpsilon::StateID),
+                    );
+                }
+                State::Capture { next, slot } => {
+                    if slot < thread_caps.len() {
+                        stack.push(FollowEpsilon::Capture {
+                            slot,
+                            pos: thread_caps[slot],
+                        });
+                        thread_caps[slot] = Some(at);
+                    }
+                    sid = next;
+                }
+            }
+        }
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. The underlying PikeVM search returns an `Option`, never an error.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindLeftmostMatches<'r, 'c, 't> {
+    vm: &'r PikeVM,
+    cache: &'c mut Cache,
+    // scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    last_end: usize,
+    last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
+    fn new(
+        vm: &'r PikeVM,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 'c, 't> {
+        FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
+    // type Item = Captures;
+    type Item = MultiMatch;
+
+    // fn next(&mut self) -> Option<Captures> {
+    fn next(&mut self) -> Option<MultiMatch> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let mut caps = self.vm.create_captures();
+        let m = self.vm.find_leftmost_at(
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+            &mut caps,
+        )?;
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.vm.config.get_utf8() {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(m)
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Captures {
+    slots: Vec<Slot>,
+}
+
+impl Captures {
+    pub fn new(nfa: &NFA) -> Captures {
+        Captures { slots: vec![None; nfa.capture_slot_len()] }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Cache {
+    stack: Vec<FollowEpsilon>,
+    clist: Threads,
+    nlist: Threads,
+}
+
+type Slot = Option<usize>;
+
+#[derive(Clone, Debug)]
+struct Threads {
+    set: SparseSet,
+    caps: Vec<Slot>,
+    slots_per_thread: usize,
+}
+
+#[derive(Clone, Debug)]
+enum FollowEpsilon {
+    StateID(StateID),
+    Capture { slot: usize, pos: Slot },
+}
+
+impl Cache {
+    pub fn new(nfa: &NFA) -> Cache {
+        Cache {
+            stack: vec![],
+            clist: Threads::new(nfa),
+            nlist: Threads::new(nfa),
+        }
+    }
+
+    fn clear(&mut self) {
+        self.stack.clear();
+        self.clist.set.clear();
+        self.nlist.set.clear();
+    }
+
+    fn swap(&mut self) {
+        core::mem::swap(&mut self.clist, &mut self.nlist);
+    }
+}
+
+impl Threads {
+    fn new(nfa: &NFA) -> Threads {
+        let mut threads = Threads {
+            set: SparseSet::new(0),
+            caps: vec![],
+            slots_per_thread: 0,
+        };
+        threads.resize(nfa);
+        threads
+    }
+
+    fn resize(&mut self, nfa: &NFA) {
+        if nfa.states().len() == self.set.capacity() {
+            return;
+        }
+        self.slots_per_thread = nfa.capture_slot_len();
+        self.set.resize(nfa.states().len());
+        self.caps.resize(self.slots_per_thread * nfa.states().len(), None);
+    }
+
+    fn caps(&mut self, sid: StateID) -> &mut [Slot] {
+        let i = sid.as_usize() * self.slots_per_thread;
+        &mut self.caps[i..i + self.slots_per_thread]
+    }
+}
diff --git a/src/nfa/thompson/range_trie.rs b/src/nfa/thompson/range_trie.rs

new file mode 100644 (file)

index 0000000..92f36ce
--- /dev/null
+++ b/src/nfa/thompson/range_trie.rs
@@ -0,0 +1,1051 @@
+// I've called the primary data structure in this module a "range trie." As far
+// as I can tell, there is no prior art on a data structure like this, however,
+// it's likely someone somewhere has built something like it. Searching for
+// "range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+// but it does not appear relevant.
+//
+// The range trie is just like a trie in that it is a special case of a
+// deterministic finite state machine. It has states and each state has a set
+// of transitions to other states. It is acyclic, and, like a normal trie,
+// it makes no attempt to reuse common suffixes among its elements. The key
+// difference between a normal trie and a range trie below is that a range trie
+// operates on *contiguous sequences* of bytes instead of singleton bytes.
+// One could say that our alphabet is ranges of bytes instead of bytes
+// themselves, except a key part of range trie construction is splitting ranges
+// apart to ensure there is at most one transition that can be taken for any
+// byte in a given state.
+//
+// I've tried to explain the details of how the range trie works below, so
+// for now, we are left with trying to understand what problem we're trying to
+// solve. Which is itself fairly involved!
+//
+// At the highest level, here's what we want to do. We want to convert a
+// sequence of Unicode codepoints into a finite state machine whose transitions
+// are over *bytes* and *not* Unicode codepoints. We want this because it makes
+// said finite state machines much smaller and much faster to execute. As a
+// simple example, consider a byte oriented automaton for all Unicode scalar
+// values (0x00 through 0x10FFFF, not including surrogate codepoints):
+//
+//     [00-7F]
+//     [C2-DF][80-BF]
+//     [E0-E0][A0-BF][80-BF]
+//     [E1-EC][80-BF][80-BF]
+//     [ED-ED][80-9F][80-BF]
+//     [EE-EF][80-BF][80-BF]
+//     [F0-F0][90-BF][80-BF][80-BF]
+//     [F1-F3][80-BF][80-BF][80-BF]
+//     [F4-F4][80-8F][80-BF][80-BF]
+//
+// (These byte ranges are generated via the regex-syntax::utf8 module, which
+// was based on Russ Cox's code in RE2, which was in turn based on Ken
+// Thompson's implementation of the same idea in his Plan9 implementation of
+// grep.)
+//
+// It should be fairly straight-forward to see how one could compile this into
+// a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+// build a trie from this fairly easily. The problem comes when your initial
+// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+// represented by '\w' contains only a tenth of the codepoints that
+// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+// as we did above, the list would stretch to 892 entries! This turns into
+// quite a large NFA with a few thousand states. Turning this beast into a DFA
+// takes quite a bit of time. We are thus left with trying to trim down the
+// number of states we produce as early as possible.
+//
+// One approach (used by RE2 and still by the regex crate, at time of writing)
+// is to try to find common suffixes while building NFA states for the above
+// and reuse them. This is very cheap to do and one can control precisely how
+// much extra memory you want to use for the cache.
+//
+// Another approach, however, is to reuse an algorithm for constructing a
+// *minimal* DFA from a sorted sequence of inputs. I don't want to go into
+// the full details here, but I explain it in more depth in my blog post on
+// FSTs[1]. Note that the algorithm was not invented by me, but was published
+// in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+// Minimal Acyclic Finite-State Automata."[2] Like the suffix cache approach
+// above, it is also possible to control the amount of extra memory one uses,
+// although this usually comes with the cost of sacrificing true minimality.
+// (But it's typically close enough with a reasonably sized cache of states.)
+//
+// The catch is that Daciuk's algorithm only works if you add your keys in
+// lexicographic ascending order. In our case, since we're dealing with ranges,
+// we also need the additional requirement that ranges are either equivalent
+// or do not overlap at all. For example, if one were given the following byte
+// ranges:
+//
+//     [BC-BF][80-BF]
+//     [BC-BF][90-BF]
+//
+// Then Daciuk's algorithm would not work, since there is nothing to handle the
+// fact that the ranges overlap. They would need to be split apart. Thankfully,
+// Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
+// meets both of our requirements. (A proof for this eludes me, but it appears
+// true.)
+//
+// ... however, we would also like to be able to compile UTF-8 automata in
+// reverse. We want this because in order to find the starting location of a
+// match using a DFA, we need to run a second DFA---a reversed version of the
+// forward DFA---backwards to discover the match location. Unfortunately, if
+// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that
+// can overlap, even if they are sorted:
+//
+//     [00-7F]
+//     [80-BF][80-9F][ED-ED]
+//     [80-BF][80-BF][80-8F][F4-F4]
+//     [80-BF][80-BF][80-BF][F1-F3]
+//     [80-BF][80-BF][90-BF][F0-F0]
+//     [80-BF][80-BF][E1-EC]
+//     [80-BF][80-BF][EE-EF]
+//     [80-BF][A0-BF][E0-E0]
+//     [80-BF][C2-DF]
+//
+// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+// simple way to apply Daciuk's algorithm.
+//
+// And thus, the range trie was born. The range trie's only purpose is to take
+// sequences of byte ranges like the ones above, collect them into a trie and
+// then spit them out in a sorted fashion with no overlapping ranges. For
+// example, 0x00-0x10FFFF gets translated to:
+//
+//     [0-7F]
+//     [80-BF][80-9F][80-8F][F1-F3]
+//     [80-BF][80-9F][80-8F][F4]
+//     [80-BF][80-9F][90-BF][F0]
+//     [80-BF][80-9F][90-BF][F1-F3]
+//     [80-BF][80-9F][E1-EC]
+//     [80-BF][80-9F][ED]
+//     [80-BF][80-9F][EE-EF]
+//     [80-BF][A0-BF][80-8F][F1-F3]
+//     [80-BF][A0-BF][80-8F][F4]
+//     [80-BF][A0-BF][90-BF][F0]
+//     [80-BF][A0-BF][90-BF][F1-F3]
+//     [80-BF][A0-BF][E0]
+//     [80-BF][A0-BF][E1-EC]
+//     [80-BF][A0-BF][EE-EF]
+//     [80-BF][C2-DF]
+//
+// We've thus satisfied our requirements for running Daciuk's algorithm. All
+// sequences of ranges are sorted, and any corresponding ranges are either
+// exactly equivalent or non-overlapping.
+//
+// In effect, a range trie is building a DFA from a sequence of arbitrary
+// byte ranges. But it uses an algorithm custom tailored to its input, so it
+// is not as costly as traditional DFA construction. While it is still quite
+// a bit more costly than the forward case (which only needs Daciuk's
+// algorithm), it winds up saving a substantial amount of time if one is doing
+// a full DFA powerset construction later by virtue of producing a much much
+// smaller NFA.
+//
+// [1] - https://blog.burntsushi.net/transducers/
+// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+
+use core::{cell::RefCell, fmt, mem, ops::RangeInclusive, u32};
+
+use alloc::{format, string::String, vec, vec::Vec};
+
+use regex_syntax::utf8::Utf8Range;
+
+/// A smaller state ID means more effective use of the CPU cache and less
+/// time spent copying. The implementation below will panic if the state ID
+/// space is exhausted (see `RangeTrie::add_empty`), but in order for that to
+/// happen, the range trie itself would use well over 100GB of memory.
+/// Moreover, it's likely impossible for the state ID space to get that big.
+/// In fact, it's likely that even a u16 would be good enough here. But it's
+/// not quite clear how to prove this.
+type StateID = u32;
+
+/// There is only one final state in this trie. Every sequence of byte ranges
+/// added shares the same final state.
+///
+/// `RangeTrie::clear` always allocates this state first, which is what gives
+/// it ID 0.
+const FINAL: StateID = 0;
+
+/// The root state of the trie.
+///
+/// `RangeTrie::clear` always allocates this state second, which is what
+/// gives it ID 1.
+const ROOT: StateID = 1;
+
+/// A range trie represents an ordered set of sequences of bytes.
+///
+/// A range trie accepts as input a sequence of byte ranges and merges
+/// them into the existing set such that the trie can produce a sorted
+/// non-overlapping sequence of byte ranges. The sequence emitted corresponds
+/// precisely to the sequence of bytes matched by the given keys, although the
+/// byte ranges themselves may be split at different boundaries.
+///
+/// The order complexity of this data structure seems difficult to analyze.
+/// If the size of a byte is held as a constant, then insertion is clearly
+/// O(n) where n is the number of byte ranges in the input key. However, if
+/// k=256 is our alphabet size, then insertion could be O(k^2 * n). In
+/// particular it seems possible for pathological inputs to cause insertion
+/// to do a lot of work. However, for what we use this data structure for,
+/// there should be no pathological inputs since the ultimate source is always
+/// a sorted set of Unicode scalar value ranges.
+///
+/// Internally, this trie is setup like a finite state machine. Note though
+/// that it is acyclic.
+#[derive(Clone)]
+pub struct RangeTrie {
+    /// The states in this trie. The first is always the shared final state.
+    /// The second is always the root state. Otherwise, there is no
+    /// particular order.
+    states: Vec<State>,
+    /// A free-list of states. When a range trie is cleared, all of its states
+    /// are added to this list. Creating a new state reuses states from this
+    /// list before allocating a new one.
+    free: Vec<State>,
+    /// A stack for traversing this trie to yield sequences of byte ranges in
+    /// lexicographic order.
+    ///
+    /// This lives in a `RefCell` because `iter` takes `&self` but needs to
+    /// mutate this scratch space.
+    iter_stack: RefCell<Vec<NextIter>>,
+    /// A buffer that stores the current sequence during iteration.
+    ///
+    /// This lives in a `RefCell` for the same reason as `iter_stack`.
+    iter_ranges: RefCell<Vec<Utf8Range>>,
+    /// A stack used for traversing the trie in order to (deeply) duplicate
+    /// a state. States are recursively duplicated when ranges are split.
+    dupe_stack: Vec<NextDupe>,
+    /// A stack used for traversing the trie during insertion of a new
+    /// sequence of byte ranges.
+    insert_stack: Vec<NextInsert>,
+}
+
+/// A single state in this trie.
+#[derive(Clone)]
+struct State {
+    /// A sorted sequence of non-overlapping transitions to other states. Each
+    /// transition corresponds to a single range of bytes.
+    ///
+    /// `State::find` binary searches this sequence, so the ordering must be
+    /// maintained by every insertion.
+    transitions: Vec<Transition>,
+}
+
+/// A transition is a single range of bytes. If a particular byte is in this
+/// range, then the corresponding machine may transition to the state pointed
+/// to by `next_id`.
+#[derive(Clone)]
+struct Transition {
+    /// The byte range.
+    range: Utf8Range,
+    /// The next state to transition to. When this is `FINAL`, the sequence of
+    /// ranges leading here is complete (this is the point at which
+    /// `RangeTrie::iter` invokes its callback).
+    next_id: StateID,
+}
+
+impl RangeTrie {
+    /// Create a new empty range trie.
+    pub fn new() -> RangeTrie {
+        let mut trie = RangeTrie {
+            states: vec![],
+            free: vec![],
+            iter_stack: RefCell::new(vec![]),
+            iter_ranges: RefCell::new(vec![]),
+            dupe_stack: vec![],
+            insert_stack: vec![],
+        };
+        trie.clear();
+        trie
+    }
+
+    /// Clear this range trie such that it is empty. Clearing a range trie
+    /// and reusing it can beneficial because this may reuse allocations.
+    pub fn clear(&mut self) {
+        self.free.extend(self.states.drain(..));
+        self.add_empty(); // final
+        self.add_empty(); // root
+    }
+
+    /// Iterate over all of the sequences of byte ranges in this trie, and
+    /// call the provided function for each sequence. Iteration occurs in
+    /// lexicographic order.
+    ///
+    /// # Errors
+    ///
+    /// If `f` returns an error, iteration stops immediately and that error
+    /// is returned.
+    ///
+    /// # Panics
+    ///
+    /// The scratch buffers used here live in `RefCell`s that stay mutably
+    /// borrowed for the whole traversal, so calling `iter` on the same trie
+    /// from within `f` panics.
+    pub fn iter<E, F: FnMut(&[Utf8Range]) -> Result<(), E>>(
+        &self,
+        mut f: F,
+    ) -> Result<(), E> {
+        // Both scratch buffers are reset up front, so leftovers from a
+        // previous traversal that ended early with an error are harmless.
+        let mut stack = self.iter_stack.borrow_mut();
+        stack.clear();
+        let mut ranges = self.iter_ranges.borrow_mut();
+        ranges.clear();
+
+        // We do iteration in a way that permits us to use a single buffer
+        // for our keys. We iterate in a depth first fashion, while being
+        // careful to expand our frontier as we move deeper in the trie.
+        stack.push(NextIter { state_id: ROOT, tidx: 0 });
+        while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() {
+            // This could be implemented more simply without an inner loop
+            // here, but at the cost of more stack pushes.
+            loop {
+                let state = self.state(state_id);
+                // If we've visited all transitions in this state, then pop
+                // back to the parent state.
+                if tidx >= state.transitions.len() {
+                    // Drop the range that led into this state. For the root
+                    // state, `ranges` is already empty and this is a no-op.
+                    ranges.pop();
+                    break;
+                }
+
+                let t = &state.transitions[tidx];
+                ranges.push(t.range);
+                if t.next_id == FINAL {
+                    // `ranges` now holds one complete sequence.
+                    f(&ranges)?;
+                    ranges.pop();
+                    tidx += 1;
+                } else {
+                    // Expand our frontier. Once we come back to this state
+                    // via the stack, start in on the next transition.
+                    stack.push(NextIter { state_id, tidx: tidx + 1 });
+                    // Otherwise, move to the first transition of the next
+                    // state.
+                    state_id = t.next_id;
+                    tidx = 0;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Inserts a new sequence of ranges into this trie.
+    ///
+    /// The sequence given must be non-empty and must not have a length
+    /// exceeding 4.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `ranges` is empty or if it contains more than 4 ranges.
+    pub fn insert(&mut self, ranges: &[Utf8Range]) {
+        assert!(!ranges.is_empty());
+        assert!(ranges.len() <= 4);
+
+        // Take the scratch stack out of `self` so that `self` can still be
+        // borrowed mutably while the stack is in use. It is put back at the
+        // end of this routine so that its allocation is reused.
+        let mut stack = mem::replace(&mut self.insert_stack, vec![]);
+        stack.clear();
+
+        stack.push(NextInsert::new(ROOT, ranges));
+        while let Some(next) = stack.pop() {
+            let (state_id, ranges) = (next.state_id(), next.ranges());
+            assert!(!ranges.is_empty());
+
+            let (mut new, rest) = (ranges[0], &ranges[1..]);
+
+            // i corresponds to the position of the existing transition on
+            // which we are operating. Typically, the result is to remove the
+            // transition and replace it with two or more new transitions
+            // corresponding to the partitions generated by splitting the
+            // 'new' with the ith transition's range.
+            let mut i = self.state(state_id).find(new);
+
+            // In this case, there is no overlap *and* the new range is greater
+            // than all existing ranges. So we can just add it to the end.
+            if i == self.state(state_id).transitions.len() {
+                let next_id = NextInsert::push(self, &mut stack, rest);
+                self.add_transition(state_id, new, next_id);
+                continue;
+            }
+
+            // The need for this loop is a bit subtle, but basically, after
+            // we've handled the partitions from our initial split, it's
+            // possible that there will be a partition leftover that overlaps
+            // with a subsequent transition. If so, then we have to repeat
+            // the split process again with the leftovers and that subsequent
+            // transition.
+            'OUTER: loop {
+                let old = self.state(state_id).transitions[i].clone();
+                let split = match Split::new(old.range, new) {
+                    Some(split) => split,
+                    None => {
+                        let next_id = NextInsert::push(self, &mut stack, rest);
+                        self.add_transition_at(i, state_id, new, next_id);
+                        // NOTE(review): this unlabeled `continue` restarts
+                        // 'OUTER (not the outer `while`), so the transition
+                        // just inserted at `i` is re-read as `old`.
+                        // Presumably that yields a single-partition split
+                        // and exits through the branch below, queueing `rest`
+                        // for `next_id` a second time; confirm whether
+                        // `break` was intended.
+                        continue;
+                    }
+                };
+                let splits = split.as_slice();
+                // If we only have one partition, then the ranges must be
+                // equivalent. There's nothing to do here for this state, so
+                // just move on to the next one.
+                if splits.len() == 1 {
+                    // ... but only if we have anything left to do.
+                    if !rest.is_empty() {
+                        stack.push(NextInsert::new(old.next_id, rest));
+                    }
+                    break;
+                }
+                // At this point, we know that 'split' is non-empty and there
+                // must be some overlap AND that the two ranges are not
+                // equivalent. Therefore, the existing range MUST be removed
+                // and split up somehow. Instead of actually doing the removal
+                // and then a subsequent insertion---with all the memory
+                // shuffling that entails---we simply overwrite the transition
+                // at position `i` for the first new transition we want to
+                // insert. After that, we're forced to do expensive inserts.
+                let mut first = true;
+                let mut add_trans =
+                    |trie: &mut RangeTrie, pos, from, range, to| {
+                        if first {
+                            trie.set_transition_at(pos, from, range, to);
+                            first = false;
+                        } else {
+                            trie.add_transition_at(pos, from, range, to);
+                        }
+                    };
+                for (j, &srange) in splits.iter().enumerate() {
+                    match srange {
+                        SplitRange::Old(r) => {
+                            // Deep clone the state pointed to by the ith
+                            // transition. This is always necessary since 'old'
+                            // is always coupled with at least a 'both'
+                            // partition. We don't want any new changes made
+                            // via the 'both' partition to impact the part of
+                            // the transition that doesn't overlap with the
+                            // new range.
+                            let dup_id = self.duplicate(old.next_id);
+                            add_trans(self, i, state_id, r, dup_id);
+                        }
+                        SplitRange::New(r) => {
+                            // This is a bit subtle, but if this happens to be
+                            // the last partition in our split, it is possible
+                            // that this overlaps with a subsequent transition.
+                            // If it does, then we must repeat the whole
+                            // splitting process over again with `r` and the
+                            // subsequent transition.
+                            {
+                                let trans = &self.state(state_id).transitions;
+                                if j + 1 == splits.len()
+                                    && i < trans.len()
+                                    && intersects(r, trans[i].range)
+                                {
+                                    new = r;
+                                    continue 'OUTER;
+                                }
+                            }
+
+                            // ... otherwise, setup exploration for a new
+                            // empty state and add a brand new transition for
+                            // this new range.
+                            let next_id =
+                                NextInsert::push(self, &mut stack, rest);
+                            add_trans(self, i, state_id, r, next_id);
+                        }
+                        SplitRange::Both(r) => {
+                            // Continue adding the remaining ranges on this
+                            // path and update the transition with the new
+                            // range.
+                            if !rest.is_empty() {
+                                stack.push(NextInsert::new(old.next_id, rest));
+                            }
+                            add_trans(self, i, state_id, r, old.next_id);
+                        }
+                    }
+                    i += 1;
+                }
+                // If we've reached this point, then we know that there are
+                // no subsequent transitions with any overlap. Therefore, we
+                // can stop processing this range and move on to the next one.
+                break;
+            }
+        }
+        self.insert_stack = stack;
+    }
+
+    pub fn add_empty(&mut self) -> StateID {
+        if self.states.len() as u64 > u32::MAX as u64 {
+            // This generally should not happen since a range trie is only
+            // ever used to compile a single sequence of Unicode scalar values.
+            // If we ever got to this point, we would, at *minimum*, be using
+            // 96GB in just the range trie alone.
+            panic!("too many sequences added to range trie");
+        }
+        let id = self.states.len() as StateID;
+        // If we have some free states available, then use them to avoid
+        // more allocations.
+        if let Some(mut state) = self.free.pop() {
+            state.clear();
+            self.states.push(state);
+        } else {
+            self.states.push(State { transitions: vec![] });
+        }
+        id
+    }
+
+    /// Performs a deep clone of the given state and returns the duplicate's
+    /// state ID.
+    ///
+    /// A "deep clone" in this context means that the state given along with
+    /// recursively all states that it points to are copied. Once complete,
+    /// the given state ID and the returned state ID share nothing.
+    ///
+    /// This is useful during range trie insertion when a new range overlaps
+    /// with an existing range that is bigger than the new one. The part
+    /// of the existing range that does *not* overlap with the new one is
+    /// duplicated so that adding the new range to the overlap doesn't disturb
+    /// the non-overlapping portion.
+    ///
+    /// There's one exception: if old_id is the final state, then it is not
+    /// duplicated and the same final state is returned. This is because all
+    /// final states in this trie are equivalent.
+    fn duplicate(&mut self, old_id: StateID) -> StateID {
+        if old_id == FINAL {
+            return FINAL;
+        }
+
+        let mut stack = mem::replace(&mut self.dupe_stack, vec![]);
+        stack.clear();
+
+        let new_id = self.add_empty();
+        // old_id is the state we're cloning and new_id is the ID of the
+        // duplicated state for old_id.
+        stack.push(NextDupe { old_id, new_id });
+        while let Some(NextDupe { old_id, new_id }) = stack.pop() {
+            for i in 0..self.state(old_id).transitions.len() {
+                let t = self.state(old_id).transitions[i].clone();
+                if t.next_id == FINAL {
+                    // All final states are the same, so there's no need to
+                    // duplicate it.
+                    self.add_transition(new_id, t.range, FINAL);
+                    continue;
+                }
+
+                let new_child_id = self.add_empty();
+                self.add_transition(new_id, t.range, new_child_id);
+                stack.push(NextDupe {
+                    old_id: t.next_id,
+                    new_id: new_child_id,
+                });
+            }
+        }
+        self.dupe_stack = stack;
+        new_id
+    }
+
+    /// Adds the given transition to the given state.
+    ///
+    /// Callers must ensure that all previous transitions in this state
+    /// are lexicographically smaller than the given range.
+    fn add_transition(
+        &mut self,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id)
+            .transitions
+            .push(Transition { range, next_id });
+    }
+
+    /// Like `add_transition`, except this inserts the transition just before
+    /// the ith transition.
+    fn add_transition_at(
+        &mut self,
+        i: usize,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id)
+            .transitions
+            .insert(i, Transition { range, next_id });
+    }
+
+    /// Overwrites the transition at position i with the given transition.
+    fn set_transition_at(
+        &mut self,
+        i: usize,
+        from_id: StateID,
+        range: Utf8Range,
+        next_id: StateID,
+    ) {
+        self.state_mut(from_id).transitions[i] = Transition { range, next_id };
+    }
+
+    /// Return an immutable borrow for the state with the given ID.
+    fn state(&self, id: StateID) -> &State {
+        &self.states[id as usize]
+    }
+
+    /// Return a mutable borrow for the state with the given ID.
+    fn state_mut(&mut self, id: StateID) -> &mut State {
+        &mut self.states[id as usize]
+    }
+}
+
+impl State {
+    /// Find the position at which the given range should be inserted in this
+    /// state.
+    ///
+    /// The position returned is always in the inclusive range
+    /// [0, transitions.len()]. If 'transitions.len()' is returned, then the
+    /// given range overlaps with no other range in this state *and* is greater
+    /// than all of them.
+    ///
+    /// For all other possible positions, the given range either overlaps
+    /// with the transition at that position or is otherwise less than it
+    /// with no overlap (and is greater than the previous transition). In the
+    /// former case, careful attention must be paid to inserting this range
+    /// as a new transition. In the latter case, the range can be inserted as
+    /// a new transition at the given position without disrupting any other
+    /// transitions.
+    fn find(&self, range: Utf8Range) -> usize {
+        /// Returns the position `i` at which `pred(xs[i])` first returns true
+        /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never
+        /// returns true, then `xs.len()` is returned.
+        ///
+        /// We roll our own binary search because it doesn't seem like the
+        /// standard library's binary search can be used here. Namely, if
+        /// there is an overlapping range, then we want to find the first such
+        /// occurrence, but there may be many. Or at least, it's not quite
+        /// clear to me how to do it.
+        fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
+        where
+            F: FnMut(&T) -> bool,
+        {
+            let (mut left, mut right) = (0, xs.len());
+            while left < right {
+                // Overflow is impossible because xs.len() <= 256.
+                let mid = (left + right) / 2;
+                if pred(&xs[mid]) {
+                    right = mid;
+                } else {
+                    left = mid + 1;
+                }
+            }
+            left
+        }
+
+        // Benchmarks suggest that binary search is just a bit faster than
+        // straight linear search. Specifically when using the debug tool:
+        //
+        //   hyperfine "regex-cli debug nfa thompson --quiet --reverse '\w{90} ecurB'"
+        binary_search(&self.transitions, |t| range.start <= t.range.end)
+    }
+
+    /// Clear this state such that it has zero transitions.
+    fn clear(&mut self) {
+        self.transitions.clear();
+    }
+}
+
+/// The next state to process during duplication.
+///
+/// `RangeTrie::duplicate` pops one of these at a time and copies the
+/// transitions of `old_id` into `new_id`.
+#[derive(Clone, Debug)]
+struct NextDupe {
+    /// The state we want to duplicate.
+    old_id: StateID,
+    /// The ID of the new state that is a duplicate of old_id.
+    new_id: StateID,
+}
+
+/// The next state (and its corresponding transition) that we want to visit
+/// during iteration in lexicographic order.
+#[derive(Clone, Debug)]
+struct NextIter {
+    /// The state in which to resume iteration.
+    state_id: StateID,
+    /// The index of the transition in `state_id` at which to resume.
+    tidx: usize,
+}
+
+/// The next state to process during insertion and any remaining ranges that we
+/// want to add for a particular sequence of ranges. The first such instance
+/// is always the root state along with all ranges given.
+#[derive(Clone, Debug)]
+struct NextInsert {
+    /// The next state to begin inserting ranges. This state should be the
+    /// state at which `ranges[0]` should be inserted.
+    state_id: StateID,
+    /// The ranges to insert. We used a fixed-size array here to avoid an
+    /// allocation. Only the first `len` entries are meaningful; the rest
+    /// hold a placeholder value.
+    ranges: [Utf8Range; 4],
+    /// The number of valid ranges in the above array.
+    len: u8,
+}
+
+impl NextInsert {
+    /// Create the next item to visit. The given state ID should correspond
+    /// to the state at which the first range in the given slice should be
+    /// inserted. The slice given must not be empty and it must be no longer
+    /// than 4.
+    fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert {
+        let len = ranges.len();
+        assert!(len > 0);
+        assert!(len <= 4);
+
+        let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
+        tmp[..len].copy_from_slice(ranges);
+        NextInsert { state_id, ranges: tmp, len: len as u8 }
+    }
+
+    /// Push a new empty state to visit along with any remaining ranges that
+    /// still need to be inserted. The ID of the new empty state is returned.
+    ///
+    /// If ranges is empty, then no new state is created and FINAL is returned.
+    fn push(
+        trie: &mut RangeTrie,
+        stack: &mut Vec<NextInsert>,
+        ranges: &[Utf8Range],
+    ) -> StateID {
+        if ranges.is_empty() {
+            FINAL
+        } else {
+            let next_id = trie.add_empty();
+            stack.push(NextInsert::new(next_id, ranges));
+            next_id
+        }
+    }
+
+    /// Return the ID of the state to visit.
+    fn state_id(&self) -> StateID {
+        self.state_id
+    }
+
+    /// Return the remaining ranges to insert.
+    fn ranges(&self) -> &[Utf8Range] {
+        &self.ranges[..self.len as usize]
+    }
+}
+
+/// Split represents a partitioning of two ranges into one or more ranges. This
+/// is the secret sauce that makes a range trie work, as it's what tells us
+/// how to deal with two overlapping but unequal ranges during insertion.
+///
+/// Essentially, either two ranges overlap or they don't. If they don't, then
+/// handling insertion is easy: just insert the new range into its
+/// lexicographically correct position. Since it does not overlap with anything
+/// else, no other transitions are impacted by the new range.
+///
+/// If they do overlap though, there are generally three possible cases to
+/// handle:
+///
+/// 1. The part where the two ranges actually overlap. i.e., The intersection.
+/// 2. The part of the existing range that is not in the new range.
+/// 3. The part of the new range that is not in the old range.
+///
+/// (1) is guaranteed to always occur since all overlapping ranges have a
+/// non-empty intersection. If the two ranges are not equivalent, then at
+/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
+/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
+///
+/// This `Split` type is responsible for providing (1), (2) and (3) for any
+/// possible pair of byte ranges.
+///
+/// As for insertion, for the overlap in (1), the remaining ranges to insert
+/// should be added by following the corresponding transition. However, this
+/// should only be done for the overlapping parts of the range. If there was
+/// a part of the existing range that was not in the new range, then that
+/// existing part must be split off from the transition and duplicated. The
+/// remaining parts of the overlap can then be added to using the new ranges
+/// without disturbing the existing range.
+///
+/// Handling the case for the part of a new range that is not in an existing
+/// range is seemingly easy. Just treat it as if it were a non-overlapping
+/// range. The problem here is that if this new non-overlapping range occurs
+/// after both (1) and (2), then it's possible that it can overlap with the
+/// next transition in the current state. If it does, then the whole process
+/// must be repeated!
+///
+/// # Details of the 3 cases
+///
+/// The following details the various cases that are implemented in code
+/// below. It's plausible that the number of cases is not actually minimal,
+/// but it's important for this code to remain at least somewhat readable.
+///
+/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define
+/// the following distinct relationships where at least one must apply. Order
+/// of these matters, since multiple can match. The first to match applies.
+///
+///   1. b < x <=> [a,b] < [x,y]
+///   2. y < a <=> [x,y] < [a,b]
+///
+/// In the case of (1) and (2), these are the only cases where there is no
+/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In
+/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The
+/// intersection in all of the following cases is non-empty.
+///
+///    3. a = x && b = y <=> [a,b] == [x,y]
+///    4. a = x && b < y <=> [x,y] right-extends [a,b]
+///    5. b = y && a > x <=> [x,y] left-extends [a,b]
+///    6. x = a && y < b <=> [a,b] right-extends [x,y]
+///    7. y = b && x > a <=> [a,b] left-extends [x,y]
+///    8. a > x && b < y <=> [x,y] covers [a,b]
+///    9. x > a && y < b <=> [a,b] covers [x,y]
+///   10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
+///   11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
+///   12. b > x && b < y <=> [a,b] left-overlaps [x,y]
+///   13. y > a && y < b <=> [x,y] left-overlaps [a,b]
+///
+/// In cases 3-13, we can form rules that partition the ranges into a
+/// non-overlapping ordered sequence of ranges:
+///
+///    3. [a,b]
+///    4. [a,b], [b+1,y]
+///    5. [x,a-1], [a,b]
+///    6. [x,y], [y+1,b]
+///    7. [a,x-1], [x,y]
+///    8. [x,a-1], [a,b], [b+1,y]
+///    9. [a,x-1], [x,y], [y+1,b]
+///   10. [a,b-1], [b,b], [b+1,y]
+///   11. [x,y-1], [y,y], [y+1,b]
+///   12. [a,x-1], [x,b], [b+1,y]
+///   13. [x,a-1], [a,y], [y+1,b]
+///
+/// In the code below, we go a step further and identify each of the above
+/// outputs as belonging either to the overlap of the two ranges or to one
+/// of [a,b] or [x,y] exclusively.
+#[derive(Clone, Debug, Eq, PartialEq)]
+struct Split {
+    /// Storage for up to three partitions. Only the first `len` entries are
+    /// meaningful; `as_slice` exposes exactly that prefix.
+    partitions: [SplitRange; 3],
+    /// The number of valid entries at the front of `partitions`.
+    len: usize,
+}
+
+/// A tagged range indicating how it was derived from a pair of ranges.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum SplitRange {
+    /// A piece that is covered only by the old range (the first argument
+    /// given to `Split::new`).
+    Old(Utf8Range),
+    /// A piece that is covered only by the new range (the second argument
+    /// given to `Split::new`).
+    New(Utf8Range),
+    /// A piece that is covered by both ranges, i.e., part of their
+    /// intersection.
+    Both(Utf8Range),
+}
+
+impl Split {
+    /// Create a partitioning of the given ranges.
+    ///
+    /// If the given ranges have an empty intersection, then None is returned.
+    fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> {
+        let range = |r: RangeInclusive<u8>| Utf8Range {
+            start: *r.start(),
+            end: *r.end(),
+        };
+        let old = |r| SplitRange::Old(range(r));
+        let new = |r| SplitRange::New(range(r));
+        let both = |r| SplitRange::Both(range(r));
+
+        // Use same names as the comment above to make it easier to compare.
+        let (a, b, x, y) = (o.start, o.end, n.start, n.end);
+
+        if b < x || y < a {
+            // case 1, case 2
+            None
+        } else if a == x && b == y {
+            // case 3
+            Some(Split::parts1(both(a..=b)))
+        } else if a == x && b < y {
+            // case 4
+            Some(Split::parts2(both(a..=b), new(b + 1..=y)))
+        } else if b == y && a > x {
+            // case 5
+            Some(Split::parts2(new(x..=a - 1), both(a..=b)))
+        } else if x == a && y < b {
+            // case 6
+            Some(Split::parts2(both(x..=y), old(y + 1..=b)))
+        } else if y == b && x > a {
+            // case 7
+            Some(Split::parts2(old(a..=x - 1), both(x..=y)))
+        } else if a > x && b < y {
+            // case 8
+            Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y)))
+        } else if x > a && y < b {
+            // case 9
+            Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b)))
+        } else if b == x && a < y {
+            // case 10
+            Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y)))
+        } else if y == a && x < b {
+            // case 11
+            Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b)))
+        } else if b > x && b < y {
+            // case 12
+            Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y)))
+        } else if y > a && y < b {
+            // case 13
+            Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b)))
+        } else {
+            unreachable!()
+        }
+    }
+
+    /// Create a new split with a single partition. This only occurs when two
+    /// ranges are equivalent.
+    fn parts1(r1: SplitRange) -> Split {
+        // This value doesn't matter since it is never accessed.
+        let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+        Split { partitions: [r1, nada, nada], len: 1 }
+    }
+
+    /// Create a new split with two partitions.
+    fn parts2(r1: SplitRange, r2: SplitRange) -> Split {
+        // This value doesn't matter since it is never accessed.
+        let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
+        Split { partitions: [r1, r2, nada], len: 2 }
+    }
+
+    /// Create a new split with three partitions.
+    fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split {
+        Split { partitions: [r1, r2, r3], len: 3 }
+    }
+
+    /// Return the partitions in this split as a slice.
+    fn as_slice(&self) -> &[SplitRange] {
+        &self.partitions[..self.len]
+    }
+}
+
+impl fmt::Debug for RangeTrie {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "")?;
+        for (i, state) in self.states.iter().enumerate() {
+            let status = if i == FINAL as usize { '*' } else { ' ' };
+            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
+        }
+        Ok(())
+    }
+}
+
+impl fmt::Debug for State {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let rs = self
+            .transitions
+            .iter()
+            .map(|t| format!("{:?}", t))
+            .collect::<Vec<String>>()
+            .join(", ");
+        write!(f, "{}", rs)
+    }
+}
+
+impl fmt::Debug for Transition {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.range.start == self.range.end {
+            write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
+        } else {
+            write!(
+                f,
+                "{:02X}-{:02X} => {:02X}",
+                self.range.start, self.range.end, self.next_id
+            )
+        }
+    }
+}
+
+/// Returns true if and only if the given ranges intersect.
+fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
+    !(r1.end < r2.start || r2.end < r1.start)
+}
+
+// Unit tests for `Split::new`, covering every numbered case described in the
+// documentation of `Split`.
+#[cfg(test)]
+mod tests {
+    use core::ops::RangeInclusive;
+
+    use regex_syntax::utf8::Utf8Range;
+
+    use super::*;
+
+    /// Convert an inclusive `u8` range into a `Utf8Range`.
+    fn r(range: RangeInclusive<u8>) -> Utf8Range {
+        Utf8Range { start: *range.start(), end: *range.end() }
+    }
+
+    /// Split the given old and new ranges, returning `None` when they do not
+    /// overlap.
+    fn split_maybe(
+        old: RangeInclusive<u8>,
+        new: RangeInclusive<u8>,
+    ) -> Option<Split> {
+        Split::new(r(old), r(new))
+    }
+
+    /// Split the given old and new ranges and return the resulting
+    /// partitions. Panics if the ranges do not overlap.
+    fn split(
+        old: RangeInclusive<u8>,
+        new: RangeInclusive<u8>,
+    ) -> Vec<SplitRange> {
+        split_maybe(old, new).unwrap().as_slice().to_vec()
+    }
+
+    /// Disjoint ranges (cases 1 and 2) must not produce a split.
+    #[test]
+    fn no_splits() {
+        // case 1
+        assert_eq!(None, split_maybe(0..=1, 2..=3));
+        // case 2
+        assert_eq!(None, split_maybe(2..=3, 0..=1));
+    }
+
+    /// Overlapping ranges (cases 3 through 13) must produce the documented
+    /// ordered partitions with the correct old/new/both tags.
+    #[test]
+    fn splits() {
+        let range = |r: RangeInclusive<u8>| Utf8Range {
+            start: *r.start(),
+            end: *r.end(),
+        };
+        let old = |r| SplitRange::Old(range(r));
+        let new = |r| SplitRange::New(range(r));
+        let both = |r| SplitRange::Both(range(r));
+
+        // case 3
+        assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]);
+        assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]);
+
+        // case 4
+        assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]);
+        assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]);
+        assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]);
+
+        // case 5
+        assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]);
+        assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]);
+        assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]);
+
+        // case 6
+        assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]);
+        assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]);
+        assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]);
+
+        // case 7
+        assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]);
+        assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]);
+        assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]);
+
+        // case 8
+        assert_eq!(
+            split(3..=6, 2..=7),
+            vec![new(2..=2), both(3..=6), new(7..=7)],
+        );
+        assert_eq!(
+            split(3..=6, 1..=8),
+            vec![new(1..=2), both(3..=6), new(7..=8)],
+        );
+
+        // case 9
+        assert_eq!(
+            split(2..=7, 3..=6),
+            vec![old(2..=2), both(3..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(1..=8, 3..=6),
+            vec![old(1..=2), both(3..=6), old(7..=8)],
+        );
+
+        // case 10
+        assert_eq!(
+            split(3..=6, 6..=7),
+            vec![old(3..=5), both(6..=6), new(7..=7)],
+        );
+        assert_eq!(
+            split(3..=6, 6..=8),
+            vec![old(3..=5), both(6..=6), new(7..=8)],
+        );
+        assert_eq!(
+            split(5..=6, 6..=7),
+            vec![old(5..=5), both(6..=6), new(7..=7)],
+        );
+
+        // case 11
+        assert_eq!(
+            split(6..=7, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=7)],
+        );
+        assert_eq!(
+            split(6..=8, 3..=6),
+            vec![new(3..=5), both(6..=6), old(7..=8)],
+        );
+        assert_eq!(
+            split(6..=7, 5..=6),
+            vec![new(5..=5), both(6..=6), old(7..=7)],
+        );
+
+        // case 12
+        assert_eq!(
+            split(3..=7, 5..=9),
+            vec![old(3..=4), both(5..=7), new(8..=9)],
+        );
+        assert_eq!(
+            split(3..=5, 4..=6),
+            vec![old(3..=3), both(4..=5), new(6..=6)],
+        );
+
+        // case 13
+        assert_eq!(
+            split(5..=9, 3..=7),
+            vec![new(3..=4), both(5..=7), old(8..=9)],
+        );
+        assert_eq!(
+            split(4..=6, 3..=5),
+            vec![new(3..=3), both(4..=5), old(6..=6)],
+        );
+    }
+
+    // Arguably there should be more tests here, but in practice, this data
+    // structure is well covered by the huge number of regex tests.
+}
diff --git a/src/util/alphabet.rs b/src/util/alphabet.rs

new file mode 100644 (file)

index 0000000..0bc1ece
--- /dev/null
+++ b/src/util/alphabet.rs
@@ -0,0 +1,790 @@
+use core::convert::TryFrom;
+
+use crate::util::{
+    bytes::{DeserializeError, SerializeError},
+    DebugByte,
+};
+
+/// Unit represents a single unit of input for DFA based regex engines.
+///
+/// **NOTE:** It is not expected for consumers of this crate to need to use
+/// this type unless they are implementing their own DFA. And even then, it's
+/// not required: implementors may use other techniques to handle input.
+///
+/// Typically, a single unit of input for a DFA would be a single byte.
+/// However, for the DFAs in this crate, matches are delayed by a single byte
+/// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once
+/// we have consumed the haystack, we must run the DFA through one additional
+/// transition using an input that indicates the haystack has ended.
+///
+/// Since there is no way to represent a sentinel with a `u8` since all
+/// possible values *may* be valid inputs to a DFA, this type explicitly adds
+/// room for a sentinel value.
+///
+/// The sentinel EOI value is always its own equivalence class and is
+/// ultimately represented by adding 1 to the maximum equivalence class value.
+/// So for example, the regex `^[a-z]+$` might be split into the following
+/// equivalence classes:
+///
+/// ```text
+/// 0 => [\x00-`]
+/// 1 => [a-z]
+/// 2 => [{-\xFF]
+/// 3 => [EOI]
+/// ```
+///
+/// Where EOI is the special sentinel value that is always in its own
+/// singleton equivalence class.
+#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
+pub enum Unit {
+    /// A byte-valued unit. As described on `Unit::u8`, this is either a raw
+    /// byte or a byte equivalence class, depending on the DFA.
+    U8(u8),
+    /// The end-of-input sentinel. The payload is the value that was given to
+    /// `Unit::eoi`.
+    EOI(u16),
+}
+
+impl Unit {
+    /// Create a new input unit from a byte value.
+    ///
+    /// All possible byte values are legal. However, when creating an input
+    /// unit for a specific DFA, one should be careful to only construct input
+    /// units that are in that DFA's alphabet. Namely, one way to compact a
+    /// DFA's in-memory representation is to collapse its transitions to a set
+    /// of equivalence classes into a set of all possible byte values. If a
+    /// DFA uses equivalence classes instead of byte values, then the byte
+    /// given here should be the equivalence class.
+    pub fn u8(byte: u8) -> Unit {
+        Unit::U8(byte)
+    }
+
+    pub fn eoi(num_byte_equiv_classes: usize) -> Unit {
+        assert!(
+            num_byte_equiv_classes <= 256,
+            "max number of byte-based equivalent classes is 256, but got {}",
+            num_byte_equiv_classes,
+        );
+        Unit::EOI(u16::try_from(num_byte_equiv_classes).unwrap())
+    }
+
+    pub fn as_u8(self) -> Option<u8> {
+        match self {
+            Unit::U8(b) => Some(b),
+            Unit::EOI(_) => None,
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    pub fn as_eoi(self) -> Option<usize> {
+        match self {
+            Unit::U8(_) => None,
+            Unit::EOI(eoi) => Some(eoi as usize),
+        }
+    }
+
+    pub fn as_usize(self) -> usize {
+        match self {
+            Unit::U8(b) => b as usize,
+            Unit::EOI(eoi) => eoi as usize,
+        }
+    }
+
+    pub fn is_eoi(&self) -> bool {
+        match *self {
+            Unit::EOI(_) => true,
+            _ => false,
+        }
+    }
+
+    #[cfg(feature = "alloc")]
+    pub fn is_word_byte(&self) -> bool {
+        self.as_u8().map_or(false, crate::util::is_word_byte)
+    }
+}
+
+impl core::fmt::Debug for Unit {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        match *self {
+            Unit::U8(b) => write!(f, "{:?}", DebugByte(b)),
+            Unit::EOI(_) => write!(f, "EOI"),
+        }
+    }
+}
+
+/// A representation of byte oriented equivalence classes.
+///
+/// This is used in a DFA to reduce the size of the transition table. This can
+/// have a particularly large impact not only on the total size of a dense DFA,
+/// but also on compile times.
+///
+/// The wrapped array is indexed by byte value and stores the equivalence
+/// class assigned to that byte.
+#[derive(Clone, Copy)]
+pub struct ByteClasses([u8; 256]);
+
+impl ByteClasses {
+    /// Creates a new set of equivalence classes where all bytes are mapped to
+    /// the same class.
+    pub fn empty() -> ByteClasses {
+        ByteClasses([0; 256])
+    }
+
+    /// Creates a new set of equivalence classes where each byte belongs to
+    /// its own equivalence class.
+    #[cfg(feature = "alloc")]
+    pub fn singletons() -> ByteClasses {
+        let mut classes = ByteClasses::empty();
+        for i in 0..256 {
+            classes.set(i as u8, i as u8);
+        }
+        classes
+    }
+
+    /// Deserializes a byte class map from the given slice. If the slice is of
+    /// insufficient length or otherwise contains an impossible mapping, then
+    /// an error is returned. Upon success, the number of bytes read along with
+    /// the map are returned. The number of bytes read is always a multiple of
+    /// 8.
+    pub fn from_bytes(
+        slice: &[u8],
+    ) -> Result<(ByteClasses, usize), DeserializeError> {
+        if slice.len() < 256 {
+            return Err(DeserializeError::buffer_too_small("byte class map"));
+        }
+        let mut classes = ByteClasses::empty();
+        for (b, &class) in slice[..256].iter().enumerate() {
+            classes.set(b as u8, class);
+        }
+        for b in classes.iter() {
+            if b.as_usize() >= classes.alphabet_len() {
+                return Err(DeserializeError::generic(
+                    "found equivalence class greater than alphabet len",
+                ));
+            }
+        }
+        Ok((classes, 256))
+    }
+
+    /// Writes this byte class map to the given byte buffer. if the given
+    /// buffer is too small, then an error is returned. Upon success, the total
+    /// number of bytes written is returned. The number of bytes written is
+    /// guaranteed to be a multiple of 8.
+    pub fn write_to(
+        &self,
+        mut dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let nwrite = self.write_to_len();
+        if dst.len() < nwrite {
+            return Err(SerializeError::buffer_too_small("byte class map"));
+        }
+        for b in 0..=255 {
+            dst[0] = self.get(b);
+            dst = &mut dst[1..];
+        }
+        Ok(nwrite)
+    }
+
+    /// Returns the total number of bytes written by `write_to`.
+    pub fn write_to_len(&self) -> usize {
+        256
+    }
+
+    /// Set the equivalence class for the given byte.
+    #[inline]
+    pub fn set(&mut self, byte: u8, class: u8) {
+        self.0[byte as usize] = class;
+    }
+
+    /// Get the equivalence class for the given byte.
+    #[inline]
+    pub fn get(&self, byte: u8) -> u8 {
+        self.0[byte as usize]
+    }
+
+    /// Get the equivalence class for the given byte while forcefully
+    /// eliding bounds checks.
+    #[inline]
+    pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
+        *self.0.get_unchecked(byte as usize)
+    }
+
+    /// Get the equivalence class for the given input unit and return the
+    /// class as a `usize`.
+    #[inline]
+    pub fn get_by_unit(&self, unit: Unit) -> usize {
+        match unit {
+            Unit::U8(b) => usize::try_from(self.get(b)).unwrap(),
+            Unit::EOI(b) => usize::try_from(b).unwrap(),
+        }
+    }
+
+    #[inline]
+    pub fn eoi(&self) -> Unit {
+        Unit::eoi(self.alphabet_len().checked_sub(1).unwrap())
+    }
+
+    /// Return the total number of elements in the alphabet represented by
+    /// these equivalence classes. Equivalently, this returns the total number
+    /// of equivalence classes.
+    #[inline]
+    pub fn alphabet_len(&self) -> usize {
+        // Add one since the number of equivalence classes is one bigger than
+        // the last one. But add another to account for the final EOI class
+        // that isn't explicitly represented.
+        self.0[255] as usize + 1 + 1
+    }
+
+    /// Returns the stride, as a base-2 exponent, required for these
+    /// equivalence classes.
+    ///
+    /// The stride is always the smallest power of 2 that is greater than or
+    /// equal to the alphabet length. This is done so that converting between
+    /// state IDs and indices can be done with shifts alone, which is much
+    /// faster than integer division.
+    #[cfg(feature = "alloc")]
+    pub fn stride2(&self) -> usize {
+        self.alphabet_len().next_power_of_two().trailing_zeros() as usize
+    }
+
+    /// Returns true if and only if every byte in this class maps to its own
+    /// equivalence class. Equivalently, there are 257 equivalence classes
+    /// and each class contains exactly one byte (plus the special EOI class).
+    #[inline]
+    pub fn is_singleton(&self) -> bool {
+        self.alphabet_len() == 257
+    }
+
+    /// Returns an iterator over all equivalence classes in this set.
+    pub fn iter(&self) -> ByteClassIter<'_> {
+        ByteClassIter { classes: self, i: 0 }
+    }
+
+    /// Returns an iterator over a sequence of representative bytes from each
+    /// equivalence class. Namely, this yields exactly N items, where N is
+    /// equivalent to the number of equivalence classes. Each item is an
+    /// arbitrary byte drawn from each equivalence class.
+    ///
+    /// This is useful when one is determinizing an NFA and the NFA's alphabet
+    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
+    /// byte from each equivalence class then permits a full exploration of
+    /// the NFA instead of using every possible byte value.
+    #[cfg(feature = "alloc")]
+    pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
+        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
+    }
+
+    /// Returns an iterator of the bytes in the given equivalence class.
+    pub fn elements(&self, class: Unit) -> ByteClassElements {
+        ByteClassElements { classes: self, class, byte: 0 }
+    }
+
+    /// Returns an iterator of byte ranges in the given equivalence class.
+    ///
+    /// That is, a sequence of contiguous ranges are returned. Typically, every
+    /// class maps to a single contiguous range.
+    fn element_ranges(&self, class: Unit) -> ByteClassElementRanges {
+        ByteClassElementRanges { elements: self.elements(class), range: None }
+    }
+}
+
+impl core::fmt::Debug for ByteClasses {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if self.is_singleton() {
+            write!(f, "ByteClasses({{singletons}})")
+        } else {
+            write!(f, "ByteClasses(")?;
+            for (i, class) in self.iter().enumerate() {
+                if i > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{:?} => [", class.as_usize())?;
+                for (start, end) in self.element_ranges(class) {
+                    if start == end {
+                        write!(f, "{:?}", start)?;
+                    } else {
+                        write!(f, "{:?}-{:?}", start, end)?;
+                    }
+                }
+                write!(f, "]")?;
+            }
+            write!(f, ")")
+        }
+    }
+}
+
+/// An iterator over each equivalence class.
+#[derive(Debug)]
+pub struct ByteClassIter<'a> {
+    /// The equivalence classes being iterated over.
+    classes: &'a ByteClasses,
+    /// The index of the next class to yield.
+    i: usize,
+}
+
+impl<'a> Iterator for ByteClassIter<'a> {
+    type Item = Unit;
+
+    fn next(&mut self) -> Option<Unit> {
+        if self.i + 1 == self.classes.alphabet_len() {
+            self.i += 1;
+            Some(self.classes.eoi())
+        } else if self.i < self.classes.alphabet_len() {
+            let class = self.i as u8;
+            self.i += 1;
+            Some(Unit::u8(class))
+        } else {
+            None
+        }
+    }
+}
+
+/// An iterator over representative bytes from each equivalence class.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub struct ByteClassRepresentatives<'a> {
+    /// The equivalence classes to draw representatives from.
+    classes: &'a ByteClasses,
+    /// The next byte value to examine. This is a `usize` so that it can
+    /// advance past `255`, which is how the iterator tracks whether the
+    /// end-of-input sentinel has been yielded.
+    byte: usize,
+    /// The class of the most recently yielded representative, if any.
+    last_class: Option<u8>,
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for ByteClassRepresentatives<'a> {
+    type Item = Unit;
+
+    fn next(&mut self) -> Option<Unit> {
+        while self.byte < 256 {
+            let byte = self.byte as u8;
+            let class = self.classes.get(byte);
+            self.byte += 1;
+
+            if self.last_class != Some(class) {
+                self.last_class = Some(class);
+                return Some(Unit::u8(byte));
+            }
+        }
+        if self.byte == 256 {
+            self.byte += 1;
+            return Some(self.classes.eoi());
+        }
+        None
+    }
+}
+
+/// An iterator over all elements in an equivalence class.
+#[derive(Debug)]
+pub struct ByteClassElements<'a> {
+    /// The equivalence classes used to look up the class of each byte.
+    classes: &'a ByteClasses,
+    /// The equivalence class whose elements are being yielded.
+    class: Unit,
+    /// The next byte value to examine. This is a `usize` so that it can
+    /// advance past `255` once all bytes have been examined.
+    byte: usize,
+}
+
+impl<'a> Iterator for ByteClassElements<'a> {
+    type Item = Unit;
+
+    fn next(&mut self) -> Option<Unit> {
+        while self.byte < 256 {
+            let byte = self.byte as u8;
+            self.byte += 1;
+            if self.class.as_u8() == Some(self.classes.get(byte)) {
+                return Some(Unit::u8(byte));
+            }
+        }
+        if self.byte < 257 {
+            self.byte += 1;
+            if self.class.is_eoi() {
+                return Some(Unit::eoi(256));
+            }
+        }
+        None
+    }
+}
+
+/// An iterator over all elements in an equivalence class expressed as a
+/// sequence of contiguous ranges.
+#[derive(Debug)]
+pub struct ByteClassElementRanges<'a> {
+    /// The underlying iterator over the individual elements of the class.
+    elements: ByteClassElements<'a>,
+    /// The inclusive range currently being accumulated, if any. It has not
+    /// been yielded yet.
+    range: Option<(Unit, Unit)>,
+}
+
+impl<'a> Iterator for ByteClassElementRanges<'a> {
+    type Item = (Unit, Unit);
+
+    fn next(&mut self) -> Option<(Unit, Unit)> {
+        loop {
+            let element = match self.elements.next() {
+                None => return self.range.take(),
+                Some(element) => element,
+            };
+            match self.range.take() {
+                None => {
+                    self.range = Some((element, element));
+                }
+                Some((start, end)) => {
+                    if end.as_usize() + 1 != element.as_usize()
+                        || element.is_eoi()
+                    {
+                        self.range = Some((element, element));
+                        return Some((start, end));
+                    }
+                    self.range = Some((start, element));
+                }
+            }
+        }
+    }
+}
+
+/// A byte class set keeps track of an *approximation* of equivalence classes
+/// of bytes during NFA construction. That is, every byte in an equivalence
+/// class cannot discriminate between a match and a non-match.
+///
+/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
+/// same equivalence class because it never matters whether an `a` or a `b` is
+/// seen, and no combination of `a`s and `b`s in the text can discriminate a
+/// match.
+///
+/// Note though that this does not compute the minimal set of equivalence
+/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
+/// same equivalence class for the same reason that `a` and `b` are in the
+/// same equivalence class in the aforementioned regex. However, in this
+/// implementation, `a` and `c` are put into distinct equivalence classes. The
+/// reason for this is implementation complexity. In the future, we should
+/// endeavor to compute the minimal equivalence classes since they can have a
+/// rather large impact on the size of the DFA. (Doing this will likely require
+/// rethinking how equivalence classes are computed, including changing the
+/// representation here, which is only able to group contiguous bytes into the
+/// same equivalence class.)
+///
+/// The wrapped `ByteSet` records class boundaries: a byte in the set is the
+/// last byte of its equivalence class (see `set_range` and `byte_classes`).
+#[derive(Clone, Debug)]
+pub struct ByteClassSet(ByteSet);
+
+impl ByteClassSet {
+    /// Create a new set of byte classes where all bytes are part of the same
+    /// equivalence class.
+    #[cfg(feature = "alloc")]
+    pub fn empty() -> Self {
+        ByteClassSet(ByteSet::empty())
+    }
+
+    /// Indicate the the range of byte given (inclusive) can discriminate a
+    /// match between it and all other bytes outside of the range.
+    #[cfg(feature = "alloc")]
+    pub fn set_range(&mut self, start: u8, end: u8) {
+        debug_assert!(start <= end);
+        if start > 0 {
+            self.0.add(start - 1);
+        }
+        self.0.add(end);
+    }
+
+    /// Add the contiguous ranges in the set given to this byte class set.
+    #[cfg(feature = "alloc")]
+    pub fn add_set(&mut self, set: &ByteSet) {
+        for (start, end) in set.iter_ranges() {
+            self.set_range(start, end);
+        }
+    }
+
+    /// Convert this boolean set to a map that maps all byte values to their
+    /// corresponding equivalence class. The last mapping indicates the largest
+    /// equivalence class identifier (which is never bigger than 255).
+    #[cfg(feature = "alloc")]
+    pub fn byte_classes(&self) -> ByteClasses {
+        let mut classes = ByteClasses::empty();
+        let mut class = 0u8;
+        let mut b = 0u8;
+        loop {
+            classes.set(b, class);
+            if b == 255 {
+                break;
+            }
+            if self.0.contains(b) {
+                class = class.checked_add(1).unwrap();
+            }
+            b = b.checked_add(1).unwrap();
+        }
+        classes
+    }
+}
+
+/// A simple set of bytes that is reasonably cheap to copy and allocation free.
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub struct ByteSet {
+    /// The membership bits for this set, one bit per possible byte value.
+    bits: BitSet,
+}
+
+/// The representation of a byte set. Split out so that we can define a
+/// convenient Debug impl for it while keeping "ByteSet" in the output.
+///
+/// The two `u128` words provide 256 bits. Byte `b` is tracked by bit
+/// `b % 128` of word `b / 128`.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+struct BitSet([u128; 2]);
+
+impl ByteSet {
+    /// Create an empty set of bytes.
+    #[cfg(feature = "alloc")]
+    pub fn empty() -> ByteSet {
+        ByteSet { bits: BitSet([0; 2]) }
+    }
+
+    /// Add a byte to this set.
+    ///
+    /// If the given byte already belongs to this set, then this is a no-op.
+    #[cfg(feature = "alloc")]
+    pub fn add(&mut self, byte: u8) {
+        let bucket = byte / 128;
+        let bit = byte % 128;
+        self.bits.0[bucket as usize] |= 1 << bit;
+    }
+
+    /// Add an inclusive range of bytes.
+    #[cfg(feature = "alloc")]
+    pub fn add_all(&mut self, start: u8, end: u8) {
+        for b in start..=end {
+            self.add(b);
+        }
+    }
+
+    /// Remove a byte from this set.
+    ///
+    /// If the given byte is not in this set, then this is a no-op.
+    #[cfg(feature = "alloc")]
+    pub fn remove(&mut self, byte: u8) {
+        let bucket = byte / 128;
+        let bit = byte % 128;
+        self.bits.0[bucket as usize] &= !(1 << bit);
+    }
+
+    /// Remove an inclusive range of bytes.
+    #[cfg(feature = "alloc")]
+    pub fn remove_all(&mut self, start: u8, end: u8) {
+        for b in start..=end {
+            self.remove(b);
+        }
+    }
+
+    /// Return true if and only if the given byte is in this set.
+    pub fn contains(&self, byte: u8) -> bool {
+        let bucket = byte / 128;
+        let bit = byte % 128;
+        self.bits.0[bucket as usize] & (1 << bit) > 0
+    }
+
+    /// Return true if and only if the given inclusive range of bytes is in
+    /// this set.
+    #[cfg(feature = "alloc")]
+    pub fn contains_range(&self, start: u8, end: u8) -> bool {
+        (start..=end).all(|b| self.contains(b))
+    }
+
+    /// Returns an iterator over all bytes in this set.
+    #[cfg(feature = "alloc")]
+    pub fn iter(&self) -> ByteSetIter {
+        ByteSetIter { set: self, b: 0 }
+    }
+
+    /// Returns an iterator over all contiguous ranges of bytes in this set.
+    #[cfg(feature = "alloc")]
+    pub fn iter_ranges(&self) -> ByteSetRangeIter {
+        ByteSetRangeIter { set: self, b: 0 }
+    }
+
+    /// Return the number of bytes in this set.
+    #[cfg(feature = "alloc")]
+    pub fn len(&self) -> usize {
+        (self.bits.0[0].count_ones() + self.bits.0[1].count_ones()) as usize
+    }
+
+    /// Return true if and only if this set is empty.
+    #[cfg(feature = "alloc")]
+    pub fn is_empty(&self) -> bool {
+        self.bits.0 == [0, 0]
+    }
+}
+
+impl core::fmt::Debug for BitSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let mut fmtd = f.debug_set();
+        for b in (0..256).map(|b| b as u8) {
+            if (ByteSet { bits: *self }).contains(b) {
+                fmtd.entry(&b);
+            }
+        }
+        fmtd.finish()
+    }
+}
+
+#[derive(Debug)]
+pub struct ByteSetIter<'a> {
+    set: &'a ByteSet,
+    b: usize,
+}
+
+impl<'a> Iterator for ByteSetIter<'a> {
+    type Item = u8;
+
+    fn next(&mut self) -> Option<u8> {
+        while self.b <= 255 {
+            let b = self.b as u8;
+            self.b += 1;
+            if self.set.contains(b) {
+                return Some(b);
+            }
+        }
+        None
+    }
+}
+
+#[derive(Debug)]
+pub struct ByteSetRangeIter<'a> {
+    set: &'a ByteSet,
+    b: usize,
+}
+
+impl<'a> Iterator for ByteSetRangeIter<'a> {
+    type Item = (u8, u8);
+
+    fn next(&mut self) -> Option<(u8, u8)> {
+        while self.b <= 255 {
+            let start = self.b as u8;
+            self.b += 1;
+            if !self.set.contains(start) {
+                continue;
+            }
+
+            let mut end = start;
+            while self.b <= 255 && self.set.contains(self.b as u8) {
+                end = self.b as u8;
+                self.b += 1;
+            }
+            return Some((start, end));
+        }
+        None
+    }
+}
+
+// Unit tests for the byte class machinery. These only build when the `alloc`
+// feature is enabled, since they collect results into a `Vec`.
+#[cfg(test)]
+#[cfg(feature = "alloc")]
+mod tests {
+    use alloc::{vec, vec::Vec};
+
+    use super::*;
+
+    // Marking a range splits the bytes into classes: the bytes before the
+    // range, the bytes inside it and the bytes after it. Marking two ranges
+    // with a gap between them gives the gap a class of its own as well.
+    #[test]
+    fn byte_classes() {
+        let mut set = ByteClassSet::empty();
+        set.set_range(b'a', b'z');
+
+        let classes = set.byte_classes();
+        assert_eq!(classes.get(0), 0);
+        assert_eq!(classes.get(1), 0);
+        assert_eq!(classes.get(2), 0);
+        assert_eq!(classes.get(b'a' - 1), 0);
+        assert_eq!(classes.get(b'a'), 1);
+        assert_eq!(classes.get(b'm'), 1);
+        assert_eq!(classes.get(b'z'), 1);
+        assert_eq!(classes.get(b'z' + 1), 2);
+        assert_eq!(classes.get(254), 2);
+        assert_eq!(classes.get(255), 2);
+
+        let mut set = ByteClassSet::empty();
+        set.set_range(0, 2);
+        set.set_range(4, 6);
+        let classes = set.byte_classes();
+        assert_eq!(classes.get(0), 0);
+        assert_eq!(classes.get(1), 0);
+        assert_eq!(classes.get(2), 0);
+        assert_eq!(classes.get(3), 1);
+        assert_eq!(classes.get(4), 2);
+        assert_eq!(classes.get(5), 2);
+        assert_eq!(classes.get(6), 2);
+        assert_eq!(classes.get(7), 3);
+        assert_eq!(classes.get(255), 3);
+    }
+
+    // Marking every byte as its own range must yield an alphabet of 257
+    // elements: one class for each of the 256 bytes, plus one more.
+    #[test]
+    fn full_byte_classes() {
+        let mut set = ByteClassSet::empty();
+        for i in 0..256u16 {
+            set.set_range(i as u8, i as u8);
+        }
+        assert_eq!(set.byte_classes().alphabet_len(), 257);
+    }
+
+    // Checks that `elements` enumerates exactly the members of each class for
+    // a set with several disjoint ranges, including the final EOI class.
+    #[test]
+    fn elements_typical() {
+        let mut set = ByteClassSet::empty();
+        set.set_range(b'b', b'd');
+        set.set_range(b'g', b'm');
+        set.set_range(b'z', b'z');
+        let classes = set.byte_classes();
+        // class 0: \x00-a
+        // class 1: b-d
+        // class 2: e-f
+        // class 3: g-m
+        // class 4: n-y
+        // class 5: z-z
+        // class 6: \x7B-\xFF
+        // class 7: EOI
+        assert_eq!(classes.alphabet_len(), 8);
+
+        // \x00 through 'a' (0x61) is 98 bytes.
+        let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
+        assert_eq!(elements.len(), 98);
+        assert_eq!(elements[0], Unit::u8(b'\x00'));
+        assert_eq!(elements[97], Unit::u8(b'a'));
+
+        let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>();
+        assert_eq!(
+            elements,
+            vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')],
+        );
+
+        let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>();
+        assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],);
+
+        let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>();
+        assert_eq!(
+            elements,
+            vec![
+                Unit::u8(b'g'),
+                Unit::u8(b'h'),
+                Unit::u8(b'i'),
+                Unit::u8(b'j'),
+                Unit::u8(b'k'),
+                Unit::u8(b'l'),
+                Unit::u8(b'm'),
+            ],
+        );
+
+        // 'n' through 'y' is 12 bytes.
+        let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>();
+        assert_eq!(elements.len(), 12);
+        assert_eq!(elements[0], Unit::u8(b'n'));
+        assert_eq!(elements[11], Unit::u8(b'y'));
+
+        let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>();
+        assert_eq!(elements, vec![Unit::u8(b'z')]);
+
+        // \x7B through \xFF is 133 bytes.
+        let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>();
+        assert_eq!(elements.len(), 133);
+        assert_eq!(elements[0], Unit::u8(b'\x7B'));
+        assert_eq!(elements[132], Unit::u8(b'\xFF'));
+
+        let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>();
+        assert_eq!(elements, vec![Unit::eoi(256)]);
+    }
+
+    // With singleton classes, the elements of a byte's class are just that
+    // byte, and the EOI class contains only the EOI unit.
+    #[test]
+    fn elements_singletons() {
+        let classes = ByteClasses::singletons();
+        assert_eq!(classes.alphabet_len(), 257);
+
+        let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>();
+        assert_eq!(elements, vec![Unit::u8(b'a')]);
+
+        let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>();
+        assert_eq!(elements, vec![Unit::eoi(256)]);
+    }
+
+    // With empty classes, all 256 bytes share class 0 and the only other
+    // class is the one holding the EOI unit.
+    #[test]
+    fn elements_empty() {
+        let classes = ByteClasses::empty();
+        assert_eq!(classes.alphabet_len(), 2);
+
+        let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
+        assert_eq!(elements.len(), 256);
+        assert_eq!(elements[0], Unit::u8(b'\x00'));
+        assert_eq!(elements[255], Unit::u8(b'\xFF'));
+
+        let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
+        assert_eq!(elements, vec![Unit::eoi(256)]);
+    }
+}
diff --git a/src/util/bytes.rs b/src/util/bytes.rs

new file mode 100644 (file)

index 0000000..5877bb1
--- /dev/null
+++ b/src/util/bytes.rs
@@ -0,0 +1,950 @@
+/*
+A collection of helper functions, types and traits for serializing automata.
+
+This crate defines its own bespoke serialization mechanism for some structures
+provided in the public API, namely, DFAs. A bespoke mechanism was developed
+primarily because structures like automata demand a specific binary format.
+Attempting to encode their rich structure in an existing serialization
+format is just not feasible. Moreover, the format for each structure is
+generally designed such that deserialization is cheap. More specifically, that
+deserialization can be done in constant time. (The idea being that you can
+embed it into your binary or mmap it, and then use it immediately.)
+
+In order to achieve this, most of the structures in this crate use an in-memory
+representation that very closely corresponds to its binary serialized form.
+This pervades and complicates everything, and in some cases, requires dealing
+with alignment and reasoning about safety.
+
+This technique does have major advantages. In particular, it permits doing
+the potentially costly work of compiling a finite state machine in an offline
+manner, and then loading it at runtime not only without having to re-compile
+the regex, but even without the code required to do the compilation. This, for
+example, permits one to use a pre-compiled DFA not only in environments without
+Rust's standard library, but also in environments without a heap.
+
+In the code below, whenever we insert some kind of padding, it's to enforce a
+4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type
+supported. (In a previous version of this library, DFAs were generic over the
+state ID representation.)
+
+Also, serialization generally requires the caller to specify endianness,
+whereas deserialization always assumes native endianness (otherwise cheap
+deserialization would be impossible). This implies that serializing a structure
+generally requires serializing both its big-endian and little-endian variants,
+and then loading the correct one based on the target's endianness.
+*/
+
+use core::{
+    cmp,
+    convert::{TryFrom, TryInto},
+    mem::size_of,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::id::{PatternID, PatternIDError, StateID, StateIDError};
+
+/// An error that occurs when serializing an object from this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. To this end, serialization is generally infallible.
+/// However, it can fail when caller provided buffer sizes are too small. When
+/// that occurs, a serialization error is reported.
+///
+/// A `SerializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct SerializeError {
+    /// The name of the thing that a buffer is too small for.
+    ///
+    /// Currently, the only kind of serialization error is one that is
+    /// committed by a caller: providing a destination buffer that is too
+    /// small to fit the serialized object. This makes sense conceptually,
+    /// since every valid inhabitant of a type should be serializable.
+    ///
+    /// This is somewhat exposed in the public API of this crate. For example,
+    /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are
+    /// guaranteed to never panic or error. This is only possible because the
+    /// implementation guarantees that it will allocate a `Vec<u8>` that is
+    /// big enough.
+    ///
+    /// In summary, if a new serialization error kind needs to be added, then
+    /// it will need careful consideration.
+    what: &'static str,
+}
+
+impl SerializeError {
+    pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError {
+        SerializeError { what }
+    }
+}
+
+impl core::fmt::Display for SerializeError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "destination buffer is too small to write {}", self.what)
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for SerializeError {}
+
+/// An error that occurs when deserializing an object defined in this crate.
+///
+/// Serialization, as used in this crate, universally refers to the process
+/// of transforming a structure (like a DFA) into a custom binary format
+/// represented by `&[u8]`. Deserialization, then, refers to the process of
+/// cheaply converting this binary format back to the object's in-memory
+/// representation as defined in this crate. To the extent possible,
+/// deserialization will report this error whenever this process fails.
+///
+/// A `DeserializeError` provides no introspection capabilities. Its only
+/// supported operation is conversion to a human readable error message.
+///
+/// This error type implements the `std::error::Error` trait only when the
+/// `std` feature is enabled. Otherwise, this type is defined in all
+/// configurations.
+#[derive(Debug)]
+pub struct DeserializeError(DeserializeErrorKind);
+
+#[derive(Debug)]
+enum DeserializeErrorKind {
+    Generic { msg: &'static str },
+    BufferTooSmall { what: &'static str },
+    InvalidUsize { what: &'static str },
+    InvalidVarint { what: &'static str },
+    VersionMismatch { expected: u32, found: u32 },
+    EndianMismatch { expected: u32, found: u32 },
+    AlignmentMismatch { alignment: usize, address: usize },
+    LabelMismatch { expected: &'static str },
+    ArithmeticOverflow { what: &'static str },
+    PatternID { err: PatternIDError, what: &'static str },
+    StateID { err: StateIDError, what: &'static str },
+}
+
+impl DeserializeError {
+    pub(crate) fn generic(msg: &'static str) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::Generic { msg })
+    }
+
+    pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::BufferTooSmall { what })
+    }
+
+    pub(crate) fn invalid_usize(what: &'static str) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::InvalidUsize { what })
+    }
+
+    fn invalid_varint(what: &'static str) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::InvalidVarint { what })
+    }
+
+    fn version_mismatch(expected: u32, found: u32) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::VersionMismatch {
+            expected,
+            found,
+        })
+    }
+
+    fn endian_mismatch(expected: u32, found: u32) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::EndianMismatch {
+            expected,
+            found,
+        })
+    }
+
+    fn alignment_mismatch(
+        alignment: usize,
+        address: usize,
+    ) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::AlignmentMismatch {
+            alignment,
+            address,
+        })
+    }
+
+    fn label_mismatch(expected: &'static str) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::LabelMismatch { expected })
+    }
+
+    fn arithmetic_overflow(what: &'static str) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what })
+    }
+
+    pub(crate) fn pattern_id_error(
+        err: PatternIDError,
+        what: &'static str,
+    ) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::PatternID { err, what })
+    }
+
+    pub(crate) fn state_id_error(
+        err: StateIDError,
+        what: &'static str,
+    ) -> DeserializeError {
+        DeserializeError(DeserializeErrorKind::StateID { err, what })
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for DeserializeError {}
+
+impl core::fmt::Display for DeserializeError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        use self::DeserializeErrorKind::*;
+
+        match self.0 {
+            Generic { msg } => write!(f, "{}", msg),
+            BufferTooSmall { what } => {
+                write!(f, "buffer is too small to read {}", what)
+            }
+            InvalidUsize { what } => {
+                write!(f, "{} is too big to fit in a usize", what)
+            }
+            InvalidVarint { what } => {
+                write!(f, "could not decode valid varint for {}", what)
+            }
+            VersionMismatch { expected, found } => write!(
+                f,
+                "unsupported version: \
+                 expected version {} but found version {}",
+                expected, found,
+            ),
+            EndianMismatch { expected, found } => write!(
+                f,
+                "endianness mismatch: expected 0x{:X} but got 0x{:X}. \
+                 (Are you trying to load an object serialized with a \
+                 different endianness?)",
+                expected, found,
+            ),
+            AlignmentMismatch { alignment, address } => write!(
+                f,
+                "alignment mismatch: slice starts at address \
+                 0x{:X}, which is not aligned to a {} byte boundary",
+                address, alignment,
+            ),
+            LabelMismatch { expected } => write!(
+                f,
+                "label mismatch: start of serialized object should \
+                 contain a NUL terminated {:?} label, but a different \
+                 label was found",
+                expected,
+            ),
+            ArithmeticOverflow { what } => {
+                write!(f, "arithmetic overflow for {}", what)
+            }
+            PatternID { ref err, what } => {
+                write!(f, "failed to read pattern ID for {}: {}", what, err)
+            }
+            StateID { ref err, what } => {
+                write!(f, "failed to read state ID for {}: {}", what, err)
+            }
+        }
+    }
+}
+
+/// Checks that the given slice has an alignment that matches `T`.
+///
+/// This is useful for checking that a slice has an appropriate alignment
+/// before casting it to a &[T]. Note though that alignment is not itself
+/// sufficient to perform the cast for any `T`.
+pub fn check_alignment<T>(slice: &[u8]) -> Result<(), DeserializeError> {
+    let alignment = core::mem::align_of::<T>();
+    let address = slice.as_ptr() as usize;
+    if address % alignment == 0 {
+        return Ok(());
+    }
+    Err(DeserializeError::alignment_mismatch(alignment, address))
+}
+
+/// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning
+/// of the given slice. All padding bytes must be NUL bytes.
+///
+/// This is useful because it can be theoretically necessary to pad the
+/// beginning of a serialized object with NUL bytes to ensure that it starts
+/// at a correctly aligned address. These padding bytes should come immediately
+/// before the label.
+///
+/// This returns the number of bytes read from the given slice.
+pub fn skip_initial_padding(slice: &[u8]) -> usize {
+    let mut nread = 0;
+    while nread < 7 && nread < slice.len() && slice[nread] == 0 {
+        nread += 1;
+    }
+    nread
+}
+
+/// Allocate a byte buffer of the given size, along with some initial padding
+/// such that `buf[padding..]` has the same alignment as `T`, where the
+/// alignment of `T` must be at most `8`. In particular, callers should treat
+/// the first N bytes (second return value) as padding bytes that must not be
+/// overwritten. In all cases, the following identity holds:
+///
+/// ```ignore
+/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE);
+/// assert_eq!(SIZE, buf[padding..].len());
+/// ```
+///
+/// In practice, padding is often zero.
+///
+/// The requirement for `8` as a maximum here is somewhat arbitrary. In
+/// practice, we never need anything bigger in this crate, and so this function
+/// does some sanity asserts under the assumption of a max alignment of `8`.
+#[cfg(feature = "alloc")]
+pub fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
+    // FIXME: This is a kludge because there's no easy way to allocate a
+    // Vec<u8> with an alignment guaranteed to be greater than 1. We could
+    // create a Vec<u32>, but this cannot be safely transmuted to a Vec<u8>
+    // without concern, since reallocing or dropping the Vec<u8> is UB
+    // (different alignment than the initial allocation). We could define a
+    // wrapper type to manage this for us, but it seems like more machinery
+    // than it's worth.
+    let mut buf = vec![0; size];
+    let align = core::mem::align_of::<T>();
+    let address = buf.as_ptr() as usize;
+    if address % align == 0 {
+        return (buf, 0);
+    }
+    // It's not quite clear how to robustly test this code, since the allocator
+    // in my environment appears to always return addresses aligned to at
+    // least 8 bytes, even when the alignment requirement is smaller. A feeble
+    // attempt at ensuring correctness is provided with asserts.
+    let padding = ((address & !0b111).checked_add(8).unwrap())
+        .checked_sub(address)
+        .unwrap();
+    assert!(padding <= 7, "padding of {} is bigger than 7", padding);
+    buf.extend(core::iter::repeat(0).take(padding));
+    assert_eq!(size + padding, buf.len());
+    assert_eq!(
+        0,
+        buf[padding..].as_ptr() as usize % align,
+        "expected end of initial padding to be aligned to {}",
+        align,
+    );
+    (buf, padding)
+}
+
+/// Reads a NUL terminated label starting at the beginning of the given slice.
+///
+/// If a NUL terminated label could not be found, then an error is returned.
+/// Similary, if a label is found but doesn't match the expected label, then
+/// an error is returned.
+///
+/// Upon success, the total number of bytes read (including padding bytes) is
+/// returned.
+pub fn read_label(
+    slice: &[u8],
+    expected_label: &'static str,
+) -> Result<usize, DeserializeError> {
+    // Set an upper bound on how many bytes we scan for a NUL. Since no label
+    // in this crate is longer than 256 bytes, if we can't find one within that
+    // range, then we have corrupted data.
+    let first_nul =
+        slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0);
+    let first_nul = match first_nul {
+        Some(first_nul) => first_nul,
+        None => {
+            return Err(DeserializeError::generic(
+                "could not find NUL terminated label \
+                 at start of serialized object",
+            ));
+        }
+    };
+    let len = first_nul + padding_len(first_nul);
+    if slice.len() < len {
+        return Err(DeserializeError::generic(
+            "could not find properly sized label at start of serialized object"
+        ));
+    }
+    if expected_label.as_bytes() != &slice[..first_nul] {
+        return Err(DeserializeError::label_mismatch(expected_label));
+    }
+    Ok(len)
+}
+
+/// Writes the given label to the buffer as a NUL terminated string. The label
+/// given must not contain NUL, otherwise this will panic. Similarly, the label
+/// must not be longer than 255 bytes, otherwise this will panic.
+///
+/// Additional NUL bytes are written as necessary to ensure that the number of
+/// bytes written is always a multiple of 4.
+///
+/// Upon success, the total number of bytes written (including padding) is
+/// returned.
+pub fn write_label(
+    label: &str,
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_label_len(label);
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("label"));
+    }
+    dst[..label.len()].copy_from_slice(label.as_bytes());
+    for i in 0..(nwrite - label.len()) {
+        dst[label.len() + i] = 0;
+    }
+    assert_eq!(nwrite % 4, 0);
+    Ok(nwrite)
+}
+
+/// Returns the total number of bytes (including padding) that would be written
+/// for the given label. This panics if the given label contains a NUL byte or
+/// is longer than 255 bytes. (The size restriction exists so that searching
+/// for a label during deserialization can be done in small bounded space.)
+pub fn write_label_len(label: &str) -> usize {
+    if label.len() > 255 {
+        panic!("label must not be longer than 255 bytes");
+    }
+    if label.as_bytes().iter().position(|&b| b == 0).is_some() {
+        panic!("label must not contain NUL bytes");
+    }
+    let label_len = label.len() + 1; // +1 for the NUL terminator
+    label_len + padding_len(label_len)
+}
+
+/// Reads the endianness check from the beginning of the given slice and
+/// confirms that the endianness of the serialized object matches the expected
+/// endianness. If the slice is too small or if the endianness check fails,
+/// this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+pub fn read_endianness_check(slice: &[u8]) -> Result<usize, DeserializeError> {
+    let (n, nr) = try_read_u32(slice, "endianness check")?;
+    assert_eq!(nr, write_endianness_check_len());
+    if n != 0xFEFF {
+        return Err(DeserializeError::endian_mismatch(0xFEFF, n));
+    }
+    Ok(nr)
+}
+
+/// Writes 0xFEFF as an integer using the given endianness.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure the proper
+/// endianness is used.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_endianness_check<E: Endian>(
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_endianness_check_len();
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("endianness check"));
+    }
+    E::write_u32(0xFEFF, dst);
+    Ok(nwrite)
+}
+
+/// Returns the number of bytes written by the endianness check.
+pub fn write_endianness_check_len() -> usize {
+    size_of::<u32>()
+}
+
+/// Reads a version number from the beginning of the given slice and confirms
+/// that is matches the expected version number given. If the slice is too
+/// small or if the version numbers aren't equivalent, this returns an error.
+///
+/// Upon success, the total number of bytes read is returned.
+///
+/// N.B. Currently, we require that the version number is exactly equivalent.
+/// In the future, if we bump the version number without a semver bump, then
+/// we'll need to relax this a bit and support older versions.
+pub fn read_version(
+    slice: &[u8],
+    expected_version: u32,
+) -> Result<usize, DeserializeError> {
+    let (n, nr) = try_read_u32(slice, "version")?;
+    assert_eq!(nr, write_version_len());
+    if n != expected_version {
+        return Err(DeserializeError::version_mismatch(expected_version, n));
+    }
+    Ok(nr)
+}
+
+/// Writes the given version number to the beginning of the given slice.
+///
+/// This is useful for writing into the header of a serialized object. It can
+/// be read during deserialization as a sanity check to ensure that the library
+/// code supports the format of the serialized object.
+///
+/// Upon success, the total number of bytes written is returned.
+pub fn write_version<E: Endian>(
+    version: u32,
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let nwrite = write_version_len();
+    if dst.len() < nwrite {
+        return Err(SerializeError::buffer_too_small("version number"));
+    }
+    E::write_u32(version, dst);
+    Ok(nwrite)
+}
+
+/// Returns the number of bytes written by writing the version number.
+pub fn write_version_len() -> usize {
+    size_of::<u32>()
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the pattern
+/// ID limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn read_pattern_id(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(PatternID, usize), DeserializeError> {
+    let bytes: [u8; PatternID::SIZE] =
+        slice[..PatternID::SIZE].try_into().unwrap();
+    let pid = PatternID::from_ne_bytes(bytes)
+        .map_err(|err| DeserializeError::pattern_id_error(err, what))?;
+    Ok((pid, PatternID::SIZE))
+}
+
+/// Reads a pattern ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid pattern ID.
+///
+/// This also returns the number of bytes read.
+pub fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
+    let pid = PatternID::from_ne_bytes_unchecked(
+        slice[..PatternID::SIZE].try_into().unwrap(),
+    );
+    (pid, PatternID::SIZE)
+}
+
+/// Write the given pattern ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
+pub fn write_pattern_id<E: Endian>(pid: PatternID, dst: &mut [u8]) -> usize {
+    E::write_u32(pid.as_u32(), dst);
+    PatternID::SIZE
+}
+
+/// Attempts to read a state ID from the given slice. If the slice has an
+/// insufficient number of bytes or if the state ID exceeds the limit for
+/// the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_state_id(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(StateID, usize), DeserializeError> {
+    if slice.len() < StateID::SIZE {
+        return Err(DeserializeError::buffer_too_small(what));
+    }
+    read_state_id(slice, what)
+}
+
+/// Reads a state ID from the given slice. If the slice has insufficient
+/// length, then this panics. If the deserialized integer exceeds the state ID
+/// limit for the current target, then this returns an error.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn read_state_id(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(StateID, usize), DeserializeError> {
+    let bytes: [u8; StateID::SIZE] =
+        slice[..StateID::SIZE].try_into().unwrap();
+    let sid = StateID::from_ne_bytes(bytes)
+        .map_err(|err| DeserializeError::state_id_error(err, what))?;
+    Ok((sid, StateID::SIZE))
+}
+
+/// Reads a state ID from the given slice. If the slice has insufficient
+/// length, then this panics. Otherwise, the deserialized integer is assumed
+/// to be a valid state ID.
+///
+/// This also returns the number of bytes read.
+pub fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
+    let sid = StateID::from_ne_bytes_unchecked(
+        slice[..StateID::SIZE].try_into().unwrap(),
+    );
+    (sid, StateID::SIZE)
+}
+
+/// Write the given state ID to the beginning of the given slice of bytes
+/// using the specified endianness. The given slice must have length at least
+/// `StateID::SIZE`, or else this panics. Upon success, the total number of
+/// bytes written is returned.
+pub fn write_state_id<E: Endian>(sid: StateID, dst: &mut [u8]) -> usize {
+    E::write_u32(sid.as_u32(), dst);
+    StateID::SIZE
+}
+
+/// Try to read a u16 as a usize from the beginning of the given slice in
+/// native endian format. If the slice has fewer than 2 bytes or if the
+/// deserialized number cannot be represented by usize, then this returns an
+/// error. The error message will include the `what` description of what is
+/// being deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u16_as_usize(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+    try_read_u16(slice, what).and_then(|(n, nr)| {
+        usize::try_from(n)
+            .map(|n| (n, nr))
+            .map_err(|_| DeserializeError::invalid_usize(what))
+    })
+}
+
+/// Try to read a u32 as a usize from the beginning of the given slice in
+/// native endian format. If the slice has fewer than 4 bytes or if the
+/// deserialized number cannot be represented by usize, then this returns an
+/// error. The error message will include the `what` description of what is
+/// being deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u32_as_usize(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+    try_read_u32(slice, what).and_then(|(n, nr)| {
+        usize::try_from(n)
+            .map(|n| (n, nr))
+            .map_err(|_| DeserializeError::invalid_usize(what))
+    })
+}
+
+/// Try to read a u16 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 2 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u16(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(u16, usize), DeserializeError> {
+    if slice.len() < size_of::<u16>() {
+        return Err(DeserializeError::buffer_too_small(what));
+    }
+    Ok((read_u16(slice), size_of::<u16>()))
+}
+
+/// Try to read a u32 from the beginning of the given slice in native endian
+/// format. If the slice has fewer than 4 bytes, then this returns an error.
+/// The error message will include the `what` description of what is being
+/// deserialized, for better error messages. `what` should be a noun in
+/// singular form.
+///
+/// Upon success, this also returns the number of bytes read.
+pub fn try_read_u32(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(u32, usize), DeserializeError> {
+    if slice.len() < size_of::<u32>() {
+        return Err(DeserializeError::buffer_too_small(what));
+    }
+    Ok((read_u32(slice), size_of::<u32>()))
+}
+
+/// Read a u16 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 2 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u16(slice: &[u8]) -> u16 {
+    let bytes: [u8; 2] = slice[..size_of::<u16>()].try_into().unwrap();
+    u16::from_ne_bytes(bytes)
+}
+
+/// Read a u32 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 4 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u32(slice: &[u8]) -> u32 {
+    let bytes: [u8; 4] = slice[..size_of::<u32>()].try_into().unwrap();
+    u32::from_ne_bytes(bytes)
+}
+
+/// Read a u64 from the beginning of the given slice in native endian format.
+/// If the slice has fewer than 8 bytes, then this panics.
+///
+/// Marked as inline to speed up sparse searching which decodes integers from
+/// its automaton at search time.
+#[inline(always)]
+pub fn read_u64(slice: &[u8]) -> u64 {
+    let bytes: [u8; 8] = slice[..size_of::<u64>()].try_into().unwrap();
+    u64::from_ne_bytes(bytes)
+}
+
+/// Write a variable sized integer and return the total number of bytes
+/// written. If the slice was not big enough to contain the bytes, then this
+/// returns an error including the "what" description in it. This does no
+/// padding.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn write_varu64(
+    mut n: u64,
+    what: &'static str,
+    dst: &mut [u8],
+) -> Result<usize, SerializeError> {
+    let mut i = 0;
+    while n >= 0b1000_0000 {
+        if i >= dst.len() {
+            return Err(SerializeError::buffer_too_small(what));
+        }
+        dst[i] = (n as u8) | 0b1000_0000;
+        n >>= 7;
+        i += 1;
+    }
+    if i >= dst.len() {
+        return Err(SerializeError::buffer_too_small(what));
+    }
+    dst[i] = n as u8;
+    Ok(i + 1)
+}
+
+/// Returns the total number of bytes that would be writen to encode n as a
+/// variable sized integer.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn write_varu64_len(mut n: u64) -> usize {
+    let mut i = 0;
+    while n >= 0b1000_0000 {
+        n >>= 7;
+        i += 1;
+    }
+    i + 1
+}
+
+/// Like read_varu64, but attempts to cast the result to usize. If the integer
+/// cannot fit into a usize, then an error is returned.
+#[allow(dead_code)]
+pub fn read_varu64_as_usize(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(usize, usize), DeserializeError> {
+    let (n, nread) = read_varu64(slice, what)?;
+    let n = usize::try_from(n)
+        .map_err(|_| DeserializeError::invalid_usize(what))?;
+    Ok((n, nread))
+}
+
+/// Reads a variable sized integer from the beginning of slice, and returns the
+/// integer along with the total number of bytes read. If a valid variable
+/// sized integer could not be found, then an error is returned that includes
+/// the "what" description in it.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+#[allow(dead_code)]
+pub fn read_varu64(
+    slice: &[u8],
+    what: &'static str,
+) -> Result<(u64, usize), DeserializeError> {
+    let mut n: u64 = 0;
+    let mut shift: u32 = 0;
+    // The biggest possible value is u64::MAX, which needs all 64 bits which
+    // requires 10 bytes (because 7 * 9 < 64). We use a limit to avoid reading
+    // an unnecessary number of bytes.
+    let limit = cmp::min(slice.len(), 10);
+    for (i, &b) in slice[..limit].iter().enumerate() {
+        if b < 0b1000_0000 {
+            return match (b as u64).checked_shl(shift) {
+                None => Err(DeserializeError::invalid_varint(what)),
+                Some(b) => Ok((n | b, i + 1)),
+            };
+        }
+        match ((b as u64) & 0b0111_1111).checked_shl(shift) {
+            None => return Err(DeserializeError::invalid_varint(what)),
+            Some(b) => n |= b,
+        }
+        shift += 7;
+    }
+    Err(DeserializeError::invalid_varint(what))
+}
+
+/// Checks that the given slice has some minimal length. If it's smaller than
+/// the bound given, then a "buffer too small" error is returned with `what`
+/// describing what the buffer represents.
+pub fn check_slice_len<T>(
+    slice: &[T],
+    at_least_len: usize,
+    what: &'static str,
+) -> Result<(), DeserializeError> {
+    if slice.len() < at_least_len {
+        return Err(DeserializeError::buffer_too_small(what));
+    }
+    Ok(())
+}
+
+/// Multiply the given numbers, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn mul(
+    a: usize,
+    b: usize,
+    what: &'static str,
+) -> Result<usize, DeserializeError> {
+    match a.checked_mul(b) {
+        Some(c) => Ok(c),
+        None => Err(DeserializeError::arithmetic_overflow(what)),
+    }
+}
+
+/// Add the given numbers, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn add(
+    a: usize,
+    b: usize,
+    what: &'static str,
+) -> Result<usize, DeserializeError> {
+    match a.checked_add(b) {
+        Some(c) => Ok(c),
+        None => Err(DeserializeError::arithmetic_overflow(what)),
+    }
+}
+
+/// Shift `a` left by `b`, and on overflow, return an error that includes
+/// 'what' in the error message.
+///
+/// This is useful when doing arithmetic with untrusted data.
+pub fn shl(
+    a: usize,
+    b: usize,
+    what: &'static str,
+) -> Result<usize, DeserializeError> {
+    let amount = u32::try_from(b)
+        .map_err(|_| DeserializeError::arithmetic_overflow(what))?;
+    match a.checked_shl(amount) {
+        Some(c) => Ok(c),
+        None => Err(DeserializeError::arithmetic_overflow(what)),
+    }
+}
+
+/// A simple trait for writing code generic over endianness.
+///
+/// This is similar to what byteorder provides, but we only need a very small
+/// subset.
+pub trait Endian {
+    /// Writes a u16 to the start of the given destination buffer in a
+    /// particular endianness. If the destination buffer has a length smaller
+    /// than 2, then this panics.
+    fn write_u16(n: u16, dst: &mut [u8]);
+
+    /// Writes a u32 to the start of the given destination buffer in a
+    /// particular endianness. If the destination buffer has a length smaller
+    /// than 4, then this panics.
+    fn write_u32(n: u32, dst: &mut [u8]);
+
+    /// Writes a u64 to the start of the given destination buffer in a
+    /// particular endianness. If the destination buffer has a length smaller
+    /// than 8, then this panics.
+    fn write_u64(n: u64, dst: &mut [u8]);
+}
+
+/// Little endian writing. (This type has no values; it picks an `Endian` impl.)
+pub enum LE {}
+/// Big endian writing. (This type has no values; it picks an `Endian` impl.)
+pub enum BE {}
+
+#[cfg(target_endian = "little")]
+pub type NE = LE;
+#[cfg(target_endian = "big")]
+pub type NE = BE;
+
+impl Endian for LE {
+    fn write_u16(n: u16, dst: &mut [u8]) {
+        dst[..2].copy_from_slice(&n.to_le_bytes());
+    }
+
+    fn write_u32(n: u32, dst: &mut [u8]) {
+        dst[..4].copy_from_slice(&n.to_le_bytes());
+    }
+
+    fn write_u64(n: u64, dst: &mut [u8]) {
+        dst[..8].copy_from_slice(&n.to_le_bytes());
+    }
+}
+
+impl Endian for BE {
+    fn write_u16(n: u16, dst: &mut [u8]) {
+        dst[..2].copy_from_slice(&n.to_be_bytes());
+    }
+
+    fn write_u32(n: u32, dst: &mut [u8]) {
+        dst[..4].copy_from_slice(&n.to_be_bytes());
+    }
+
+    fn write_u64(n: u64, dst: &mut [u8]) {
+        dst[..8].copy_from_slice(&n.to_be_bytes());
+    }
+}
+
+/// Returns the number of additional bytes required to add to the given length
+/// in order to make the total length a multiple of 4. The return value is
+/// always less than 4.
+pub fn padding_len(non_padding_len: usize) -> usize {
+    (4 - (non_padding_len & 0b11)) & 0b11
+}
+
+#[cfg(all(test, feature = "alloc"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn labels() {
+        let mut buf = [0; 1024];
+
+        let nwrite = write_label("fooba", &mut buf).unwrap();
+        assert_eq!(nwrite, 8);
+        assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00");
+
+        let nread = read_label(&buf, "fooba").unwrap();
+        assert_eq!(nread, 8);
+    }
+
+    #[test]
+    #[should_panic]
+    fn bad_label_interior_nul() {
+        // interior NULs are not allowed, so this is expected to panic
+        write_label("foo\x00bar", &mut [0; 1024]).unwrap();
+    }
+
+    #[test]
+    fn bad_label_almost_too_long() {
+        // ok: a label of exactly 255 bytes is still permitted
+        write_label(&"z".repeat(255), &mut [0; 1024]).unwrap();
+    }
+
+    #[test]
+    #[should_panic]
+    fn bad_label_too_long() {
+        // labels longer than 255 bytes are banned
+        write_label(&"z".repeat(256), &mut [0; 1024]).unwrap();
+    }
+
+    #[test]
+    fn padding() {
+        assert_eq!(0, padding_len(8));
+        assert_eq!(3, padding_len(9));
+        assert_eq!(2, padding_len(10));
+        assert_eq!(1, padding_len(11));
+        assert_eq!(0, padding_len(12));
+        assert_eq!(3, padding_len(13));
+        assert_eq!(2, padding_len(14));
+        assert_eq!(1, padding_len(15));
+        assert_eq!(0, padding_len(16));
+    }
+}
diff --git a/src/util/determinize/mod.rs b/src/util/determinize/mod.rs

new file mode 100644 (file)

index 0000000..b384de8
--- /dev/null
+++ b/src/util/determinize/mod.rs
@@ -0,0 +1,493 @@
+/*!
+This module contains types and routines for implementing determinization.
+
+In this crate, there are at least two places where we implement
+determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
+lazily compiled DFAs in the `hybrid` module. The stuff in this module
+corresponds to the things that are in common between these implementations.
+
+There are three broad things that our implementations of determinization have
+in common, as defined by this module:
+
+* The classification of start states. That is, whether we're dealing with
+word boundaries, line boundaries, etc., is all the same. This also includes
+the look-behind assertions that are satisfied by each starting state
+classification.
+
+* The representation of DFA states as sets of NFA states, including
+convenience types for building these DFA states that are amenable to reusing
+allocations.
+
+* Routines for the "classical" parts of determinization: computing the
+epsilon closure, tracking match states (with corresponding pattern IDs, since
+we support multi-pattern finite automata) and, of course, computing the
+transition function between states for units of input.
+
+I did consider a couple of alternatives to this particular form of code reuse:
+
+1. Don't do any code reuse. The problem here is that we *really* want both
+forms of determinization to do exactly identical things when it comes to
+their handling of NFA states. While our tests generally ensure this, the code
+is tricky and large enough where not reusing code is a pretty big bummer.
+
+2. Implement all of determinization once and make it generic over fully
+compiled DFAs and lazily compiled DFAs. While I didn't actually try this
+approach, my instinct is that it would be more complex than is needed here.
+And the interface required would be pretty hairy. Instead, I think splitting
+it into logical sub-components works better.
+*/
+
+use alloc::vec::Vec;
+
+pub(crate) use self::state::{
+    State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
+};
+
+use crate::{
+    nfa::thompson::{self, Look, LookSet},
+    util::{
+        alphabet,
+        id::StateID,
+        matchtypes::MatchKind,
+        sparse_set::{SparseSet, SparseSets},
+        start::Start,
+    },
+};
+
+mod state;
+
+/// Compute the set of all reachable NFA states, including the full epsilon
+/// closure, from a DFA state for a single unit of input. The set of reachable
+/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
+/// also includes any look-behind assertions satisfied by `unit`, in addition
+/// to whether it is a match state. For multi-pattern DFAs, the builder will
+/// also include the pattern IDs that match (in the order seen).
+///
+/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
+/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
+/// must have capacity equivalent to `nfa.len()`.
+///
+/// `match_kind` should correspond to the match semantics implemented by the
+/// DFA being built. Generally speaking, for leftmost-first match semantics,
+/// states that appear after the first NFA match state will not be included in
+/// the `StateBuilderNFA` returned since they are impossible to visit.
+///
+/// `sparses` is used as scratch space for NFA traversal. Other than their
+/// capacity requirements (detailed above), there are no requirements on what's
+/// contained within them (if anything). Similarly, what's inside of them once
+/// this routine returns is unspecified.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+///
+/// `state` corresponds to the current DFA state on which one wants to compute
+/// the transition for the input `unit`.
+///
+/// `empty_builder` corresponds to the builder allocation to use to produce a
+/// complete `StateBuilderNFA` state. If the state is not needed (or is already
+/// cached), then it can be cleared and reused without needing to create a new
+/// `State`. The `StateBuilderNFA` state returned is final and ready to be
+/// turned into a `State` if necessary.
+pub(crate) fn next(
+    nfa: &thompson::NFA,
+    match_kind: MatchKind,
+    sparses: &mut SparseSets,
+    stack: &mut Vec<StateID>,
+    state: &State,
+    unit: alphabet::Unit,
+    empty_builder: StateBuilderEmpty,
+) -> StateBuilderNFA {
+    sparses.clear();
+
+    // Put the NFA state IDs into a sparse set in case we need to
+    // re-compute their epsilon closure.
+    //
+    // Doing this state shuffling is technically not necessary unless some
+    // kind of look-around is used in the DFA. Some ad hoc experiments
+    // suggested that avoiding this didn't lead to much of an improvement,
+    // but perhaps more rigorous experimentation should be done. And in
+    // particular, avoiding this check requires some light refactoring of
+    // the code below.
+    state.iter_nfa_state_ids(|nfa_id| {
+        sparses.set1.insert(nfa_id);
+    });
+
+    // Compute look-ahead assertions originating from the current state.
+    // Based on the input unit we're transitioning over, some additional
+    // set of assertions may be true. Thus, we re-compute this state's
+    // epsilon closure (but only if necessary).
+    if !state.look_need().is_empty() {
+        // Add look-ahead assertions that are now true based on the current
+        // input unit.
+        let mut look_have = state.look_have().clone();
+        match unit.as_u8() {
+            Some(b'\n') => {
+                look_have.insert(Look::EndLine);
+            }
+            Some(_) => {}
+            None => {
+                look_have.insert(Look::EndText);
+                look_have.insert(Look::EndLine);
+            }
+        }
+        if state.is_from_word() == unit.is_word_byte() {
+            look_have.insert(Look::WordBoundaryUnicodeNegate);
+            look_have.insert(Look::WordBoundaryAsciiNegate);
+        } else {
+            look_have.insert(Look::WordBoundaryUnicode);
+            look_have.insert(Look::WordBoundaryAscii);
+        }
+        // If we have new assertions satisfied that are among the set of
+        // assertions that exist in this state (that is, just because
+        // we added an EndLine assertion above doesn't mean there is an
+        // EndLine conditional epsilon transition in this state), then we
+        // re-compute this state's epsilon closure using the updated set of
+        // assertions.
+        if !look_have
+            .subtract(state.look_have())
+            .intersect(state.look_need())
+            .is_empty()
+        {
+            for nfa_id in &sparses.set1 {
+                epsilon_closure(
+                    nfa,
+                    nfa_id,
+                    look_have,
+                    stack,
+                    &mut sparses.set2,
+                );
+            }
+            sparses.swap();
+            sparses.set2.clear();
+        }
+    }
+
+    // Convert our empty builder into one that can record assertions and match
+    // pattern IDs.
+    let mut builder = empty_builder.into_matches();
+    // Set whether the StartLine look-behind assertion is true for this
+    // transition or not. The look-behind assertion for ASCII word boundaries
+    // is handled below.
+    if nfa.has_any_anchor() {
+        if unit.as_u8().map_or(false, |b| b == b'\n') {
+            // Why only handle StartLine here and not StartText? That's
+            // because StartText can only impact the starting state, which
+            // is special cased in start state handling.
+            builder.look_have().insert(Look::StartLine);
+        }
+    }
+    for nfa_id in &sparses.set1 {
+        match *nfa.state(nfa_id) {
+            thompson::State::Union { .. }
+            | thompson::State::Fail
+            | thompson::State::Look { .. }
+            | thompson::State::Capture { .. } => {}
+            thompson::State::Match { id } => {
+                // Notice here that we are calling the NEW state a match
+                // state if the OLD state we are transitioning from
+                // contains an NFA match state. This is precisely how we
+                // delay all matches by one byte and also what therefore
+                // guarantees that starting states cannot be match states.
+                //
+                // If we didn't delay matches by one byte, then whether
+                // a DFA is a matching state or not would be determined
+                // by whether one of its own constituent NFA states
+                // was a match state. (And that would be done in
+                // 'add_nfa_states'.)
+                //
+                // Also, 'add_match_pattern_id' requires that callers never
+                // pass duplicative pattern IDs. We do in fact uphold that
+                // guarantee here, but it's subtle. In particular, a Thompson
+                // NFA guarantees that each pattern has exactly one match
+                // state. Moreover, since we're iterating over the NFA state
+                // IDs in a set, we are guaranteed not to have any duplicative
+                // match states. Thus, it is impossible to add the same pattern
+                // ID more than once.
+                builder.add_match_pattern_id(id);
+                if !match_kind.continue_past_first_match() {
+                    break;
+                }
+            }
+            thompson::State::Range { range: ref r } => {
+                if r.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        r.next,
+                        *builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+            thompson::State::Sparse(ref sparse) => {
+                if let Some(next) = sparse.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        next,
+                        *builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+        }
+    }
+    // We only set the word byte if there's a word boundary look-around
+    // anywhere in this regex. Otherwise, there's no point in bloating the
+    // number of states if we don't have one.
+    //
+    // We also only set it when the state has a non-zero number of NFA states.
+    // Otherwise, we could wind up with states that *should* be DEAD states
+    // but are otherwise distinct from DEAD states because of this look-behind
+    // assertion being set. While this can't technically impact correctness *in
+    // theory*, it can create pathological DFAs that consume input until EOI or
+    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
+    // but a (serious) perf problem. Hitting a quit byte, however, could be a
+    // correctness problem since it could cause search routines to report an
+    // error instead of a detected match once the quit state is entered. (The
+    // search routine could be made to be a bit smarter by reporting a match
+    // if one was detected once it enters a quit state (and indeed, the search
+    // routines in this crate do just that), but it seems better to prevent
+    // these things by construction if possible.)
+    if nfa.has_word_boundary()
+        && unit.is_word_byte()
+        && !sparses.set2.is_empty()
+    {
+        builder.set_is_from_word();
+    }
+    let mut builder_nfa = builder.into_nfa();
+    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
+    builder_nfa
+}
+
+/// Compute the epsilon closure for the given NFA state. The epsilon closure
+/// consists of all NFA state IDs, including `start_nfa_id`, that can be
+/// reached from `start_nfa_id` without consuming any input. These state IDs
+/// are written to `set` in the order they are visited, but only if they are
+/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
+/// given.
+///
+/// `look_have` consists of the satisfied assertions at the current
+/// position. For conditional look-around epsilon transitions, these are
+/// only followed if they are satisfied by `look_have`.
+///
+/// `stack` must be empty on entry (this is asserted). It is used as scratch
+/// space for depth first traversal. After returning, it is guaranteed that
+/// `stack` will have length 0.
+pub(crate) fn epsilon_closure(
+    nfa: &thompson::NFA,
+    start_nfa_id: StateID,
+    look_have: LookSet,
+    stack: &mut Vec<StateID>,
+    set: &mut SparseSet,
+) {
+    assert!(stack.is_empty());
+    // If this isn't an epsilon state, then the epsilon closure is always just
+    // itself, so there's no need to spin up the machinery below to handle it.
+    if !nfa.state(start_nfa_id).is_epsilon() {
+        set.insert(start_nfa_id);
+        return;
+    }
+
+    stack.push(start_nfa_id);
+    while let Some(mut id) = stack.pop() {
+        // In many cases, we can avoid stack operations when an NFA state only
+        // adds one new state to visit. In that case, we just set our ID to
+        // that state and mush on. We only use the stack when an NFA state
+        // introduces multiple new states to visit.
+        loop {
+            // Insert this NFA state, and if it's already in the set and thus
+            // already visited, then we can move on to the next one.
+            if !set.insert(id) {
+                break;
+            }
+            match *nfa.state(id) {
+                thompson::State::Range { .. }
+                | thompson::State::Sparse { .. }
+                | thompson::State::Fail
+                | thompson::State::Match { .. } => break,
+                thompson::State::Look { look, next } => {
+                    if !look_have.contains(look) {
+                        break;
+                    }
+                    id = next;
+                }
+                thompson::State::Union { ref alternates } => {
+                    id = match alternates.get(0) {
+                        None => break,
+                        Some(&id) => id,
+                    };
+                    // We need to process our alternates in order to preserve
+                    // match preferences, so push the rest in reverse to put
+                    // the earliest alternates closer to the top of the stack.
+                    stack.extend(alternates[1..].iter().rev());
+                }
+                thompson::State::Capture { next, .. } => {
+                    id = next;
+                }
+            }
+        }
+    }
+}
+
+/// Add the NFA state IDs in the given `set` to the given DFA builder state.
+/// The order in which states are added corresponds to the order in which they
+/// were added to `set`.
+///
+/// The DFA builder state given should already have its complete set of match
+/// pattern IDs added (if any) and any look-behind assertions (StartLine,
+/// StartText and whether this state is being generated for a transition over a
+/// word byte when applicable) that are true immediately prior to transitioning
+/// into this state (via `builder.look_have()`). The match pattern IDs should
+/// correspond to matches that occurred on the previous transition, since all
+/// matches are delayed by one byte. The things that should _not_ be set are
+/// look-ahead assertions (EndLine, EndText and whether the next byte is a
+/// word byte or not). The builder state should also not have anything in
+/// `look_need` set, as this routine will compute that for you.
+///
+/// The given NFA should be able to resolve all identifiers in `set` to a
+/// particular NFA state. Additionally, `set` must have capacity equivalent
+/// to `nfa.len()`.
+pub(crate) fn add_nfa_states(
+    nfa: &thompson::NFA,
+    set: &SparseSet,
+    builder: &mut StateBuilderNFA,
+) {
+    for nfa_id in set {
+        match *nfa.state(nfa_id) {
+            thompson::State::Range { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Sparse { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Look { look, .. } => {
+                builder.add_nfa_state_id(nfa_id);
+                builder.look_need().insert(look);
+            }
+            thompson::State::Union { .. }
+            | thompson::State::Capture { .. } => {
+                // Pure epsilon transitions don't need to be tracked
+                // as part of the DFA state. Tracking them is actually
+                // superfluous; they won't cause any harm other than making
+                // determinization slower.
+                //
+                // Why aren't these needed? Well, in an NFA, epsilon
+                // transitions are really just jumping points to other
+                // states. So once you hit an epsilon transition, the same
+                // set of resulting states always appears. Therefore,
+                // putting them in a DFA's set of ordered NFA states is
+                // strictly redundant.
+                //
+                // Look-around states are also epsilon transitions, but
+                // they are *conditional*. So their presence could be
+                // discriminatory, and thus, they are tracked above.
+                //
+                // But wait... why are epsilon states in our `set` in the
+                // first place? Why not just leave them out? They're in
+                // our `set` because it was generated by computing an
+                // epsilon closure, and we want to keep track of all states
+                // we visited to avoid re-visiting them. In exchange, we
+                // have to do this second iteration over our collected
+                // states to finalize our DFA state.
+                //
+                // Note that this optimization requires that we re-compute
+                // the epsilon closure to account for look-ahead in 'next'
+                // *only when necessary*. Namely, only when the set of
+                // look-around assertions changes and only when those
+                // changes are within the set of assertions that are
+                // needed in order to step through the closure correctly.
+                // Otherwise, if we re-do the epsilon closure needlessly,
+                // it could change based on the fact that we are omitting
+                // epsilon states here.
+            }
+            thompson::State::Fail => {
+                break;
+            }
+            thompson::State::Match { .. } => {
+                // Normally, the NFA match state doesn't actually need to
+                // be inside the DFA state. But since we delay matches by
+                // one byte, the matching DFA state corresponds to states
+                // that transition from the one we're building here. And
+                // the way we detect those cases is by looking for an NFA
+                // match state. See 'next' for how this is handled.
+                builder.add_nfa_state_id(nfa_id);
+            }
+        }
+    }
+    // If we know this state contains no look-around assertions, then
+    // there's no reason to track which look-around assertions were
+    // satisfied when this state was created.
+    if builder.look_need().is_empty() {
+        builder.look_have().clear();
+    }
+}
+
+/// Sets on the given builder the look-behind state implied by the starting
+/// configuration: is-from-word, StartText and/or StartLine (or nothing).
+pub(crate) fn set_lookbehind_from_start(
+    start: &Start,
+    builder: &mut StateBuilderMatches,
+) {
+    match *start {
+        Start::NonWordByte => {}
+        Start::WordByte => {
+            builder.set_is_from_word();
+        }
+        Start::Text => {
+            builder.look_have().insert(Look::StartText);
+            builder.look_have().insert(Look::StartLine);
+        }
+        Start::Line => {
+            builder.look_have().insert(Look::StartLine);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Start;
+
+    #[test]
+    #[should_panic]
+    fn start_fwd_bad_range() {
+        Start::from_position_fwd(&[], 0, 1);
+    }
+
+    #[test]
+    #[should_panic]
+    fn start_rev_bad_range() {
+        Start::from_position_rev(&[], 0, 1);
+    }
+
+    #[test]
+    fn start_fwd() {
+        let f = Start::from_position_fwd;
+
+        assert_eq!(Start::Text, f(&[], 0, 0));
+        assert_eq!(Start::Text, f(b"abc", 0, 3));
+        assert_eq!(Start::Text, f(b"\nabc", 0, 3));
+
+        assert_eq!(Start::Line, f(b"\nabc", 1, 3));
+
+        assert_eq!(Start::WordByte, f(b"abc", 1, 3));
+
+        assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
+    }
+
+    #[test]
+    fn start_rev() {
+        let f = Start::from_position_rev;
+
+        assert_eq!(Start::Text, f(&[], 0, 0));
+        assert_eq!(Start::Text, f(b"abc", 0, 3));
+        assert_eq!(Start::Text, f(b"abc\n", 0, 4));
+
+        assert_eq!(Start::Line, f(b"abc\nz", 0, 3));
+
+        assert_eq!(Start::WordByte, f(b"abc", 0, 2));
+
+        assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
+    }
+}
diff --git a/src/util/determinize/state.rs b/src/util/determinize/state.rs

new file mode 100644 (file)

index 0000000..567e600
--- /dev/null
+++ b/src/util/determinize/state.rs
@@ -0,0 +1,873 @@
+/*!
+This module defines a DFA state representation and builders for constructing
+DFA states.
+
+This representation is specifically for use in implementations of NFA-to-DFA
+conversion via powerset construction. (Also called "determinization" in this
+crate.)
+
+The term "DFA state" is somewhat overloaded in this crate. In some cases, it
+refers to the set of transitions over an alphabet for a particular state. In
+other cases, it refers to a set of NFA states. The former is really about the
+final representation of a state in a DFA's transition table, whereas the
+latter---what this module is focused on---is closer to an intermediate form that
+is used to help eventually build the transition table.
+
+This module exports four types. All four types represent the same idea: an
+ordered set of NFA states. This ordered set represents the epsilon closure of a
+particular NFA state, where the "epsilon closure" is the set of NFA states that
+can be transitioned to without consuming any input. i.e., Follow all of the NFA
+state's epsilon transitions. In addition, this implementation of DFA states
+cares about two other things: the ordered set of pattern IDs corresponding
+to the patterns that match if the state is a match state, and the set of
+look-behind assertions that were true when the state was created.
+
+The first, `State`, is a frozen representation of a state that cannot be
+modified. It may be cheaply cloned without copying the state itself and can be
+accessed safely from multiple threads simultaneously. This type is useful for
+when one knows that the DFA state being constructed is distinct from any other
+previously constructed states. Namely, powerset construction, in practice,
+requires one to keep a cache of previously created DFA states. Otherwise,
+the number of DFA states created in memory balloons to an impractically
+large number. For this reason, equivalent states should endeavor to have an
+equivalent byte-level representation. (In general, "equivalency" here means,
+"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
+full DFA minimization be implemented here. This form of equivalency is only
+surface deep and is more-or-less a practical necessity.)
+
+The other three types represent different phases in the construction of a
+DFA state. Internally, these three types (and `State`) all use the same
+byte-oriented representation. That means one can use any of the builder types
+to check whether the state it represents already exists or not. If it does,
+then there is no need to freeze it into a `State` (which requires an alloc and
+a copy). Here are the three types described succinctly:
+
+* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
+and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
+`StateBuilderEmpty` can only be used to query its underlying memory capacity,
+or to convert into a builder for recording pattern IDs and/or assertions.
+* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
+or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
+can only be used for adding pattern IDs and recording assertions.
+* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
+more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
+can only be used for adding NFA state IDs and recording some assertions.
+
+The expected flow here is to use the above builders to construct a candidate
+DFA state to check if it already exists. If it does, then there's no need to
+freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
+can be called to freeze the builder into an immutable `State`. In either
+case, `clear` should be called on the builder to turn it back into a
+`StateBuilderEmpty` that reuses the underlying memory.
+
+The main purpose for splitting the builder into these distinct types is to
+make it impossible to do things like adding a pattern ID after adding an NFA
+state ID. Namely, this makes it simpler to use a space-and-time efficient
+binary representation for the state. (The format is documented on the `Repr`
+type below.) If we just used one type for everything, it would be possible for
+callers to use an incorrect interleaving of calls and thus result in a corrupt
+representation. I chose to use more type machinery to make this impossible to
+do because 1) determinization is itself pretty complex and it wouldn't be too
+hard to foul this up and 2) there isn't too much machinery involved and it's
+well contained.
+
+As an optimization, sometimes states won't have certain things set. For
+example, if the underlying NFA has no word boundary assertions, then there is
+no reason to set a state's look-behind assertion as to whether it was generated
+from a word byte or not. Similarly, if a state has no NFA states corresponding
+to look-around assertions, then there is no reason to set `look_have` to a
+non-empty set. Finally, callers usually omit unconditional epsilon transitions
+when adding NFA state IDs since they aren't discriminatory.
+
+Finally, the binary representation used by these states is, thankfully, not
+serialized anywhere. So any kind of change can be made with reckless abandon,
+as long as everything in this module agrees.
+*/
+
+use core::{convert::TryFrom, mem};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+    nfa::thompson::LookSet,
+    util::{
+        bytes::{self, Endian},
+        id::{PatternID, StateID},
+    },
+};
+
+/// A DFA state that, at its core, is represented by an ordered set of NFA
+/// states.
+///
+/// This type is intended to be used only in NFA-to-DFA conversion via powerset
+/// construction.
+///
+/// It may be cheaply cloned and accessed safely from multiple threads
+/// simultaneously. (The encoded bytes live behind an `Arc`, so cloning does
+/// not copy them.)
+#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct State(Arc<[u8]>);
+
+/// Exposes a `State` as its raw byte representation so that maps keyed by
+/// `State` can be queried with a plain `&[u8]`.
+///
+/// This is what lets a StateBuilder check whether an equivalent state already
+/// exists without first being converted into a `State`: when a match is found,
+/// the builder's allocation can simply be reused.
+impl core::borrow::Borrow<[u8]> for State {
+    fn borrow(&self) -> &[u8] {
+        // Deref coercion turns `&Arc<[u8]>` into `&[u8]`.
+        &self.0
+    }
+}
+
+/// Formats the state through its decoded `Repr` view instead of printing the
+/// raw bytes.
+impl core::fmt::Debug for State {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("State").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl State {
+    /// Builds the state obtained by freezing a builder to which nothing was
+    /// added: no flags, no assertions, no pattern IDs and no NFA state IDs.
+    pub(crate) fn dead() -> State {
+        StateBuilderEmpty::new().into_matches().into_nfa().to_state()
+    }
+
+    /// Returns true if and only if this is a match state.
+    pub(crate) fn is_match(&self) -> bool {
+        self.repr().is_match()
+    }
+
+    /// Returns true if and only if this state is marked as having been
+    /// created from a transition over a word byte.
+    pub(crate) fn is_from_word(&self) -> bool {
+        self.repr().is_from_word()
+    }
+
+    /// Returns the set of look-behind assertions recorded as satisfied when
+    /// this state was created.
+    pub(crate) fn look_have(&self) -> LookSet {
+        self.repr().look_have()
+    }
+
+    /// Returns the set of look-around assertions recorded as appearing in
+    /// this state's NFA states.
+    pub(crate) fn look_need(&self) -> LookSet {
+        self.repr().look_need()
+    }
+
+    /// Returns the number of match pattern IDs in this state (0 when this is
+    /// not a match state).
+    pub(crate) fn match_count(&self) -> usize {
+        self.repr().match_count()
+    }
+
+    /// Returns the match pattern ID at the given index. The index should be
+    /// less than `match_count()`.
+    pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
+        self.repr().match_pattern(index)
+    }
+
+    /// Returns a copy of all match pattern IDs, or `None` when this is not a
+    /// match state.
+    pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+        self.repr().match_pattern_ids()
+    }
+
+    /// Calls `f` on every match pattern ID in this state, in order.
+    pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
+        self.repr().iter_match_pattern_ids(f)
+    }
+
+    /// Calls `f` on every NFA state ID in this state, in order.
+    pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
+        self.repr().iter_nfa_state_ids(f)
+    }
+
+    /// Returns the length, in bytes, of this state's encoded representation.
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.0.len()
+    }
+
+    /// Returns a read-only decoding view over this state's bytes.
+    fn repr(&self) -> Repr<'_> {
+        Repr(&*self.0)
+    }
+}
+
+/// A state builder that represents an empty state.
+///
+/// This is a useful "initial condition" for state construction. It has no
+/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
+/// made when new() is called. Its main use is for being converted into a
+/// builder that can capture assertions and pattern IDs.
+#[derive(Clone, Debug)]
+pub(crate) struct StateBuilderEmpty(Vec<u8>);
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderEmpty {
+    /// Creates a builder backed by an empty byte buffer.
+    pub(crate) fn new() -> StateBuilderEmpty {
+        StateBuilderEmpty(alloc::vec![])
+    }
+
+    /// Converts this into a builder for recording pattern IDs and assertions.
+    ///
+    /// This appends the three zeroed header bytes of the representation: the
+    /// flags byte, the `look_have` byte and the `look_need` byte.
+    pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
+        self.0.extend_from_slice(&[0, 0, 0]);
+        StateBuilderMatches(self.0)
+    }
+
+    /// Removes all bytes from the underlying buffer. (`Vec::clear` keeps the
+    /// allocation, which is what permits reuse of the memory.)
+    fn clear(&mut self) {
+        self.0.clear();
+    }
+
+    /// Returns the capacity, in bytes, of the underlying buffer.
+    pub(crate) fn capacity(&self) -> usize {
+        self.0.capacity()
+    }
+}
+
+/// A state builder that collects assertions and pattern IDs.
+///
+/// When collecting pattern IDs is finished, this can be converted into a
+/// builder that collects NFA state IDs.
+#[derive(Clone)]
+pub(crate) struct StateBuilderMatches(Vec<u8>);
+
+/// Formats the builder through its decoded `Repr` view instead of printing
+/// the raw bytes.
+impl core::fmt::Debug for StateBuilderMatches {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderMatches {
+    /// Finishes pattern ID collection and converts this into a builder for
+    /// NFA state IDs.
+    ///
+    /// The pattern count is written into the representation first (when any
+    /// pattern IDs were explicitly encoded), and delta encoding of NFA state
+    /// IDs starts from `StateID::ZERO`.
+    pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
+        self.repr_vec().close_match_pattern_ids();
+        StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
+    }
+
+    /// Discards everything recorded so far and returns an empty builder that
+    /// owns the same buffer.
+    pub(crate) fn clear(self) -> StateBuilderEmpty {
+        let mut builder = StateBuilderEmpty(self.0);
+        builder.clear();
+        builder
+    }
+
+    /// Returns true if and only if this state has been marked as a match
+    /// state.
+    pub(crate) fn is_match(&self) -> bool {
+        self.repr().is_match()
+    }
+
+    /// Returns true if and only if this state is marked as having been
+    /// created from a transition over a word byte.
+    pub(crate) fn is_from_word(&self) -> bool {
+        self.repr().is_from_word()
+    }
+
+    /// Marks this state as having been created from a transition over a word
+    /// byte.
+    pub(crate) fn set_is_from_word(&mut self) {
+        self.repr_vec().set_is_from_word()
+    }
+
+    /// Returns a mutable reference to the `look_have` set, which is stored in
+    /// byte 1 of the representation.
+    pub(crate) fn look_have(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.0[1])
+    }
+
+    /// Returns a mutable reference to the `look_need` set, which is stored in
+    /// byte 2 of the representation.
+    pub(crate) fn look_need(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.0[2])
+    }
+
+    /// Records the given pattern ID as matching in this state. This also
+    /// marks the state as a match state.
+    pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
+        self.repr_vec().add_match_pattern_id(pid)
+    }
+
+    /// Returns a read-only decoding view over this builder's bytes.
+    fn repr(&self) -> Repr<'_> {
+        Repr(&self.0)
+    }
+
+    /// Returns a writable encoding view over this builder's bytes.
+    fn repr_vec(&mut self) -> ReprVec<'_> {
+        ReprVec(&mut self.0)
+    }
+}
+
+/// A state builder that collects some assertions and NFA state IDs.
+///
+/// When collecting NFA state IDs is finished, this can be used to build a
+/// `State` if necessary.
+///
+/// When done with building a state (regardless of whether it got kept or not),
+/// it's usually a good idea to call `clear` to get an empty builder back so
+/// that it can be reused to build the next state.
+#[derive(Clone)]
+pub(crate) struct StateBuilderNFA {
+    // The encoded representation built so far.
+    repr: Vec<u8>,
+    // The most recently added NFA state ID (initially StateID::ZERO). Each
+    // new ID is encoded as a delta relative to this one.
+    prev_nfa_state_id: StateID,
+}
+
+/// Formats the builder through its decoded `Repr` view instead of printing
+/// the raw bytes.
+impl core::fmt::Debug for StateBuilderNFA {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderNFA {
+    /// Freezes the current contents into an immutable `State`. This copies
+    /// the bytes into a new shared allocation and leaves the builder as is.
+    pub(crate) fn to_state(&self) -> State {
+        State(Arc::from(&*self.repr))
+    }
+
+    /// Discards everything recorded so far and returns an empty builder that
+    /// owns the same buffer.
+    pub(crate) fn clear(self) -> StateBuilderEmpty {
+        let mut builder = StateBuilderEmpty(self.repr);
+        builder.clear();
+        builder
+    }
+
+    /// Returns true if and only if this state has been marked as a match
+    /// state.
+    pub(crate) fn is_match(&self) -> bool {
+        self.repr().is_match()
+    }
+
+    /// Returns true if and only if this state is marked as having been
+    /// created from a transition over a word byte.
+    pub(crate) fn is_from_word(&self) -> bool {
+        self.repr().is_from_word()
+    }
+
+    /// Returns a mutable reference to the `look_have` set, which is stored in
+    /// byte 1 of the representation.
+    pub(crate) fn look_have(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.repr[1])
+    }
+
+    /// Returns a mutable reference to the `look_need` set, which is stored in
+    /// byte 2 of the representation.
+    pub(crate) fn look_need(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.repr[2])
+    }
+
+    /// Appends the given NFA state ID, delta encoded against the previously
+    /// added ID.
+    pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
+        ReprVec(&mut self.repr)
+            .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
+    }
+
+    /// Returns the length, in bytes, of the representation built so far.
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.repr.len()
+    }
+
+    /// Returns the raw bytes of the representation built so far.
+    pub(crate) fn as_bytes(&self) -> &[u8] {
+        &self.repr
+    }
+
+    /// Returns a read-only decoding view over this builder's bytes.
+    fn repr(&self) -> Repr<'_> {
+        Repr(&self.repr)
+    }
+
+    /// Returns a writable encoding view over this builder's bytes.
+    fn repr_vec(&mut self) -> ReprVec<'_> {
+        ReprVec(&mut self.repr)
+    }
+}
+
+/// Repr is a read-only view into the representation of a DFA state.
+///
+/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
+/// in one place, and then use a Repr to implement the various methods on the
+/// public state types.
+///
+/// The format is as follows:
+///
+/// The first three bytes correspond to bitsets.
+///
+/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
+/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
+/// if the state has pattern IDs explicitly written to it. (This is a flag that
+/// is not meant to be set by determinization, but rather, is used as part of
+/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
+/// generated by a transition over a "word" byte. (Callers may not always set
+/// this. For example, if the NFA has no word boundary assertion, then needing
+/// to track whether a state came from a word byte or not is superfluous and
+/// wasteful.)
+///
+/// Byte 1 corresponds to the look-behind assertions that were satisfied by
+/// the transition that created this state. This generally only includes the
+/// StartLine and StartText assertions. (Look-ahead assertions are not tracked
+/// as part of states. Instead, these are applied by re-computing the epsilon
+/// closure of a state when computing the transition function. See `next` in
+/// the parent module.)
+///
+/// Byte 2 corresponds to the set of look-around assertions (including both
+/// look-behind and look-ahead) that appear somewhere in this state's set of
+/// NFA state IDs. This is used to determine whether this state's epsilon
+/// closure should be re-computed when computing the transition function.
+/// Namely, look-around assertions are "just" conditional epsilon transitions,
+/// so if there are new assertions available when computing the transition
+/// function, we should only re-compute the epsilon closure if those new
+/// assertions are relevant to this particular state.
+///
+/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
+/// corresponding to the number of patterns encoded in this state. If the state
+/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
+/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
+/// offset 3 is the position at which the first NFA state ID is encoded.
+///
+/// For a match state with at least one non-ZERO pattern ID, the next bytes
+/// correspond to a sequence of 32-bit native endian encoded integers that
+/// represent each pattern ID, in order, that this match state represents.
+///
+/// After the pattern IDs (if any), NFA state IDs are delta encoded as
+/// varints.[1] The first NFA state ID is encoded as itself, and each
+/// subsequent NFA state ID is encoded as the difference between itself and the
+/// previous NFA state ID.
+///
+/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
+struct Repr<'a>(&'a [u8]);
+
+impl<'a> Repr<'a> {
+    /// Returns true if and only if this is a match state.
+    ///
+    /// If callers have added pattern IDs to this state, then callers MUST set
+    /// this state as a match state explicitly. However, as a special case, a
+    /// state that is marked as a match state but has no pattern IDs is
+    /// treated as if it had a single pattern ID equivalent to
+    /// PatternID::ZERO.
+    fn is_match(&self) -> bool {
+        self.0[0] & (1 << 0) > 0
+    }
+
+    /// Returns true if and only if this state has had at least one pattern
+    /// ID added to it.
+    ///
+    /// This is an internal-only flag that permits the representation to save
+    /// space in the common case of an NFA with one pattern in it. In that
+    /// case, a match state can only ever have exactly one pattern ID:
+    /// PatternID::ZERO. So there's no need to represent it.
+    fn has_pattern_ids(&self) -> bool {
+        self.0[0] & (1 << 1) > 0
+    }
+
+    /// Returns true if and only if this state is marked as having been created
+    /// from a transition over a word byte. This is useful for checking whether
+    /// a word boundary assertion is true or not, which requires look-behind
+    /// (whether the current state came from a word byte or not) and look-ahead
+    /// (whether the transition byte is a word byte or not).
+    ///
+    /// Since states with this set are distinct from states that don't have
+    /// this set (even if they are otherwise equivalent), callers should not
+    /// set this assertion unless the underlying NFA has at least one word
+    /// boundary assertion somewhere. Otherwise, a superfluous number of states
+    /// may be created.
+    fn is_from_word(&self) -> bool {
+        self.0[0] & (1 << 2) > 0
+    }
+
+    /// The set of look-behind assertions that were true in the transition that
+    /// created this state.
+    ///
+    /// Generally, this should be empty if 'look_need' is empty, since there is
+    /// no reason to track which look-behind assertions are true if the state
+    /// has no conditional epsilon transitions.
+    ///
+    /// Satisfied look-ahead assertions are not tracked in states. Instead,
+    /// these are re-computed on demand via epsilon closure when computing the
+    /// transition function.
+    fn look_have(&self) -> LookSet {
+        LookSet::from_repr(self.0[1])
+    }
+
+    /// The set of look-around (both behind and ahead) assertions that appear
+    /// at least once in this state's set of NFA states.
+    ///
+    /// This is used to determine whether the epsilon closure needs to be
+    /// re-computed when computing the transition function. Namely, if the
+    /// state has no conditional epsilon transitions, then there is no need
+    /// to re-compute the epsilon closure.
+    fn look_need(&self) -> LookSet {
+        LookSet::from_repr(self.0[2])
+    }
+
+    /// Returns the total number of match pattern IDs in this state.
+    ///
+    /// If this state is not a match state, then this always returns 0.
+    fn match_count(&self) -> usize {
+        if !self.is_match() {
+            return 0;
+        } else if !self.has_pattern_ids() {
+            // A match state without explicitly encoded pattern IDs implicitly
+            // matches exactly PatternID::ZERO.
+            1
+        } else {
+            self.encoded_pattern_count()
+        }
+    }
+
+    /// Returns the pattern ID for this match state at the given index.
+    ///
+    /// If the given index is greater than or equal to `match_count()` for this
+    /// state, then this could panic or return incorrect results.
+    fn match_pattern(&self, index: usize) -> PatternID {
+        if !self.has_pattern_ids() {
+            PatternID::ZERO
+        } else {
+            // Pattern IDs begin at offset 7: three bitset bytes followed by
+            // the four byte pattern count.
+            let offset = 7 + index * PatternID::SIZE;
+            // This is OK since we only ever serialize valid PatternIDs to
+            // states.
+            bytes::read_pattern_id_unchecked(&self.0[offset..]).0
+        }
+    }
+
+    /// Returns a copy of all match pattern IDs in this state. If this state
+    /// is not a match state, then this returns None.
+    fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+        if !self.is_match() {
+            return None;
+        }
+        let mut pids = alloc::vec![];
+        self.iter_match_pattern_ids(|pid| pids.push(pid));
+        Some(pids)
+    }
+
+    /// Calls the given function on every pattern ID in this state.
+    fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
+        if !self.is_match() {
+            return;
+        }
+        // As an optimization for a very common case, when this is a match
+        // state for an NFA with only one pattern, we don't actually write the
+        // pattern ID to the state representation. Instead, we know it must
+        // be there since it is the only possible choice.
+        if !self.has_pattern_ids() {
+            f(PatternID::ZERO);
+            return;
+        }
+        // The encoded pattern IDs occupy everything between the pattern count
+        // (which ends at offset 7) and the start of the NFA state IDs.
+        let mut pids = &self.0[7..self.pattern_offset_end()];
+        while !pids.is_empty() {
+            let pid = bytes::read_u32(pids);
+            pids = &pids[PatternID::SIZE..];
+            // This is OK since we only ever serialize valid PatternIDs to
+            // states. And since pattern IDs can never exceed a usize, the
+            // unwrap is OK.
+            f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
+        }
+    }
+
+    /// Calls the given function on every NFA state ID in this state.
+    fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
+        let mut sids = &self.0[self.pattern_offset_end()..];
+        // IDs are delta encoded, so keep a running total. The first delta is
+        // relative to zero.
+        let mut prev = 0i32;
+        while !sids.is_empty() {
+            // NOTE(review): read_vari32 reports zero bytes read for an
+            // unterminated varint, in which case this loop would not advance.
+            // This relies on the bytes having been produced by ReprVec.
+            let (delta, nr) = read_vari32(sids);
+            sids = &sids[nr..];
+            let sid = prev + delta;
+            prev = sid;
+            // This is OK since we only ever serialize valid StateIDs to
+            // states. And since state IDs can never exceed an isize, they must
+            // always be able to fit into a usize, and thus cast is OK.
+            f(StateID::new_unchecked(sid as usize))
+        }
+    }
+
+    /// Returns the offset into this state's representation where the pattern
+    /// IDs end and the NFA state IDs begin.
+    fn pattern_offset_end(&self) -> usize {
+        let encoded = self.encoded_pattern_count();
+        if encoded == 0 {
+            // No pattern count and no pattern IDs are encoded, so the NFA
+            // state IDs begin right after the three bitset bytes.
+            return 3;
+        }
+        // This arithmetic is OK since we were able to address this many bytes
+        // when writing to the state, thus, it must fit into a usize.
+        encoded.checked_mul(4).unwrap().checked_add(7).unwrap()
+    }
+
+    /// Returns the total number of *encoded* pattern IDs in this state.
+    ///
+    /// This may return 0 even when this is a match state, since the pattern
+    /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
+    /// the match state (the overwhelmingly common case).
+    fn encoded_pattern_count(&self) -> usize {
+        if !self.has_pattern_ids() {
+            return 0;
+        }
+        // This unwrap is OK since the total number of patterns is always
+        // guaranteed to fit into a usize.
+        usize::try_from(bytes::read_u32(&self.0[3..7])).unwrap()
+    }
+}
+
+/// Formats the decoded contents of the state (flags, assertion sets, pattern
+/// IDs and NFA state IDs) instead of the raw bytes.
+impl<'a> core::fmt::Debug for Repr<'a> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        // NFA state IDs are only available through a callback, so collect
+        // them into a vector first.
+        let mut nfa_ids = alloc::vec![];
+        self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
+        f.debug_struct("Repr")
+            .field("is_match", &self.is_match())
+            .field("is_from_word", &self.is_from_word())
+            .field("look_have", &self.look_have())
+            .field("look_need", &self.look_need())
+            .field("match_pattern_ids", &self.match_pattern_ids())
+            .field("nfa_state_ids", &nfa_ids)
+            .finish()
+    }
+}
+
+/// ReprVec is a write-only view into the representation of a DFA state.
+///
+/// See Repr for more details on the purpose of this type and also the format.
+///
+/// Note that not all possible combinations of methods may be called. This is
+/// precisely what the various StateBuilder types encapsulate: they only
+/// permit valid combinations via Rust's linear typing.
+struct ReprVec<'a>(&'a mut Vec<u8>);
+
+impl<'a> ReprVec<'a> {
+    /// Set this state as a match state.
+    ///
+    /// This should not be exposed explicitly outside of this module. It is
+    /// set automatically when a pattern ID is added.
+    fn set_is_match(&mut self) {
+        self.0[0] |= 1 << 0;
+    }
+
+    /// Set that this state has pattern IDs explicitly written to it.
+    ///
+    /// This should not be exposed explicitly outside of this module. This is
+    /// used internally as a space saving optimization. Namely, if the state
+    /// is a match state but does not have any pattern IDs written to it,
+    /// then it is automatically inferred to have a pattern ID of ZERO.
+    fn set_has_pattern_ids(&mut self) {
+        self.0[0] |= 1 << 1;
+    }
+
+    /// Set this state as being built from a transition over a word byte.
+    ///
+    /// Setting this is only necessary when one needs to deal with word
+    /// boundary assertions. Therefore, if the underlying NFA has no word
+    /// boundary assertions, callers should not set this.
+    fn set_is_from_word(&mut self) {
+        self.0[0] |= 1 << 2;
+    }
+
+    /// Return a mutable reference to the 'look_have' assertion set, which is
+    /// stored in byte 1 of the representation.
+    fn look_have_mut(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.0[1])
+    }
+
+    /// Return a mutable reference to the 'look_need' assertion set, which is
+    /// stored in byte 2 of the representation.
+    fn look_need_mut(&mut self) -> &mut LookSet {
+        LookSet::from_repr_mut(&mut self.0[2])
+    }
+
+    /// Add a pattern ID to this state. All match states must have at least
+    /// one pattern ID associated with it.
+    ///
+    /// Callers must never add duplicative pattern IDs.
+    ///
+    /// The order in which patterns are added must correspond to the order
+    /// in which patterns are reported as matches.
+    fn add_match_pattern_id(&mut self, pid: PatternID) {
+        // As a (somewhat small) space saving optimization, in the case where
+        // a matching state has exactly one pattern ID, PatternID::ZERO, we do
+        // not write either the pattern ID or the number of patterns encoded.
+        // Instead, all we do is set the 'is_match' bit on this state. Overall,
+        // this saves 8 bytes per match state for the overwhelming majority of
+        // match states.
+        //
+        // In order to know whether pattern IDs need to be explicitly read or
+        // not, we use another internal-only bit, 'has_pattern_ids', to
+        // indicate whether they have been explicitly written or not.
+        if !self.repr().has_pattern_ids() {
+            if pid == PatternID::ZERO {
+                self.set_is_match();
+                return;
+            }
+            // Make room for 'close_match_pattern_ids' to write the total
+            // number of pattern IDs written.
+            self.0.extend(core::iter::repeat(0).take(PatternID::SIZE));
+            self.set_has_pattern_ids();
+            // If this was already a match state, then the only way that's
+            // possible when the state doesn't have pattern IDs is if
+            // PatternID::ZERO was added by the caller previously. In this
+            // case, we are now adding a non-ZERO pattern ID after it, in
+            // which case, we want to make sure to represent ZERO explicitly
+            // now.
+            if self.repr().is_match() {
+                write_u32(self.0, 0)
+            } else {
+                // Otherwise, just make sure the 'is_match' bit is set.
+                self.set_is_match();
+            }
+        }
+        write_u32(self.0, pid.as_u32());
+    }
+
+    /// Indicate that no more pattern IDs will be added to this state.
+    ///
+    /// Once this is called, callers must not call it or 'add_match_pattern_id'
+    /// again.
+    ///
+    /// This should not be exposed explicitly outside of this module. It
+    /// should be called only when converting a StateBuilderMatches into a
+    /// StateBuilderNFA.
+    fn close_match_pattern_ids(&mut self) {
+        // If we never wrote any pattern IDs, then there's nothing to do here.
+        if !self.repr().has_pattern_ids() {
+            return;
+        }
+        let patsize = PatternID::SIZE;
+        // Everything after the first 7 bytes (three bitset bytes plus the
+        // four byte slot reserved for the count) is pattern ID data, since no
+        // NFA state IDs have been written at this point.
+        let pattern_bytes = self.0.len() - 7;
+        // Every pattern ID uses 4 bytes, so number of bytes should be
+        // divisible by 4.
+        assert_eq!(pattern_bytes % patsize, 0);
+        // This unwrap is OK since we are guaranteed that the maximum number
+        // of possible patterns fits into a u32.
+        let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
+        // Fill in the slot reserved by 'add_match_pattern_id'.
+        bytes::NE::write_u32(count32, &mut self.0[3..7]);
+    }
+
+    /// Add an NFA state ID to this state. The order in which NFA states are
+    /// added matters. It is the caller's responsibility to ensure that
+    /// duplicate NFA state IDs are not added.
+    ///
+    /// The ID is written as a signed varint holding its difference from
+    /// `prev`, and `prev` is then updated to `sid`.
+    fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
+        // The delta is negative whenever `sid` is smaller than the previously
+        // added ID, which is why a signed encoding is used.
+        let delta = sid.as_i32() - prev.as_i32();
+        write_vari32(self.0, delta);
+        *prev = sid;
+    }
+
+    /// Return a read-only view of this state's representation.
+    fn repr(&self) -> Repr<'_> {
+        Repr(self.0.as_slice())
+    }
+}
+
+/// Append `n` to `data` as a zig-zag encoded varint.
+///
+/// Zig-zag encoding folds the sign into the least significant bit, which
+/// keeps values of small magnitude (negative or not) short once varint
+/// encoded.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+    let shifted = (n as u32) << 1;
+    // Negative values have all of their bits flipped after the shift.
+    let zigzag = if n < 0 { !shifted } else { shifted };
+    write_varu32(data, zigzag)
+}
+
+/// Decode a zig-zag encoded signed 32-bit varint from the front of `data`.
+///
+/// Returns the decoded value together with the number of bytes consumed.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+    let (raw, nread) = read_varu32(data);
+    let magnitude = (raw >> 1) as i32;
+    // The least significant bit carries the sign.
+    let value = if raw & 1 == 0 { magnitude } else { !magnitude };
+    (value, nread)
+}
+
+/// Append `n` to `data` as an unsigned varint.
+///
+/// The value is emitted seven bits at a time, least significant group first.
+/// Every byte except the final one has its most significant bit set to signal
+/// that more bytes follow. A u32 therefore needs at most 5 bytes, and small
+/// values need only one.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+    loop {
+        let low = (n & 0b0111_1111) as u8;
+        n >>= 7;
+        if n == 0 {
+            // Final group: continuation bit stays clear.
+            data.push(low);
+            return;
+        }
+        data.push(low | 0b1000_0000);
+    }
+}
+
+/// Decode an unsigned 32-bit varint from the front of `data`.
+///
+/// Returns the decoded value together with the number of bytes consumed. If
+/// `data` ends before a terminating byte (one with its most significant bit
+/// clear) is seen, then `(0, 0)` is returned.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+    // N.B. Input is assumed to have been produced by write_varu32, so the
+    // casts and unchecked arithmetic below are fine.
+    let mut acc: u32 = 0;
+    let mut shift: u32 = 0;
+    let mut nread: usize = 0;
+    let mut rest = data;
+    while let Some((&byte, tail)) = rest.split_first() {
+        nread += 1;
+        acc |= ((byte & 0b0111_1111) as u32) << shift;
+        if byte < 0b1000_0000 {
+            return (acc, nread);
+        }
+        shift += 7;
+        rest = tail;
+    }
+    (0, 0)
+}
+
+/// Append `n` to the end of `dst` as a native-endian encoded 32-bit integer.
+fn write_u32(dst: &mut Vec<u8>, n: u32) {
+    use crate::util::bytes::{Endian, NE};
+
+    // Reserve zeroed space for the integer and then encode it in place.
+    let offset = dst.len();
+    dst.extend_from_slice(&[0; mem::size_of::<u32>()]);
+    NE::write_u32(n, &mut dst[offset..]);
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::vec;
+
+    use quickcheck::quickcheck;
+
+    use super::*;
+
+    quickcheck! {
+        // NFA state IDs come back out exactly as they went in.
+        fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let expected = dedup_state_ids(sids);
+
+            let mut builder =
+                StateBuilderEmpty::new().into_matches().into_nfa();
+            for &sid in expected.iter() {
+                builder.add_nfa_state_id(sid);
+            }
+            let state = builder.to_state();
+            let mut actual = vec![];
+            state.iter_nfa_state_ids(|sid| actual.push(sid));
+            actual == expected
+        }
+
+        // Pattern IDs come back out exactly as they went in.
+        fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let expected = dedup_pattern_ids(pids);
+
+            let mut builder = StateBuilderEmpty::new().into_matches();
+            for &pid in expected.iter() {
+                builder.add_match_pattern_id(pid);
+            }
+            let state = builder.into_nfa().to_state();
+            let mut actual = vec![];
+            state.iter_match_pattern_ids(|pid| actual.push(pid));
+            actual == expected
+        }
+
+        // Pattern IDs and NFA state IDs survive together in one state.
+        fn prop_state_read_write_nfa_state_and_pattern_ids(
+            sids: Vec<StateID>,
+            pids: Vec<PatternID>
+        ) -> bool {
+            // Builder states do not permit duplicate IDs.
+            let want_sids = dedup_state_ids(sids);
+            let want_pids = dedup_pattern_ids(pids);
+
+            let mut matches = StateBuilderEmpty::new().into_matches();
+            for &pid in want_pids.iter() {
+                matches.add_match_pattern_id(pid);
+            }
+
+            let mut nfa = matches.into_nfa();
+            for &sid in want_sids.iter() {
+                nfa.add_nfa_state_id(sid);
+            }
+
+            let state = nfa.to_state();
+            let mut have_pids = vec![];
+            state.iter_match_pattern_ids(|pid| have_pids.push(pid));
+            let mut have_sids = vec![];
+            state.iter_nfa_state_ids(|sid| have_sids.push(sid));
+            have_pids == want_pids && have_sids == want_sids
+        }
+
+        // An unsigned varint decodes to the value that was encoded, and
+        // decoding consumes every byte that encoding produced.
+        fn prop_read_write_varu32(n: u32) -> bool {
+            let mut encoded = vec![];
+            write_varu32(&mut encoded, n);
+            let (decoded, consumed) = read_varu32(&encoded);
+            consumed == encoded.len() && decoded == n
+        }
+
+        // Same as above, but for signed (zig-zag) varints.
+        fn prop_read_write_vari32(n: i32) -> bool {
+            let mut encoded = vec![];
+            write_vari32(&mut encoded, n);
+            let (decoded, consumed) = read_vari32(&encoded);
+            consumed == encoded.len() && decoded == n
+        }
+    }
+
+    /// Drops repeated state IDs, keeping the first occurrence of each and
+    /// preserving the original order.
+    fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
+        let mut seen = alloc::collections::BTreeSet::new();
+        // `insert` reports whether the ID was newly added to the set.
+        sids.into_iter().filter(|&sid| seen.insert(sid)).collect()
+    }
+
+    /// Drops repeated pattern IDs, keeping the first occurrence of each and
+    /// preserving the original order.
+    fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
+        let mut seen = alloc::collections::BTreeSet::new();
+        // `insert` reports whether the ID was newly added to the set.
+        pids.into_iter().filter(|&pid| seen.insert(pid)).collect()
+    }
+}
diff --git a/src/util/id.rs b/src/util/id.rs

new file mode 100644 (file)

index 0000000..70bf0a9
--- /dev/null
+++ b/src/util/id.rs
@@ -0,0 +1,608 @@
+/*!
+Type definitions for identifier types.
+
+A [`StateID`] represents the possible set of identifiers used in regex engine
+implementations in this crate. For example, they are used to identify both NFA
+and DFA states.
+
+A [`PatternID`] represents the possible set of identifiers for patterns. All
+regex engine implementations in this crate support searching for multiple
+patterns simultaneously. A `PatternID` is how each pattern is uniquely
+identified for a particular instance of a regex engine. Namely, a pattern is
+assigned an auto-incrementing integer, starting at `0`, based on the order of
+patterns supplied during the construction of the regex engine.
+
+These identifier types represent a way for this crate to make correctness
+guarantees around the possible set of values that a `StateID` or a `PatternID`
+might represent. Similarly, they also provide a way of constraining the size of
+these identifiers to reduce space usage while still guaranteeing that all such
+identifiers are representable by a `usize` for the current target.
+
+Moreover, the identifier types clamp the range of permissible values to a range
+that is typically smaller than its internal representation. (With the maximum
+value being, e.g., `StateID::MAX`.) Users of these types may not rely on this
+clamping for the purpose of memory safety. Users may, however, rely on these
+invariants to avoid panics or other types of logic bugs.
+*/
+
+// Continuing from the above comment about correctness guarantees, an example
+// of a way in which we use the guarantees on these types is delta encoding.
+// Namely, we require that IDs can be at most 2^31 - 2, which means the
+// difference between any two IDs is always representable as an i32.
+
+use core::{
+    convert::{Infallible, TryFrom},
+    mem, ops,
+};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+/// An identifier for a regex pattern.
+///
+/// The identifier for a pattern corresponds to its relative position among
+/// other patterns in a single finite state machine. Namely, when building
+/// a multi-pattern regex engine, one must supply a sequence of patterns to
+/// match. The position (starting at 0) of each pattern in that sequence
+/// represents its identifier. This identifier is in turn used to identify and
+/// report matches of that pattern in various APIs.
+///
+/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
+/// the number of patterns in any regex engine in this crate is guaranteed to
+/// be representable by a `usize`. This applies to regex engines that have
+/// been deserialized; a deserialization error will be returned if it contains
+/// pattern IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `PatternID` to index slices.
+///
+/// # Safety
+///
+/// While a `PatternID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct PatternID(u32);
+
+impl PatternID {
+    /// The maximum pattern ID value, represented as a `usize`.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub const MAX: PatternID =
+        PatternID::new_unchecked(core::i32::MAX as usize - 1);
+
+    /// The maximum pattern ID value, represented as a `usize`.
+    #[cfg(target_pointer_width = "16")]
+    pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1);
+
+    /// The total number of patterns that are allowed in any single regex
+    /// engine.
+    pub const LIMIT: usize = PatternID::MAX.as_usize() + 1;
+
+    /// The zero pattern ID value.
+    pub const ZERO: PatternID = PatternID::new_unchecked(0);
+
+    /// The number of bytes that a single `PatternID` uses in memory.
+    pub const SIZE: usize = core::mem::size_of::<PatternID>();
+
+    /// Create a new pattern ID.
+    ///
+    /// If the given identifier exceeds [`PatternID::MAX`], then this returns
+    /// an error.
+    #[inline]
+    pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
+        PatternID::try_from(id)
+    }
+
+    /// Create a new pattern ID without checking whether the given value
+    /// exceeds [`PatternID::MAX`].
+    ///
+    /// While this is unchecked, providing an incorrect value must never
+    /// sacrifice memory safety, as documented above.
+    #[inline]
+    pub const fn new_unchecked(id: usize) -> PatternID {
+        PatternID(id as u32)
+    }
+
+    /// Like [`PatternID::new`], but panics if the given ID is not valid.
+    #[inline]
+    pub fn must(id: usize) -> PatternID {
+        PatternID::new(id).unwrap()
+    }
+
+    /// Return this pattern ID as a `usize`.
+    #[inline]
+    pub const fn as_usize(&self) -> usize {
+        self.0 as usize
+    }
+
+    /// Return the internal u32 of this pattern ID.
+    #[inline]
+    pub const fn as_u32(&self) -> u32 {
+        self.0
+    }
+
+    /// Return the internal u32 of this pattern ID represented as an i32.
+    ///
+    /// This is guaranteed to never overflow an `i32`.
+    #[inline]
+    pub const fn as_i32(&self) -> i32 {
+        self.0 as i32
+    }
+
+    /// Returns one more than this pattern ID as a usize.
+    ///
+    /// Since a pattern ID has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize` (and a `u32`).
+    #[inline]
+    pub fn one_more(&self) -> usize {
+        self.as_usize().checked_add(1).unwrap()
+    }
+
+    /// Decode this pattern ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// If the decoded integer is not representable as a pattern ID for the
+    /// current target, then this returns an error.
+    #[inline]
+    pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
+        let id = u32::from_ne_bytes(bytes);
+        if id > PatternID::MAX.as_u32() {
+            return Err(PatternIDError { attempted: id as u64 });
+        }
+        Ok(PatternID::new_unchecked(id as usize))
+    }
+
+    /// Decode this pattern ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// This is analogous to [`PatternID::new_unchecked`] in that is does not
+    /// check whether the decoded integer is representable as a pattern ID.
+    #[inline]
+    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
+        PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+    }
+
+    /// Return the underlying pattern ID integer as raw bytes in native endian
+    /// format.
+    #[inline]
+    pub fn to_ne_bytes(&self) -> [u8; 4] {
+        self.0.to_ne_bytes()
+    }
+
+    /// Returns an iterator over all pattern IDs from 0 up to and not including
+    /// the given length.
+    ///
+    /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn iter(len: usize) -> PatternIDIter {
+        PatternIDIter::new(len)
+    }
+}
+
+/// This error occurs when a pattern ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum pattern ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct PatternIDError {
+    /// The integer that was rejected, widened to a `u64`.
+    attempted: u64,
+}
+
+impl PatternIDError {
+    /// Returns the value that failed to construct a pattern ID.
+    pub fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for PatternIDError {}
+
+impl core::fmt::Display for PatternIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create PatternID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            PatternID::MAX,
+        )
+    }
+}
+
+/// An identifier for a state in a regex engine.
+///
+/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
+/// number of states in any regex engine in this crate is guaranteed to be
+/// representable by a `usize`. This applies to regex engines that have been
+/// deserialized; a deserialization error will be returned if it contains state
+/// IDs that violate these requirements in your current environment.
+///
+/// For extra convenience in some cases, this type also guarantees that all
+/// IDs can fit into an `i32` and an `isize` without overflowing.
+///
+/// # Representation
+///
+/// This type is always represented internally by a `u32` and is marked as
+/// `repr(transparent)`. Thus, this type always has the same representation as
+/// a `u32`.
+///
+/// # Indexing
+///
+/// For convenience, callers may use a `StateID` to index slices.
+///
+/// # Safety
+///
+/// While a `StateID` is meant to guarantee that its value fits into `usize`
+/// (while using a possibly smaller representation than `usize` on some
+/// targets), callers must not rely on this property for safety. Callers may
+/// choose to rely on this property for correctness however.
+#[repr(transparent)]
+#[derive(
+    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
+)]
+pub struct StateID(u32);
+
+impl StateID {
+    /// The maximum state ID value.
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    pub const MAX: StateID =
+        StateID::new_unchecked(core::i32::MAX as usize - 1);
+
+    /// The maximum state ID value.
+    #[cfg(target_pointer_width = "16")]
+    pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - 1);
+
+    /// The total number of states that are allowed in any single regex
+    /// engine, represented as a `usize`.
+    pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
+
+    /// The zero state ID value.
+    pub const ZERO: StateID = StateID::new_unchecked(0);
+
+    /// The number of bytes that a single `StateID` uses in memory.
+    pub const SIZE: usize = core::mem::size_of::<StateID>();
+
+    /// Create a new state ID.
+    ///
+    /// If the given identifier exceeds [`StateID::MAX`], then this returns
+    /// an error.
+    #[inline]
+    pub fn new(id: usize) -> Result<StateID, StateIDError> {
+        StateID::try_from(id)
+    }
+
+    /// Create a new state ID without checking whether the given value
+    /// exceeds [`StateID::MAX`].
+    ///
+    /// While this is unchecked, providing an incorrect value must never
+    /// sacrifice memory safety, as documented above.
+    #[inline]
+    pub const fn new_unchecked(id: usize) -> StateID {
+        StateID(id as u32)
+    }
+
+    /// Like [`StateID::new`], but panics if the given ID is not valid.
+    #[inline]
+    pub fn must(id: usize) -> StateID {
+        StateID::new(id).unwrap()
+    }
+
+    /// Return this state ID as a `usize`.
+    #[inline]
+    pub const fn as_usize(&self) -> usize {
+        self.0 as usize
+    }
+
+    /// Return the internal u32 of this state ID.
+    #[inline]
+    pub const fn as_u32(&self) -> u32 {
+        self.0
+    }
+
+    /// Return the internal u32 of this pattern ID represented as an i32.
+    ///
+    /// This is guaranteed to never overflow an `i32`.
+    #[inline]
+    pub const fn as_i32(&self) -> i32 {
+        self.0 as i32
+    }
+
+    /// Returns one more than this state ID as a usize.
+    ///
+    /// Since a state ID has constraints on its maximum value, adding `1` to
+    /// it will always fit in a `usize` (and a `u32`).
+    #[inline]
+    pub fn one_more(&self) -> usize {
+        self.as_usize().checked_add(1).unwrap()
+    }
+
+    /// Decode this state ID from the bytes given using the native endian byte
+    /// order for the current target.
+    ///
+    /// If the decoded integer is not representable as a state ID for the
+    /// current target, then this returns an error.
+    #[inline]
+    pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
+        let id = u32::from_ne_bytes(bytes);
+        if id > StateID::MAX.as_u32() {
+            return Err(StateIDError { attempted: id as u64 });
+        }
+        Ok(StateID::new_unchecked(id as usize))
+    }
+
+    /// Decode this state ID from the bytes given using the native endian
+    /// byte order for the current target.
+    ///
+    /// This is analogous to [`StateID::new_unchecked`] in that is does not
+    /// check whether the decoded integer is representable as a state ID.
+    #[inline]
+    pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
+        StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
+    }
+
+    /// Return the underlying state ID integer as raw bytes in native endian
+    /// format.
+    #[inline]
+    pub fn to_ne_bytes(&self) -> [u8; 4] {
+        self.0.to_ne_bytes()
+    }
+
+    /// Returns an iterator over all state IDs from 0 up to and not including
+    /// the given length.
+    ///
+    /// If the given length exceeds [`StateID::LIMIT`], then this panics.
+    #[cfg(feature = "alloc")]
+    pub(crate) fn iter(len: usize) -> StateIDIter {
+        StateIDIter::new(len)
+    }
+}
+
+/// This error occurs when a state ID could not be constructed.
+///
+/// This occurs when given an integer exceeding the maximum state ID value.
+///
+/// When the `std` feature is enabled, this implements the `Error` trait.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct StateIDError {
+    /// The integer that was rejected, widened to a `u64`.
+    attempted: u64,
+}
+
+impl StateIDError {
+    /// Returns the value that failed to construct a state ID.
+    pub fn attempted(&self) -> u64 {
+        self.attempted
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for StateIDError {}
+
+impl core::fmt::Display for StateIDError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "failed to create StateID from {:?}, which exceeds {:?}",
+            self.attempted(),
+            StateID::MAX,
+        )
+    }
+}
+
+/// A macro for defining exactly identical (modulo names) impls for ID types.
+///
+/// `$ty` is the ID type, `$tyerr` is its construction error type and
+/// `$tyiter` is the name of the ID iterator type that this macro defines.
+macro_rules! impls {
+    ($ty:ident, $tyerr:ident, $tyiter:ident) => {
+        // An iterator that yields the IDs corresponding to `0..len`.
+        #[derive(Clone, Debug)]
+        pub(crate) struct $tyiter {
+            rng: ops::Range<usize>,
+        }
+
+        impl $tyiter {
+            // Panics if `len` exceeds `$ty::LIMIT`. This is the single check
+            // that `next` relies on when it calls `new_unchecked`.
+            #[cfg(feature = "alloc")]
+            fn new(len: usize) -> $tyiter {
+                assert!(
+                    len <= $ty::LIMIT,
+                    "cannot create iterator with IDs when number of \
+                     elements exceed {:?}",
+                    $ty::LIMIT,
+                );
+                $tyiter { rng: 0..len }
+            }
+        }
+
+        impl Iterator for $tyiter {
+            type Item = $ty;
+
+            fn next(&mut self) -> Option<$ty> {
+                if self.rng.start >= self.rng.end {
+                    return None;
+                }
+                // Advance the start of the range and yield its old value.
+                let next_id = self.rng.start + 1;
+                let id = mem::replace(&mut self.rng.start, next_id);
+                // new_unchecked is OK since we asserted that the number of
+                // elements in this iterator will fit in an ID at construction.
+                Some($ty::new_unchecked(id))
+            }
+        }
+
+        // The Index/IndexMut impls below let an ID index a slice (and a Vec
+        // when `alloc` is enabled) by delegating to `usize` indexing.
+        impl<T> core::ops::Index<$ty> for [T] {
+            type Output = T;
+
+            #[inline]
+            fn index(&self, index: $ty) -> &T {
+                &self[index.as_usize()]
+            }
+        }
+
+        impl<T> core::ops::IndexMut<$ty> for [T] {
+            #[inline]
+            fn index_mut(&mut self, index: $ty) -> &mut T {
+                &mut self[index.as_usize()]
+            }
+        }
+
+        #[cfg(feature = "alloc")]
+        impl<T> core::ops::Index<$ty> for Vec<T> {
+            type Output = T;
+
+            #[inline]
+            fn index(&self, index: $ty) -> &T {
+                &self[index.as_usize()]
+            }
+        }
+
+        #[cfg(feature = "alloc")]
+        impl<T> core::ops::IndexMut<$ty> for Vec<T> {
+            #[inline]
+            fn index_mut(&mut self, index: $ty) -> &mut T {
+                &mut self[index.as_usize()]
+            }
+        }
+
+        // Checked conversions from integers. Each one rejects values above
+        // `$ty::MAX` and records the rejected value in the error.
+        impl TryFrom<usize> for $ty {
+            type Error = $tyerr;
+
+            fn try_from(id: usize) -> Result<$ty, $tyerr> {
+                if id > $ty::MAX.as_usize() {
+                    return Err($tyerr { attempted: id as u64 });
+                }
+                Ok($ty::new_unchecked(id))
+            }
+        }
+
+        // No range check is done here: every `u8` is below `$ty::MAX` for
+        // each definition of `MAX` on the ID types in this file.
+        impl TryFrom<u8> for $ty {
+            type Error = Infallible;
+
+            fn try_from(id: u8) -> Result<$ty, Infallible> {
+                Ok($ty::new_unchecked(id as usize))
+            }
+        }
+
+        // The check matters on 16-bit targets, where `MAX` is derived from
+        // `isize::MAX` and is therefore smaller than `u16::MAX`.
+        impl TryFrom<u16> for $ty {
+            type Error = $tyerr;
+
+            fn try_from(id: u16) -> Result<$ty, $tyerr> {
+                if id as u32 > $ty::MAX.as_u32() {
+                    return Err($tyerr { attempted: id as u64 });
+                }
+                Ok($ty::new_unchecked(id as usize))
+            }
+        }
+
+        impl TryFrom<u32> for $ty {
+            type Error = $tyerr;
+
+            fn try_from(id: u32) -> Result<$ty, $tyerr> {
+                if id > $ty::MAX.as_u32() {
+                    return Err($tyerr { attempted: id as u64 });
+                }
+                Ok($ty::new_unchecked(id as usize))
+            }
+        }
+
+        impl TryFrom<u64> for $ty {
+            type Error = $tyerr;
+
+            fn try_from(id: u64) -> Result<$ty, $tyerr> {
+                if id > $ty::MAX.as_u32() as u64 {
+                    return Err($tyerr { attempted: id });
+                }
+                Ok($ty::new_unchecked(id as usize))
+            }
+        }
+
+        #[cfg(test)]
+        impl quickcheck::Arbitrary for $ty {
+            fn arbitrary(gen: &mut quickcheck::Gen) -> $ty {
+                use core::cmp::max;
+
+                // Clamping to `i32::MIN + 1` keeps `abs()` from overflowing
+                // on `i32::MIN`. Anything above `MAX` is mapped to `MAX`.
+                let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
+                if id > $ty::MAX.as_i32() {
+                    $ty::MAX
+                } else {
+                    $ty::new(usize::try_from(id).unwrap()).unwrap()
+                }
+            }
+        }
+    };
+}
+
+impls!(PatternID, PatternIDError, PatternIDIter);
+impls!(StateID, StateIDError, StateIDIter);
+
+/// A utility trait that defines a couple of adapters for making it convenient
+/// to access indices as ID types. We require ExactSizeIterator so that
+/// iterator construction can do a single check to make sure the index of each
+/// element is representable by its ID type.
+#[cfg(feature = "alloc")]
+pub(crate) trait IteratorIDExt: Iterator {
+    /// Wraps this iterator so that each item is paired with a `PatternID`,
+    /// assigned in order starting from zero.
+    ///
+    /// This panics if the length of this iterator exceeds
+    /// `PatternID::LIMIT`.
+    fn with_pattern_ids(self) -> WithPatternIDIter<Self>
+    where
+        Self: Sized + ExactSizeIterator,
+    {
+        WithPatternIDIter::new(self)
+    }
+
+    /// Wraps this iterator so that each item is paired with a `StateID`,
+    /// assigned in order starting from zero.
+    ///
+    /// This panics if the length of this iterator exceeds `StateID::LIMIT`.
+    fn with_state_ids(self) -> WithStateIDIter<Self>
+    where
+        Self: Sized + ExactSizeIterator,
+    {
+        WithStateIDIter::new(self)
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<I: Iterator> IteratorIDExt for I {}
+
+#[cfg(feature = "alloc")]
+macro_rules! iditer {
+    ($ty:ident, $iterty:ident, $withiterty:ident) => {
+        /// An iterator adapter that is like std::iter::Enumerate, but attaches
+        /// IDs. It requires ExactSizeIterator. At construction, it ensures
+        /// that the index of each element in the iterator is representable in
+        /// the corresponding ID type.
+        #[derive(Clone, Debug)]
+        pub(crate) struct $withiterty<I> {
+            it: I,
+            ids: $iterty,
+        }
+
+        impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
+            fn new(it: I) -> $withiterty<I> {
+                let ids = $ty::iter(it.len());
+                $withiterty { it, ids }
+            }
+        }
+
+        impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
+            type Item = ($ty, I::Item);
+
+            fn next(&mut self) -> Option<($ty, I::Item)> {
+                let item = self.it.next()?;
+                // Number of elements in this iterator must match, according
+                // to contract of ExactSizeIterator.
+                let id = self.ids.next().unwrap();
+                Some((id, item))
+            }
+        }
+    };
+}
+
+#[cfg(feature = "alloc")]
+iditer!(PatternID, PatternIDIter, WithPatternIDIter);
+#[cfg(feature = "alloc")]
+iditer!(StateID, StateIDIter, WithStateIDIter);
diff --git a/src/util/lazy.rs b/src/util/lazy.rs

new file mode 100644 (file)

index 0000000..d8cac6e
--- /dev/null
+++ b/src/util/lazy.rs
@@ -0,0 +1,31 @@
+use core::{
+    cell::Cell,
+    ptr,
+    sync::atomic::{AtomicPtr, Ordering},
+};
+
+use alloc::{boxed::Box, vec::Vec};
+
+/// Returns a shared reference to the value behind `location`, creating it
+/// with `init` if `location` currently holds a null pointer.
+///
+/// When the pointer loaded from `location` is null, the result of `init()` is
+/// boxed and a compare-exchange tries to swap it in for null. If the exchange
+/// fails, the new box is dropped and the pointer reported by the exchange is
+/// used instead. `init` can therefore run in more than one caller, with the
+/// extra results discarded.
+///
+/// This function never frees a pointer once it has been stored in `location`.
+#[inline(always)]
+pub(crate) fn get_or_init<T: Send + Sync + 'static>(
+    location: &'static AtomicPtr<T>,
+    init: impl FnOnce() -> T,
+) -> &'static T {
+    let mut ptr = location.load(Ordering::Acquire);
+    if ptr.is_null() {
+        let new_dfa = Box::new(init());
+        ptr = Box::into_raw(new_dfa);
+        // Publish the new value only if `location` is still null.
+        let result = location.compare_exchange(
+            ptr::null_mut(),
+            ptr,
+            Ordering::AcqRel,
+            Ordering::Acquire,
+        );
+        if let Err(old) = result {
+            // SAFETY: `ptr` came from `Box::into_raw` just above, and the
+            // failed exchange means it was not stored in `location`, so it is
+            // reclaimed exactly once here.
+            let redundant = unsafe { Box::from_raw(ptr) };
+            drop(redundant);
+            ptr = old;
+        }
+    }
+    // SAFETY: `ptr` is non-null here. It was either loaded as non-null,
+    // produced by `Box::into_raw`, or returned by a failed exchange whose
+    // expected value was null.
+    // NOTE(review): validity for `'static` assumes `location` only ever holds
+    // null or a pointer published by this function -- confirm at call sites.
+    unsafe { &*ptr }
+}
diff --git a/src/util/matchtypes.rs b/src/util/matchtypes.rs

new file mode 100644 (file)

index 0000000..de0fa65
--- /dev/null
+++ b/src/util/matchtypes.rs
@@ -0,0 +1,356 @@
+use crate::util::id::PatternID;
+
+/// The kind of match semantics to use for a DFA.
+///
+/// The default match kind is `LeftmostFirst`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum MatchKind {
+    /// Report all possible matches.
+    All,
+    /// Report only the leftmost matches. When multiple leftmost matches exist,
+    /// report the match corresponding to the part of the regex that appears
+    /// first in the syntax.
+    LeftmostFirst,
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+    // There is prior art in RE2 that shows that we should be able to add
+    // LeftmostLongest too. The tricky part of it is supporting ungreedy
+    // repetitions. Instead of treating all NFA states as having equivalent
+    // priority (as in 'All') or treating all NFA states as having distinct
+    // priority based on order (as in 'LeftmostFirst'), we instead group NFA
+    // states into sets, and treat members of each set as having equivalent
+    // priority, but having greater priority than all following members
+    // of different sets.
+    //
+    // However, it's not clear whether it's really worth adding this. After
+    // all, leftmost-longest can be emulated when using literals by using
+    // leftmost-first and sorting the literals by length in descending order.
+    // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
+    // always match `a` in `ab` when using leftmost-first, but leftmost-longest
+    // would match `ab`.
+}
+
+impl MatchKind {
+    #[cfg(feature = "alloc")]
+    pub(crate) fn continue_past_first_match(&self) -> bool {
+        *self == MatchKind::All
+    }
+}
+
+impl Default for MatchKind {
+    /// Returns `MatchKind::LeftmostFirst`.
+    fn default() -> MatchKind {
+        MatchKind::LeftmostFirst
+    }
+}
+
+/// A representation of a match reported by a regex engine.
+///
+/// A match records the start and end offsets of the match in the haystack.
+///
+/// Every match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+    /// The start offset of the match, inclusive.
+    start: usize,
+    /// The end offset of the match, exclusive.
+    end: usize,
+}
+
+impl Match {
+    /// Create a new match from a byte offset span.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start`.
+    #[inline]
+    pub fn new(start: usize, end: usize) -> Match {
+        assert!(start <= end);
+        Match { start, end }
+    }
+
+    /// The starting position of the match.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The ending position of the match.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Returns the match location as a range.
+    #[inline]
+    pub fn range(&self) -> core::ops::Range<usize> {
+        self.start..self.end
+    }
+
+    /// Returns true if and only if this match is empty. That is, when
+    /// `start() == end()`.
+    ///
+    /// An empty match can only be returned when a pattern is able to match
+    /// the empty string.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.start == self.end
+    }
+}
+
+/// A representation of a match reported by a DFA.
+///
+/// This is called a "half" match because it only includes the end location
+/// (or start location for a reverse match) of a match. This corresponds to the
+/// information that a single DFA scan can report. Getting the other half of
+/// the match requires a second scan with a reversed DFA.
+///
+/// A half match also includes the pattern that matched. The pattern is
+/// identified by an ID, which corresponds to its position (starting from `0`)
+/// relative to other patterns used to construct the corresponding DFA. If only
+/// a single pattern is provided to the DFA, then all matches are guaranteed to
+/// have a pattern ID of `0`.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct HalfMatch {
+    /// The pattern ID.
+    pub(crate) pattern: PatternID,
+    /// The offset of the match.
+    ///
+    /// For forward searches, the offset is exclusive. For reverse searches,
+    /// the offset is inclusive.
+    pub(crate) offset: usize,
+}
+
+impl HalfMatch {
+    /// Create a new half match from a pattern ID and a byte offset.
+    #[inline]
+    pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
+        HalfMatch { pattern, offset }
+    }
+
+    /// Create a new half match from a pattern ID and a byte offset.
+    ///
+    /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
+    /// [`PatternID`]. This panics if the given `usize` is not representable
+    /// as a `PatternID`.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `pattern > PatternID::MAX`.
+    #[inline]
+    pub fn must(pattern: usize, offset: usize) -> HalfMatch {
+        HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
+    }
+
+    /// Returns the ID of the pattern that matched.
+    ///
+    /// The ID of a pattern is derived from the position in which it was
+    /// originally inserted into the corresponding DFA. The first pattern has
+    /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
+    #[inline]
+    pub fn pattern(&self) -> PatternID {
+        self.pattern
+    }
+
+    /// The position of the match.
+    ///
+    /// If this match was produced by a forward search, then the offset is
+    /// exclusive. If this match was produced by a reverse search, then the
+    /// offset is inclusive.
+    #[inline]
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+}
+
+/// A representation of a multi match reported by a regex engine.
+///
+/// A multi match has two essential pieces of information: the identifier of
+/// the pattern that matched, along with the start and end offsets of the match
+/// in the haystack.
+///
+/// The pattern is identified by an ID, which corresponds to its position
+/// (starting from `0`) relative to other patterns used to construct the
+/// corresponding regex engine. If only a single pattern is provided, then all
+/// multi matches are guaranteed to have a pattern ID of `0`.
+///
+/// Every multi match guarantees that `start <= end`.
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub struct MultiMatch {
+    /// The pattern ID.
+    pattern: PatternID,
+    /// The start offset of the match, inclusive.
+    start: usize,
+    /// The end offset of the match, exclusive.
+    end: usize,
+}
+
+impl MultiMatch {
+    /// Create a new match from a pattern ID and a byte offset span.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start`.
+    #[inline]
+    pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
+        assert!(start <= end);
+        MultiMatch { pattern, start, end }
+    }
+
+    /// Create a new match from a pattern ID and a byte offset span.
+    ///
+    /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
+    /// [`PatternID`]. This panics if the given `usize` is not representable
+    /// as a `PatternID`.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `end < start` or if `pattern > PatternID::MAX`.
+    #[inline]
+    pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
+        MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
+    }
+
+    /// Returns the ID of the pattern that matched.
+    ///
+    /// The ID of a pattern is derived from the position in which it was
+    /// originally inserted into the corresponding regex engine. The first
+    /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
+    /// so on.
+    #[inline]
+    pub fn pattern(&self) -> PatternID {
+        self.pattern
+    }
+
+    /// The starting position of the match.
+    #[inline]
+    pub fn start(&self) -> usize {
+        self.start
+    }
+
+    /// The ending position of the match.
+    #[inline]
+    pub fn end(&self) -> usize {
+        self.end
+    }
+
+    /// Returns the match location as a range.
+    #[inline]
+    pub fn range(&self) -> core::ops::Range<usize> {
+        self.start..self.end
+    }
+
+    /// Returns true if and only if this match is empty. That is, when
+    /// `start() == end()`.
+    ///
+    /// An empty match can only be returned when a pattern is able to match
+    /// the empty string.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.start == self.end
+    }
+}
+
+/// An error type indicating that a search stopped prematurely without finding
+/// a match.
+///
+/// This error type implies that one cannot assume that no matches occur, since
+/// the search stopped before completing.
+///
+/// Normally, when one searches for something, the response is either an
+/// affirmative "it was found at this location" or a negative "not found at
+/// all." However, in some cases, a regex engine can be configured to stop its
+/// search before concluding whether a match exists or not. When this happens,
+/// it may be important for the caller to know why the regex engine gave up and
+/// where in the input it gave up at. This error type exposes the 'why' and the
+/// 'where.'
+///
+/// For example, the DFAs provided by this library generally cannot correctly
+/// implement Unicode word boundaries. Instead, they provide an option to
+/// eagerly support them on ASCII text (since Unicode word boundaries are
+/// equivalent to ASCII word boundaries when searching ASCII text), but will
+/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
+/// required to either report the failure to the caller (unergonomic) or
+/// otherwise fall back to some other regex engine (ergonomic, but potentially
+/// costly).
+///
+/// More generally, some regex engines offer the ability for callers to specify
+/// certain bytes that will trigger the regex engine to automatically quit if
+/// they are seen.
+///
+/// Finally, there may be other reasons for a failed match. For example,
+/// the hybrid DFA provided by this crate can be configured to give up if it
+/// believes that it is not efficient. This in turn permits callers to choose a
+/// different regex engine.
+///
+/// # Advice
+///
+/// While this form of error reporting adds complexity, it is generally
+/// possible for callers to configure regex engines to never give up a search,
+/// and thus never return an error. Indeed, the default configuration for every
+/// regex engine in this crate is such that they will never stop searching
+/// early. Therefore, the only way to get a match error is if the regex engine
+/// is explicitly configured to do so. Options that enable this behavior
+/// document the new error conditions they imply.
+///
+/// Regex engines for which no errors are possible for any configuration will
+/// return the normal `Option<Match>` and not use this error type at all.
+///
+/// For example, regex engines in the `dfa` sub-module will only report
+/// `MatchError::Quit` if instructed by either
+/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
+/// or by
+/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
+#[derive(Clone, Debug, Eq, Hash, PartialEq)]
+pub enum MatchError {
+    // Design note (implementation comment, not part of the rendered docs):
+    //
+    // Note that the first version of this type was called `SearchError` and it
+    // included a third `None` variant to indicate that the search completed
+    // and no match was found. However, this was problematic for iterator
+    // APIs where the `None` sentinel for stopping iteration corresponds
+    // precisely to the "match not found" case. The fact that the `None`
+    // variant was buried inside this type was in turn quite awkward. So
+    // instead, I removed the `None` variant, renamed the type and used
+    // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
+    // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
+    // ergonomics by only putting the more complex API in the `try_` variants
+    // ("fallible") of search methods. The infallible APIs will instead just
+    // return `Option<Match>` and panic on error.
+    /// The search saw a "quit" byte at which it was instructed to stop
+    /// searching.
+    ///
+    /// Both fields are included in this error's `Display` output.
+    Quit {
+        /// The "quit" byte that was observed that caused the search to stop.
+        byte: u8,
+        /// The offset at which the quit byte was observed.
+        offset: usize,
+    },
+    /// The search, based on heuristics, determined that it would be better
+    /// to stop, typically to provide the caller an opportunity to use an
+    /// alternative regex engine.
+    ///
+    /// Currently, the only way for this to occur is via the lazy DFA and
+    /// only when it is configured to do so (it will not return this error by
+    /// default).
+    GaveUp {
+        /// The offset at which the search stopped. This corresponds to the
+        /// position immediately following the last byte scanned.
+        offset: usize,
+    },
+}
+
+// `MatchError` carries no underlying cause, so the default `Error` methods
+// are sufficient. The impl is only available when `std` is enabled.
+#[cfg(feature = "std")]
+impl std::error::Error for MatchError {}
+
+/// Renders a short, human readable description of why the search stopped.
+impl core::fmt::Display for MatchError {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        match self {
+            MatchError::GaveUp { offset } => {
+                write!(f, "gave up searching at offset {}", offset)
+            }
+            MatchError::Quit { byte, offset } => write!(
+                f,
+                "quit search after observing byte \\x{:02X} at offset {}",
+                byte, offset,
+            ),
+        }
+    }
+}
diff --git a/src/util/mod.rs b/src/util/mod.rs

new file mode 100644 (file)

index 0000000..798507d
--- /dev/null
+++ b/src/util/mod.rs
@@ -0,0 +1,275 @@
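+/*!
+Utilities shared by the regex engines in this crate: alphabets, identifier
+types and prefilters, plus crate-internal helpers for UTF-8 decoding and
+word-character classification.
+*/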
+
+use core::{ascii, fmt, str};
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+pub mod alphabet;
+pub(crate) mod bytes;
+#[cfg(feature = "alloc")]
+pub(crate) mod determinize;
+pub mod id;
+#[cfg(feature = "alloc")]
+pub(crate) mod lazy;
+pub(crate) mod matchtypes;
+pub mod prefilter;
+#[cfg(feature = "alloc")]
+pub(crate) mod sparse_set;
+pub(crate) mod start;
+#[cfg(feature = "alloc")]
+pub(crate) mod syntax;
+
+/// The offset, in bytes, that a match is delayed by in the DFAs generated by
+/// this crate. (This includes lazy DFAs.)
+///
+/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
+/// and $. In particular, both of these operators may require the
+/// identification of the end of input in order to confirm a match. Not only
+/// does this mean that all matches must therefore be delayed by a single byte,
+/// but that a special EOI value is added to the alphabet of all DFAs. (Which
+/// means that even though the alphabet of a DFA is typically all byte values,
+/// the actual maximum alphabet size is 257 due to the extra EOI value.)
+///
+/// Since we delay matches by only 1 byte, this can't fully support a
+/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
+/// DFAs in this crate do not support it. (It's not as simple as just
+/// increasing the match offset to do it---otherwise we would---but building
+/// the full Unicode-aware word boundary detection into an automaton is quite
+/// tricky.)
+pub(crate) const MATCH_OFFSET: usize = 1;
+
+/// A newtype around a single byte whose `fmt::Debug` output is the escaped
+/// form of that byte (hex escapes are rendered with upper case digits).
+pub(crate) struct DebugByte(pub u8);
+
+impl fmt::Debug for DebugByte {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // ascii::escape_default never yields more than 10 bytes.
+        let mut buf = [0u8; 10];
+        let mut n = 0;
+        for (pos, ch) in ascii::escape_default(self.0).enumerate() {
+            // Only the hex digits of a `\xab` escape sit at index >= 2;
+            // turn them into `\xAB`.
+            let is_lower_hex = pos >= 2 && (b'a'..=b'f').contains(&ch);
+            buf[n] = if is_lower_hex { ch - 32 } else { ch };
+            n += 1;
+        }
+        write!(f, "{}", str::from_utf8(&buf[..n]).unwrap())
+    }
+}
+
+/// Returns the smallest possible index of the next valid UTF-8 sequence
+/// starting after `i`.
+///
+/// The result is always strictly greater than `i`, regardless of whether
+/// `text` is valid UTF-8 and regardless of the value of `i`. (If the sum
+/// would overflow `usize`, this panics.)
+///
+/// Generally speaking, this should only be called on `text` when it is
+/// permitted to assume that it is valid UTF-8 and where either `i >=
+/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
+#[inline(always)]
+pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
+    let step = match text.get(i) {
+        // Past the end of the haystack: just move one position forward.
+        None => 1,
+        // An invalid leading byte gives us nothing to go on, so we advance
+        // by a single byte in that case.
+        Some(&lead) => utf8_len(lead).unwrap_or(1),
+    };
+    i.checked_add(step).unwrap()
+}
+
+/// Returns true if and only if the given byte is an ASCII word character,
+/// i.e., `_`, an ASCII digit or an ASCII letter.
+///
+/// This was copied from regex-syntax so that we can use it to determine the
+/// starting DFA state while searching without depending on regex-syntax. The
+/// definition is never going to change, so there's no maintenance/bit-rot
+/// hazard here.
+#[inline(always)]
+pub(crate) fn is_word_byte(b: u8) -> bool {
+    b == b'_' || b.is_ascii_alphanumeric()
+}
+
+/// Decodes the UTF-8 encoded codepoint at the start of the given byte slice.
+///
+/// When the slice does not begin with a valid encoding of a codepoint, the
+/// first byte is returned as the error value instead.
+///
+/// This returns `None` if and only if `bytes` is empty.
+#[inline(always)]
+pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
+    let first = *bytes.first()?;
+    let width = match utf8_len(first) {
+        // ASCII fast path.
+        Some(1) => return Some(Ok(first as char)),
+        Some(w) if w <= bytes.len() => w,
+        // Either not a leading byte, or the sequence is truncated.
+        _ => return Some(Err(first)),
+    };
+    let decoded = match str::from_utf8(&bytes[..width]) {
+        Ok(s) => Ok(s.chars().next().unwrap()),
+        Err(_) => Err(first),
+    };
+    Some(decoded)
+}
+
+/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the end of the given byte
+/// slice, then the last byte is returned instead. In particular, a codepoint
+/// is only reported when its encoding extends all the way to the end of
+/// `bytes`. For example, `b"a\x80"` yields `Err(0x80)` and not `Ok('a')`.
+///
+/// This returns `None` if and only if `bytes` is empty.
+#[inline(always)]
+pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
+    let last = *bytes.last()?;
+    // Step backwards over continuation bytes, but never more than 3 steps
+    // since an encoded codepoint is at most 4 bytes long.
+    let mut start = bytes.len() - 1;
+    let limit = bytes.len().saturating_sub(4);
+    while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
+        start -= 1;
+    }
+    match decode_utf8(&bytes[start..]) {
+        None => None,
+        // Reject a codepoint that is followed by stray trailing bytes: in
+        // that case it is not the *last* thing in the slice.
+        Some(Ok(ch)) if start + ch.len_utf8() == bytes.len() => Some(Ok(ch)),
+        Some(_) => Some(Err(last)),
+    }
+}
+
+/// Given a UTF-8 leading byte, this returns the total number of code units
+/// in the encoded codepoint that it starts.
+///
+/// `None` is returned when the byte is a continuation byte (`10xxxxxx`) or
+/// is `0xF8` or greater, neither of which can start a sequence.
+#[inline(always)]
+fn utf8_len(byte: u8) -> Option<usize> {
+    match byte {
+        0x00..=0x7F => Some(1),
+        // Continuation bytes never lead a sequence.
+        0x80..=0xBF => None,
+        0xC0..=0xDF => Some(2),
+        0xE0..=0xEF => Some(3),
+        0xF0..=0xF7 => Some(4),
+        _ => None,
+    }
+}
+
+/// Returns true if and only if the given byte is not a UTF-8 continuation
+/// byte. That is, it is either a valid leading byte, or a byte that can
+/// never appear anywhere in a valid UTF-8 sequence.
+#[inline(always)]
+fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
+    // Continuation bytes are exactly those of the form 10xxxxxx, i.e., the
+    // range \x80-\xBF. Everything below that range is ASCII (top bit unset).
+    // Everything above it has its two most significant bits set, which
+    // covers both the leading bytes of 2/3/4-byte sequences and the bytes
+    // that are never valid in UTF-8:
+    //
+    //     \xC0 :: 11000000
+    //     \xC1 :: 11000001
+    //     \xF5 :: 11110101
+    //     \xF6 :: 11110110
+    //     \xF7 :: 11110111
+    //     \xF8 :: 11111000
+    //     \xF9 :: 11111001
+    //     \xFA :: 11111010
+    //     \xFB :: 11111011
+    //     \xFC :: 11111100
+    //     \xFD :: 11111101
+    //     \xFE :: 11111110
+    //     \xFF :: 11111111
+    b < 0x80 || b >= 0xC0
+}
+
+/// Returns true if and only if an anchored match of the pattern `\w` is
+/// found when scanning `bytes` forward starting at offset `at`.
+///
+/// The search is driven by a dense DFA for `\w` that is obtained through
+/// `lazy::get_or_init` on a function-local static (presumably built only on
+/// first use and shared afterwards; confirm against `util::lazy`). The
+/// pattern is compiled with the builder's default syntax settings, so
+/// whether `\w` is Unicode-aware depends on those defaults (TODO confirm).
+///
+/// If `at >= bytes.len()`, no bytes are consumed and the result is decided
+/// solely by the end-of-input transition from the start state.
+///
+/// # Panics
+///
+/// Panics if building the DFA fails (the build result is unwrapped).
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
+    use core::{ptr, sync::atomic::AtomicPtr};
+
+    use crate::{
+        dfa::{
+            dense::{self, DFA},
+            Automaton,
+        },
+        util::lazy,
+    };
+
+    // Holds the DFA handed back by `lazy::get_or_init` below.
+    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+    let dfa = lazy::get_or_init(&WORD, || {
+        // TODO: Should we use a lazy DFA here instead? It does complicate
+        // things somewhat, since we then need a mutable cache, which probably
+        // means a thread local.
+        dense::Builder::new()
+            .configure(dense::Config::new().anchored(true))
+            .build(r"\w")
+            .unwrap()
+    });
+    // This is OK since '\w' contains no look-around.
+    let mut sid = dfa.universal_start_state();
+    // Feed one byte at a time, stopping as soon as the DFA reports either a
+    // match or a dead state.
+    while at < bytes.len() {
+        let byte = bytes[at];
+        sid = dfa.next_state(sid, byte);
+        at += 1;
+        if dfa.is_special_state(sid) {
+            if dfa.is_match_state(sid) {
+                return true;
+            } else if dfa.is_dead_state(sid) {
+                return false;
+            }
+        }
+    }
+    // Input exhausted: take the end-of-input transition and check whether it
+    // lands in a match state.
+    dfa.is_match_state(dfa.next_eoi_state(sid))
+}
+
+/// Returns true if and only if an anchored match of the pattern `\w` is
+/// found when scanning `bytes` backward, starting with the byte immediately
+/// before offset `at` (i.e., `bytes[at - 1]`, then `bytes[at - 2]`, ...).
+///
+/// The search is driven by a dense DFA built from a reversed NFA for `\w`.
+/// The DFA is obtained through `lazy::get_or_init` on a function-local
+/// static (presumably built only on first use and shared afterwards; confirm
+/// against `util::lazy`).
+///
+/// If `at == 0`, no bytes are consumed and the result is decided solely by
+/// the end-of-input transition from the start state.
+///
+/// # Panics
+///
+/// Panics if building the DFA fails (the build result is unwrapped), or if
+/// `at > bytes.len()` (out-of-bounds index).
+#[cfg(feature = "alloc")]
+#[inline(always)]
+pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
+    use core::{ptr, sync::atomic::AtomicPtr};
+
+    use crate::{
+        dfa::{
+            dense::{self, DFA},
+            Automaton,
+        },
+        nfa::thompson::NFA,
+    };
+
+    // Holds the DFA handed back by `lazy::get_or_init` below.
+    static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
+
+    let dfa = lazy::get_or_init(&WORD, || {
+        dense::Builder::new()
+            .configure(dense::Config::new().anchored(true))
+            .thompson(NFA::config().reverse(true).shrink(true))
+            .build(r"\w")
+            .unwrap()
+    });
+
+    // This is OK since '\w' contains no look-around.
+    let mut sid = dfa.universal_start_state();
+    // Feed one byte at a time, walking toward the start of the haystack, and
+    // stop as soon as the DFA reports either a match or a dead state.
+    while at > 0 {
+        at -= 1;
+        let byte = bytes[at];
+        sid = dfa.next_state(sid, byte);
+        if dfa.is_special_state(sid) {
+            if dfa.is_match_state(sid) {
+                return true;
+            } else if dfa.is_dead_state(sid) {
+                return false;
+            }
+        }
+    }
+    // Input exhausted: take the end-of-input transition and check whether it
+    // lands in a match state.
+    dfa.is_match_state(dfa.next_eoi_state(sid))
+}
diff --git a/src/util/prefilter.rs b/src/util/prefilter.rs

new file mode 100644 (file)

index 0000000..5fe1515
--- /dev/null
+++ b/src/util/prefilter.rs
@@ -0,0 +1,281 @@
+use crate::Match;
+
+/// The outcome of running a prefilter on a haystack at a particular
+/// position: no match, a confirmed match, or a possible match.
+///
+/// A "no match" answer is a guarantee: the caller may trust that no match
+/// can be found in the haystack. Correct prefilters never report false
+/// negatives.
+///
+/// Sometimes a prefilter is able to confirm a match on its own, in which
+/// case the caller may stop and report that match. Prefilter implementations
+/// must never report a false positive this way. Otherwise, the prefilter
+/// only reports a potential match that the caller must confirm, and in that
+/// case false positives are permitted.
+#[derive(Clone, Debug)]
+pub enum Candidate {
+    /// The prefilter reports that no match is possible. Prefilter
+    /// implementations will never report false negatives.
+    None,
+    /// The prefilter reports that a match has been confirmed at the provided
+    /// byte offsets. When this variant is reported, the prefilter is
+    /// guaranteeing a match. No false positives are permitted.
+    Match(Match),
+    /// The prefilter reports that a match *may* start at the given position.
+    /// When this variant is reported, it may correspond to a false positive.
+    PossibleStartOfMatch(usize),
+}
+
+impl Candidate {
+    /// Collapse this candidate into an optional starting offset.
+    ///
+    /// This is handy for callers that treat confirmed and unconfirmed
+    /// candidates alike (i.e., callers that always confirm the match
+    /// themselves in order to update some other state).
+    ///
+    /// The offset returned, if any, is the starting position of the
+    /// (possible) match.
+    pub fn into_option(self) -> Option<usize> {
+        let start = match self {
+            Candidate::None => return None,
+            Candidate::PossibleStartOfMatch(at) => at,
+            Candidate::Match(m) => m.start(),
+        };
+        Some(start)
+    }
+}
+
+/// A prefilter describes the behavior of fast literal scanners for quickly
+/// skipping past bytes in the haystack that we know cannot possibly
+/// participate in a match.
+pub trait Prefilter: core::fmt::Debug {
+    /// Returns the next possible match candidate. This may yield false
+    /// positives, so callers must confirm a match starting at the position
+    /// returned. This, however, must never produce false negatives. That is,
+    /// this must, at minimum, return the starting position of the next match
+    /// in the given haystack after or at the given position.
+    fn next_candidate(
+        &self,
+        state: &mut State,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate;
+
+    /// Returns the approximate total amount of heap used by this prefilter, in
+    /// units of bytes.
+    fn heap_bytes(&self) -> usize;
+
+    /// Returns true if and only if this prefilter may return false positives
+    /// via the `Candidate::PossibleStartOfMatch` variant. This is most useful
+    /// when false positives are not possible (in which case, implementations
+    /// should return false), which may allow completely avoiding heavier regex
+    /// machinery when the prefilter can quickly confirm its own matches.
+    ///
+    /// By default, this returns true, which is conservative; it is always
+    /// correct to return `true`. Returning `false` here and reporting a false
+    /// positive will result in incorrect searches.
+    fn reports_false_positives(&self) -> bool {
+        true
+    }
+}
+
+/// A shared reference to a prefilter is itself a prefilter: every method
+/// simply forwards to the referenced implementation.
+impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
+    #[inline]
+    fn next_candidate(
+        &self,
+        state: &mut State,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate {
+        P::next_candidate(*self, state, haystack, at)
+    }
+
+    fn heap_bytes(&self) -> usize {
+        P::heap_bytes(*self)
+    }
+
+    fn reports_false_positives(&self) -> bool {
+        P::reports_false_positives(*self)
+    }
+}
+
+/// Pairs a borrowed prefilter with the effectiveness `State` that is
+/// updated each time the prefilter is run through this scanner.
+#[derive(Clone)]
+pub struct Scanner<'p> {
+    prefilter: &'p dyn Prefilter,
+    state: State,
+}
+
+impl<'p> Scanner<'p> {
+    /// Create a scanner for the given prefilter with fresh state.
+    pub fn new(prefilter: &'p dyn Prefilter) -> Scanner<'p> {
+        let state = State::new();
+        Scanner { prefilter, state }
+    }
+
+    /// Reports whether the tracked state still considers the prefilter
+    /// effective at position `at`.
+    pub(crate) fn is_effective(&mut self, at: usize) -> bool {
+        self.state.is_effective(at)
+    }
+
+    /// Forwards to the underlying prefilter's `reports_false_positives`.
+    pub(crate) fn reports_false_positives(&self) -> bool {
+        self.prefilter.reports_false_positives()
+    }
+
+    /// Runs the prefilter on `bytes` starting at `at`, records how many
+    /// bytes were skipped and returns the candidate unchanged.
+    pub(crate) fn next_candidate(
+        &mut self,
+        bytes: &[u8],
+        at: usize,
+    ) -> Candidate {
+        let cand = self.prefilter.next_candidate(&mut self.state, bytes, at);
+        // The distance between `at` and wherever the candidate begins (or
+        // the end of the haystack when there is no candidate).
+        let skipped = match cand {
+            Candidate::None => bytes.len() - at,
+            Candidate::Match(ref m) => m.start() - at,
+            Candidate::PossibleStartOfMatch(i) => i - at,
+        };
+        self.state.update_skipped_bytes(skipped);
+        cand
+    }
+}
+
+// Only the state is printed; the prefilter itself is omitted.
+impl<'p> core::fmt::Debug for Scanner<'p> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let mut dbg = f.debug_struct("Scanner");
+        dbg.field("state", &self.state);
+        dbg.finish()
+    }
+}
+
+/// State tracks state associated with the effectiveness of a
+/// prefilter. It is used to track how many bytes, on average, are skipped by
+/// the prefilter. If this average dips below a certain threshold over time,
+/// then the state renders the prefilter inert and stops using it.
+///
+/// A prefilter state should be created for each search. (Where creating an
+/// iterator via, e.g., `find_iter`, is treated as a single search.)
+#[derive(Clone, Debug)]
+pub struct State {
+    /// The number of skips that has been executed.
+    skips: usize,
+    /// The total number of bytes that have been skipped.
+    skipped: usize,
+    /// Once this heuristic has been deemed permanently ineffective, it will be
+    /// inert throughout the rest of its lifetime. This serves as a cheap way
+    /// to check inertness.
+    inert: bool,
+    /// The last (absolute) position at which a prefilter scanned to.
+    /// Prefilters can use this position to determine whether to re-scan or
+    /// not.
+    ///
+    /// Unlike other things that impact effectiveness, this is a fleeting
+    /// condition. That is, a prefilter can be considered ineffective if it is
+    /// at a position before `last_scan_at`, but can become effective again
+    /// once the search moves past `last_scan_at`.
+    ///
+    /// The utility of this is to both avoid additional overhead from calling
+    /// the prefilter and to avoid quadratic behavior. This ensures that a
+    /// prefilter will scan any particular byte at most once. (Note that some
+    /// prefilters, like the start-byte prefilter, do not need to use this
+    /// field at all, since it only looks for starting bytes.)
+    last_scan_at: usize,
+}
+
+impl State {
+    /// The minimum number of skip attempts to try before considering whether
+    /// a prefilter is effective or not.
+    const MIN_SKIPS: usize = 40;
+
+    /// The minimum amount of bytes that skipping must average.
+    ///
+    /// That is, after MIN_SKIPS have occurred, if the average number of bytes
+    /// skipped ever falls below MIN_AVG_SKIP, then the prefilter will be
+    /// rendered inert.
+    const MIN_AVG_SKIP: usize = 16;
+
+    /// Create a fresh prefilter state.
+    pub fn new() -> State {
+        State { skips: 0, skipped: 0, inert: false, last_scan_at: 0 }
+    }
+
+    /// Updates the position at which the last scan stopped. This may be
+    /// greater than the position of the last candidate reported. For example,
+    /// searching for the byte `z` in `abczdef` for the pattern `abcz` will
+    /// report a candidate at position `0`, but the end of its last scan will
+    /// be at position `3`.
+    ///
+    /// This position factors into the effectiveness of this prefilter. If the
+    /// current position is less than the last position at which a scan ended,
+    /// then the prefilter should not be re-run until the search moves past
+    /// that position.
+    ///
+    /// It is always correct to never update the last scan position. In fact,
+    /// it is also always correct to set the last scan position to an arbitrary
+    /// value. The key is setting it to a position in the future at which it
+    /// makes sense to restart the prefilter.
+    ///
+    /// The stored position never moves backwards: a value of `at` that is
+    /// not greater than the current one is ignored.
+    pub fn update_last_scan(&mut self, at: usize) {
+        if at > self.last_scan_at {
+            self.last_scan_at = at;
+        }
+    }
+
+    /// Return true if and only if this state indicates that a prefilter is
+    /// still effective. If the prefilter is not effective, then this state
+    /// is rendered "inert." At which point, all subsequent calls to
+    /// `is_effective` on this state will return `false`.
+    ///
+    /// `at` should correspond to the current starting position of the search.
+    /// When `at` is before the last scan position, this returns `false`
+    /// without rendering the state inert.
+    ///
+    /// This method is private to this module. Within the crate it is reached
+    /// through `Scanner::is_effective`.
+    fn is_effective(&mut self, at: usize) -> bool {
+        if self.inert {
+            return false;
+        }
+        if at < self.last_scan_at {
+            return false;
+        }
+        if self.skips < State::MIN_SKIPS {
+            return true;
+        }
+
+        // Saturate instead of wrapping so that an (unlikely) overflow cannot
+        // produce a bogus small threshold.
+        let min_total = State::MIN_AVG_SKIP.saturating_mul(self.skips);
+        if self.skipped >= min_total {
+            return true;
+        }
+
+        // We're inert.
+        self.inert = true;
+        false
+    }
+
+    /// Update this state with the number of bytes skipped on the last
+    /// invocation of the prefilter.
+    ///
+    /// Both counters saturate at `usize::MAX` rather than overflowing.
+    fn update_skipped_bytes(&mut self, skipped: usize) {
+        self.skips = self.skips.saturating_add(1);
+        self.skipped = self.skipped.saturating_add(skipped);
+    }
+}
+
+/// A `Prefilter` implementation that reports a possible match at every
+/// position.
+///
+/// This should generally not be used as an actual prefilter. It is only
+/// useful when one needs to represent the absence of a prefilter in a generic
+/// context. For example, a [`dfa::regex::Regex`](crate::dfa::regex::Regex)
+/// uses this prefilter by default to indicate that no prefilter should be
+/// used.
+///
+/// A `None` prefilter value cannot be constructed.
+#[derive(Clone, Debug)]
+pub struct None {
+    // Private field: prevents construction outside of this module.
+    _priv: (),
+}
+
+impl Prefilter for None {
+    /// Always reports `at` itself as a possible start of a match, without
+    /// inspecting the haystack or touching the state.
+    fn next_candidate(&self, _: &mut State, _: &[u8], at: usize) -> Candidate {
+        Candidate::PossibleStartOfMatch(at)
+    }
+
+    /// This prefilter reports zero heap usage.
+    fn heap_bytes(&self) -> usize {
+        0
+    }
+}
diff --git a/src/util/sparse_set.rs b/src/util/sparse_set.rs

new file mode 100644 (file)

index 0000000..bf59e44
--- /dev/null
+++ b/src/util/sparse_set.rs
@@ -0,0 +1,229 @@
+use alloc::{boxed::Box, vec, vec::Vec};
+
+use crate::util::id::StateID;
+
+/// A pair of sparse sets.
+///
+/// This is useful when one needs to compute NFA epsilon closures from a
+/// previous set of states derived from an epsilon closure. One set can be the
+/// starting states whereas the other set can be the destination states after
+/// following the transitions for a particular byte of input.
+///
+/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
+/// the same size.
+///
+/// The members of this struct are exposed so that callers may borrow 'set1'
+/// and 'set2' individually without being forced to borrow both at the same
+/// time.
+#[derive(Clone, Debug)]
+pub(crate) struct SparseSets {
+    pub(crate) set1: SparseSet,
+    pub(crate) set2: SparseSet,
+}
+
+impl SparseSets {
+    /// Create a new pair of sparse sets where each set has the given capacity.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    pub(crate) fn new(capacity: usize) -> SparseSets {
+        SparseSets {
+            set1: SparseSet::new(capacity),
+            set2: SparseSet::new(capacity),
+        }
+    }
+
+    /// Resizes these sparse sets to have the new capacity given.
+    ///
+    /// The sets are automatically cleared.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    #[inline]
+    pub(crate) fn resize(&mut self, new_capacity: usize) {
+        self.set1.resize(new_capacity);
+        self.set2.resize(new_capacity);
+    }
+
+    /// Clear both sparse sets.
+    pub(crate) fn clear(&mut self) {
+        self.set1.clear();
+        self.set2.clear();
+    }
+
+    /// Swap set1 with set2.
+    pub(crate) fn swap(&mut self) {
+        core::mem::swap(&mut self.set1, &mut self.set2);
+    }
+
+    /// Returns the memory usage, in bytes, used by this pair of sparse sets.
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.set1.memory_usage() + self.set2.memory_usage()
+    }
+}
+
+/// A sparse set used for representing ordered NFA states.
+///
+/// This supports constant time addition and membership testing. Clearing an
+/// entire set can also be done in constant time. Iteration yields elements
+/// in the order in which they were inserted.
+///
+/// The data structure is based on: https://research.swtch.com/sparse
+/// Note though that we don't actually use uninitialized memory. We generally
+/// reuse sparse sets, so the initial allocation cost is bearable. However, its
+/// other properties listed above are extremely useful.
+#[derive(Clone)]
+pub(crate) struct SparseSet {
+    /// The number of elements currently in this set.
+    len: usize,
+    /// Dense contains the ids in the order in which they were inserted.
+    ///
+    /// Only the first `len` entries are members of the set. The length of
+    /// this vector is the capacity of the set.
+    dense: Vec<StateID>,
+    /// Sparse maps ids to their location in dense.
+    ///
+    /// A state ID is in the set if and only if
+    /// sparse[id] < len && id == dense[sparse[id]].
+    sparse: Vec<StateID>,
+}
+
+impl SparseSet {
+    /// Create a new sparse set with the given capacity.
+    ///
+    /// Sparse sets have a fixed size and they cannot grow. Attempting to
+    /// insert more distinct elements than the total capacity of the set will
+    /// result in a panic.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    #[inline]
+    pub(crate) fn new(capacity: usize) -> SparseSet {
+        let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
+        set.resize(capacity);
+        set
+    }
+
+    /// Resizes this sparse set to have the new capacity given.
+    ///
+    /// This set is automatically cleared.
+    ///
+    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
+    #[inline]
+    pub(crate) fn resize(&mut self, new_capacity: usize) {
+        assert!(
+            new_capacity <= StateID::LIMIT,
+            "sparse set capacity cannot exceed {:?}",
+            StateID::LIMIT
+        );
+        self.clear();
+        self.dense.resize(new_capacity, StateID::ZERO);
+        self.sparse.resize(new_capacity, StateID::ZERO);
+    }
+
+    /// Returns the capacity of this set.
+    ///
+    /// The capacity represents a fixed limit on the number of distinct
+    /// elements that are allowed in this set. The capacity never changes
+    /// implicitly; it can only be changed by calling `resize`, which also
+    /// clears the set.
+    #[inline]
+    pub(crate) fn capacity(&self) -> usize {
+        self.dense.len()
+    }
+
+    /// Returns the number of elements in this set.
+    #[inline]
+    pub(crate) fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns true if and only if this set is empty.
+    #[inline]
+    pub(crate) fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Insert the state ID value into this set and return true if the given
+    /// state ID was not previously in this set.
+    ///
+    /// This operation is idempotent. If the given value is already in this
+    /// set, then this is a no-op.
+    ///
+    /// If more than `capacity` ids are inserted, then this panics.
+    ///
+    /// This is marked as inline(always) since the compiler won't inline it
+    /// otherwise, and it's a fairly hot piece of code in DFA determinization.
+    #[inline(always)]
+    pub(crate) fn insert(&mut self, value: StateID) -> bool {
+        if self.contains(value) {
+            return false;
+        }
+
+        let i = self.len();
+        assert!(
+            i < self.capacity(),
+            "{:?} exceeds capacity of {:?} when inserting {:?}",
+            i,
+            self.capacity(),
+            value,
+        );
+        // OK since i < self.capacity() and self.capacity() is guaranteed to
+        // be <= StateID::LIMIT.
+        let id = StateID::new_unchecked(i);
+        self.dense[id] = value;
+        self.sparse[value] = id;
+        self.len += 1;
+        true
+    }
+
+    /// Returns true if and only if this set contains the given value.
+    ///
+    /// Panics if the value, used as an index, is not smaller than the
+    /// capacity of this set.
+    #[inline]
+    pub(crate) fn contains(&self, value: StateID) -> bool {
+        let i = self.sparse[value];
+        i.as_usize() < self.len() && self.dense[i] == value
+    }
+
+    /// Returns the ith inserted element from this set.
+    ///
+    /// Callers must ensure that `i < self.len()`. This only panics when
+    /// `i >= self.capacity()`. For `self.len() <= i < self.capacity()`, the
+    /// value returned is whatever happens to be stored in that slot (the
+    /// zero ID or an ID left over from before the last `clear`) and is not
+    /// a member of the set.
+    #[inline]
+    pub(crate) fn get(&self, i: usize) -> StateID {
+        self.dense[i]
+    }
+
+    /// Clear this set such that it has no members.
+    ///
+    /// This only resets the length; the backing storage is left untouched.
+    #[inline]
+    pub(crate) fn clear(&mut self) {
+        self.len = 0;
+    }
+
+    /// Returns the heap memory usage, in bytes, used by this sparse set.
+    #[inline]
+    pub(crate) fn memory_usage(&self) -> usize {
+        // `dense` and `sparse` always have the same length.
+        2 * self.dense.len() * StateID::SIZE
+    }
+}
+
+// Prints only the members of the set, in insertion order.
+impl core::fmt::Debug for SparseSet {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let members = self.into_iter().collect::<Vec<StateID>>();
+        let mut dbg = f.debug_tuple("SparseSet");
+        dbg.field(&members);
+        dbg.finish()
+    }
+}
+
+/// An iterator over the members of a sparse set, in insertion order.
+///
+/// The lifetime `'a` refers to the lifetime of the set being iterated over.
+#[derive(Debug)]
+pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
+
+impl<'a> IntoIterator for &'a SparseSet {
+    type Item = StateID;
+    type IntoIter = SparseSetIter<'a>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        // Only the first `len` slots of `dense` hold members.
+        let members = &self.dense[..self.len()];
+        SparseSetIter(members.iter())
+    }
+}
+
+impl<'a> Iterator for SparseSetIter<'a> {
+    type Item = StateID;
+
+    #[inline(always)]
+    fn next(&mut self) -> Option<StateID> {
+        self.0.next().copied()
+    }
+}
diff --git a/src/util/start.rs b/src/util/start.rs

new file mode 100644 (file)

index 0000000..3c756fc
--- /dev/null
+++ b/src/util/start.rs
@@ -0,0 +1,109 @@
+/// Represents the four possible starting configurations of a DFA search.
+///
+/// The starting configuration is determined by inspecting the beginning of
+/// the haystack (up to 1 byte). Ultimately, this along with a pattern ID (if
+/// specified) is what selects the start state to use in a DFA.
+///
+/// In a DFA that doesn't have starting states for each pattern, then it will
+/// have a maximum of four DFA start states. If the DFA was compiled with start
+/// states for each pattern, then it will have a maximum of four DFA start
+/// states for searching for any pattern, and then another maximum of four DFA
+/// start states for executing an anchored search for each pattern.
+///
+/// This ends up being represented as a table in the DFA (whether lazy or fully
+/// built) where the stride of that table is 4, and each entry is an index into
+/// the state transition table. Note though that multiple entries in the table
+/// might point to the same state if the states would otherwise be equivalent.
+/// (This is guaranteed by DFA minimization and may even be accomplished by
+/// normal determinization, since it attempts to reuse equivalent states too.)
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) enum Start {
+    /// This occurs when the starting position is not any of the ones below.
+    NonWordByte = 0,
+    /// This occurs when the byte immediately preceding the start of the search
+    /// is an ASCII word byte.
+    WordByte = 1,
+    /// This occurs when the starting position of the search corresponds to the
+    /// beginning of the haystack.
+    Text = 2,
+    /// This occurs when the byte immediately preceding the start of the search
+    /// is a line terminator. Specifically, `\n`.
+    Line = 3,
+}
+
+impl Start {
+    /// Return the starting state corresponding to the given integer. If no
+    /// starting state exists for the given integer, then None is returned.
+    pub(crate) fn from_usize(n: usize) -> Option<Start> {
+        match n {
+            0 => Some(Start::NonWordByte),
+            1 => Some(Start::WordByte),
+            2 => Some(Start::Text),
+            3 => Some(Start::Line),
+            _ => None,
+        }
+    }
+
+    /// Returns the total number of starting state configurations.
+    pub(crate) fn count() -> usize {
+        4
+    }
+
+    /// Returns the starting state configuration for the given search
+    /// parameters. If the given offset range is not valid, then this panics.
+    #[inline(always)]
+    pub(crate) fn from_position_fwd(
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Start {
+        assert!(
+            bytes.get(start..end).is_some(),
+            "{}..{} is invalid",
+            start,
+            end
+        );
+        if start == 0 {
+            Start::Text
+        } else if bytes[start - 1] == b'\n' {
+            Start::Line
+        } else if crate::util::is_word_byte(bytes[start - 1]) {
+            Start::WordByte
+        } else {
+            Start::NonWordByte
+        }
+    }
+
+    /// Returns the starting state configuration for a reverse search with the
+    /// given search parameters. If the given offset range is not valid, then
+    /// this panics.
+    #[inline(always)]
+    pub(crate) fn from_position_rev(
+        bytes: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Start {
+        assert!(
+            bytes.get(start..end).is_some(),
+            "{}..{} is invalid",
+            start,
+            end
+        );
+        if end == bytes.len() {
+            Start::Text
+        } else if bytes[end] == b'\n' {
+            Start::Line
+        } else if crate::util::is_word_byte(bytes[end]) {
+            Start::WordByte
+        } else {
+            Start::NonWordByte
+        }
+    }
+
+    /// Return this starting configuration as an integer. It is guaranteed to
+    /// be less than `Start::count()`.
+    #[inline(always)]
+    pub(crate) fn as_usize(&self) -> usize {
+        *self as usize
+    }
+}
diff --git a/src/util/syntax.rs b/src/util/syntax.rs

new file mode 100644 (file)

index 0000000..88beeee
--- /dev/null
+++ b/src/util/syntax.rs
@@ -0,0 +1,272 @@
+use regex_syntax::ParserBuilder;
+
+/// A common set of configuration options that apply to the syntax of a regex.
+///
+/// This represents a group of configuration options that specifically apply
+/// to how the concrete syntax of a regular expression is interpreted. In
+/// particular, they are generally forwarded to the
+/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
+/// in the
+/// [`regex-syntax`](https://docs.rs/regex-syntax)
+/// crate when building a regex from its concrete syntax directly.
+///
+/// These options are defined as a group since they apply to every regex engine
+/// in this crate. Instead of re-defining them on every engine's builder, they
+/// are instead provided here as one cohesive unit.
+#[derive(Clone, Copy, Debug)]
+pub struct SyntaxConfig {
+    case_insensitive: bool,
+    multi_line: bool,
+    dot_matches_new_line: bool,
+    swap_greed: bool,
+    ignore_whitespace: bool,
+    unicode: bool,
+    utf8: bool,
+    nest_limit: u32,
+    octal: bool,
+}
+
+impl SyntaxConfig {
+    /// Return a new default syntax configuration.
+    pub fn new() -> SyntaxConfig {
+        // These defaults match the ones used in regex-syntax.
+        SyntaxConfig {
+            case_insensitive: false,
+            multi_line: false,
+            dot_matches_new_line: false,
+            swap_greed: false,
+            ignore_whitespace: false,
+            unicode: true,
+            utf8: true,
+            nest_limit: 250,
+            octal: false,
+        }
+    }
+
+    /// Enable or disable the case insensitive flag by default.
+    ///
+    /// When Unicode mode is enabled, case insensitivity is Unicode-aware.
+    /// Specifically, it will apply the "simple" case folding rules as
+    /// specified by Unicode.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `i` flag.
+    pub fn case_insensitive(mut self, yes: bool) -> SyntaxConfig {
+        self.case_insensitive = yes;
+        self
+    }
+
+    /// Enable or disable the multi-line matching flag by default.
+    ///
+    /// When this is enabled, the `^` and `$` look-around assertions will
+    /// match immediately after and immediately before a new line character,
+    /// respectively. Note that the `\A` and `\z` look-around assertions are
+    /// unaffected by this setting and always correspond to matching at the
+    /// beginning and end of the input.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `m` flag.
+    pub fn multi_line(mut self, yes: bool) -> SyntaxConfig {
+        self.multi_line = yes;
+        self
+    }
+
+    /// Enable or disable the "dot matches any character" flag by default.
+    ///
+    /// When this is enabled, `.` will match any character. When it's disabled,
+    /// then `.` will match any character except for a new line character.
+    ///
+    /// Note that `.` is impacted by whether the "unicode" setting is enabled
+    /// or not. When Unicode is enabled (the default), `.` will match any UTF-8
+    /// encoding of any Unicode scalar value (sans a new line, depending on
+    /// whether this "dot matches new line" option is enabled). When Unicode
+    /// mode is disabled, `.` will match any byte instead. Because of this,
+    /// when Unicode mode is disabled, `.` can only be used when the "allow
+    /// invalid UTF-8" option is enabled, since `.` could otherwise match
+    /// invalid UTF-8.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `s` flag.
+    pub fn dot_matches_new_line(mut self, yes: bool) -> SyntaxConfig {
+        self.dot_matches_new_line = yes;
+        self
+    }
+
+    /// Enable or disable the "swap greed" flag by default.
+    ///
+    /// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
+    /// will become greedy.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `U` flag.
+    pub fn swap_greed(mut self, yes: bool) -> SyntaxConfig {
+        self.swap_greed = yes;
+        self
+    }
+
+    /// Enable verbose mode in the regular expression.
+    ///
+    /// When enabled, verbose mode permits insignificant whitespace in many
+    /// places in the regular expression, as well as comments. Comments are
+    /// started using `#` and continue until the end of the line.
+    ///
+    /// By default, this is disabled. It may be selectively enabled in the
+    /// regular expression by using the `x` flag regardless of this setting.
+    pub fn ignore_whitespace(mut self, yes: bool) -> SyntaxConfig {
+        self.ignore_whitespace = yes;
+        self
+    }
+
+    /// Enable or disable the Unicode flag (`u`) by default.
+    ///
+    /// By default this is **enabled**. It may alternatively be selectively
+    /// disabled in the regular expression itself via the `u` flag.
+    ///
+    /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
+    /// default), a regular expression will fail to parse if Unicode mode is
+    /// disabled and a sub-expression could possibly match invalid UTF-8.
+    ///
+    /// **WARNING**: Unicode mode can greatly increase the size of the compiled
+    /// DFA, which can noticeably impact both memory usage and compilation
+    /// time. This is especially noticeable if your regex contains character
+    /// classes like `\w` that are impacted by whether Unicode is enabled or
+    /// not. If Unicode is not necessary, you are encouraged to disable it.
+    pub fn unicode(mut self, yes: bool) -> SyntaxConfig {
+        self.unicode = yes;
+        self
+    }
+
+    /// When disabled, the builder will permit the construction of a regular
+    /// expression that may match invalid UTF-8.
+    ///
+    /// For example, when [`SyntaxConfig::unicode`] is disabled, then
+    /// expressions like `[^a]` may match invalid UTF-8 since they can match
+    /// any single byte that is not `a`. By default, these sub-expressions
+    /// are disallowed to avoid returning offsets that split a UTF-8
+    /// encoded codepoint. However, in cases where matching at arbitrary
+    /// locations is desired, this option can be disabled to permit all such
+    /// sub-expressions.
+    ///
+    /// When enabled (the default), the builder is guaranteed to produce a
+    /// regex that will only ever match valid UTF-8 (otherwise, the builder
+    /// will return an error).
+    pub fn utf8(mut self, yes: bool) -> SyntaxConfig {
+        self.utf8 = yes;
+        self
+    }
+
+    /// Set the nesting limit used for the regular expression parser.
+    ///
+    /// The nesting limit controls how deep the abstract syntax tree is allowed
+    /// to be. If the AST exceeds the given limit (e.g., with too many nested
+    /// groups), then an error is returned by the parser.
+    ///
+    /// The purpose of this limit is to act as a heuristic to prevent stack
+    /// overflow when building a finite automaton from a regular expression's
+    /// abstract syntax tree. In particular, construction currently uses
+    /// recursion. In the future, the implementation may stop using recursion
+    /// and this option will no longer be necessary.
+    ///
+    /// This limit is not checked until the entire AST is parsed. Therefore,
+    /// if callers want to put a limit on the amount of heap space used, then
+    /// they should impose a limit on the length, in bytes, of the concrete
+    /// pattern string. In particular, this is viable since the parser will
+    /// limit itself to heap space proportional to the length of the pattern
+    /// string.
+    ///
+    /// Note that a nest limit of `0` will return a nest limit error for most
+    /// patterns but not all. For example, a nest limit of `0` permits `a` but
+    /// not `ab`, since `ab` requires a concatenation AST item, which results
+    /// in a nest depth of `1`. In general, a nest limit is not something that
+    /// manifests in an obvious way in the concrete syntax, therefore, it
+    /// should not be used in a granular way.
+    pub fn nest_limit(mut self, limit: u32) -> SyntaxConfig {
+        self.nest_limit = limit;
+        self
+    }
+
+    /// Whether to support octal syntax or not.
+    ///
+    /// Octal syntax is a little-known way of uttering Unicode codepoints in
+    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
+    /// `\141` are all equivalent regular expressions, where the last example
+    /// shows octal syntax.
+    ///
+    /// While supporting octal syntax isn't in and of itself a problem, it does
+    /// make good error messages harder. That is, in PCRE based regex engines,
+    /// syntax like `\1` invokes a backreference, which is explicitly
+    /// unsupported in Rust's regex engine. However, many users expect it to
+    /// be supported. Therefore, when octal support is disabled, the error
+    /// message will explicitly mention that backreferences aren't supported.
+    ///
+    /// Octal syntax is disabled by default.
+    pub fn octal(mut self, yes: bool) -> SyntaxConfig {
+        self.octal = yes;
+        self
+    }
+
+    /// Returns whether "unicode" mode is enabled.
+    pub fn get_unicode(&self) -> bool {
+        self.unicode
+    }
+
+    /// Returns whether "case insensitive" mode is enabled.
+    pub fn get_case_insensitive(&self) -> bool {
+        self.case_insensitive
+    }
+
+    /// Returns whether "multi line" mode is enabled.
+    pub fn get_multi_line(&self) -> bool {
+        self.multi_line
+    }
+
+    /// Returns whether "dot matches new line" mode is enabled.
+    pub fn get_dot_matches_new_line(&self) -> bool {
+        self.dot_matches_new_line
+    }
+
+    /// Returns whether "swap greed" mode is enabled.
+    pub fn get_swap_greed(&self) -> bool {
+        self.swap_greed
+    }
+
+    /// Returns whether "ignore whitespace" mode is enabled.
+    pub fn get_ignore_whitespace(&self) -> bool {
+        self.ignore_whitespace
+    }
+
+    /// Returns whether UTF-8 mode is enabled.
+    pub fn get_utf8(&self) -> bool {
+        self.utf8
+    }
+
+    /// Returns the "nest limit" setting.
+    pub fn get_nest_limit(&self) -> u32 {
+        self.nest_limit
+    }
+
+    /// Returns whether "octal" mode is enabled.
+    pub fn get_octal(&self) -> bool {
+        self.octal
+    }
+
+    /// Applies this configuration to the given parser.
+    pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
+        builder
+            .unicode(self.unicode)
+            .case_insensitive(self.case_insensitive)
+            .multi_line(self.multi_line)
+            .dot_matches_new_line(self.dot_matches_new_line)
+            .swap_greed(self.swap_greed)
+            .ignore_whitespace(self.ignore_whitespace)
+            .allow_invalid_utf8(!self.utf8)
+            .nest_limit(self.nest_limit)
+            .octal(self.octal);
+    }
+}
+
+impl Default for SyntaxConfig {
+    fn default() -> SyntaxConfig {
+        SyntaxConfig::new()
+    }
+}
diff --git a/tests/data/bytes.toml b/tests/data/bytes.toml

new file mode 100644 (file)

index 0000000..eb3a094
--- /dev/null
+++ b/tests/data/bytes.toml
@@ -0,0 +1,235 @@
+# These are tests specifically crafted for regexes that can match arbitrary
+# bytes. In some cases, we also test the Unicode variant as well, just because
+# it's good sense to do so. But also, these tests aren't really about Unicode,
+# but whether matches are only reported at valid UTF-8 boundaries. For most
+# tests in this entire collection, utf8 = true. But for these tests, we use
+# utf8 = false.
+
+[[tests]]
+name = "word-boundary-ascii"
+regex = ' \b'
+input = " δ"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-unicode"
+regex = ' \b'
+input = " δ"
+matches = [[0, 1]]
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "word-boundary-ascii-not"
+regex = ' \B'
+input = " δ"
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-unicode-not"
+regex = ' \B'
+input = " δ"
+matches = []
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "perl-word-ascii"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "perl-word-unicode"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 3]]
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "perl-decimal-ascii"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 1], [7, 8]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "perl-decimal-unicode"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "perl-whitespace-ascii"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "perl-whitespace-unicode"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 4]]
+unicode = true
+utf8 = false
+
+# The first `(.+)` matches two Unicode codepoints, but can't match the 5th
+# byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
+# matches.
+[[tests]]
+name = "mixed-dot"
+regex = '(.+)(?-u)(.+)'
+input = '\xCE\x93\xCE\x94\xFF'
+captures = [
+  [[0, 5], [0, 4], [4, 5]],
+]
+unescape = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "case-one-ascii"
+regex = 'a'
+input = "A"
+matches = [[0, 1]]
+case_insensitive = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "case-one-unicode"
+regex = 'a'
+input = "A"
+matches = [[0, 1]]
+case_insensitive = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "case-class-simple-ascii"
+regex = '[a-z]+'
+input = "AaAaA"
+matches = [[0, 5]]
+case_insensitive = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "case-class-ascii"
+regex = '[a-z]+'
+input = "aA\u212AaA"
+matches = [[0, 2], [5, 7]]
+case_insensitive = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "case-class-unicode"
+regex = '[a-z]+'
+input = "aA\u212AaA"
+matches = [[0, 7]]
+case_insensitive = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "negate-ascii"
+regex = '[^a]'
+input = "δ"
+matches = [[0, 1], [1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "negate-unicode"
+regex = '[^a]'
+input = "δ"
+matches = [[0, 2]]
+unicode = true
+utf8 = false
+
+# When utf8=true, this won't match, because the implicit '.*?' prefix is
+# Unicode aware and will refuse to match through invalid UTF-8 bytes.
+[[tests]]
+name = "dotstar-prefix-ascii"
+regex = 'a'
+input = '\xFFa'
+matches = [[1, 2]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "dotstar-prefix-unicode"
+regex = 'a'
+input = '\xFFa'
+matches = [[1, 2]]
+unescape = true
+unicode = true
+utf8 = false
+
+[[tests]]
+name = "null-bytes"
+regex = '(?P<cstr>[^\x00]+)\x00'
+input = 'foo\x00'
+captures = [
+  [[0, 4], [0, 3]],
+]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "invalid-utf8-anchor-100"
+regex = '\xCC?^'
+input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
+matches = [[0, 0]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "invalid-utf8-anchor-200"
+regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
+input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
+matches = [[22, 22]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "invalid-utf8-anchor-300"
+regex = '^|ddp\xff\xffdddddlQd@\x80'
+input = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
+matches = [[0, 0]]
+unescape = true
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-ascii-100"
+regex = '\Bx\B'
+input = "áxβ"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "word-boundary-ascii-200"
+regex = '\B'
+input = "0\U0007EF5E"
+matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
+unicode = false
+utf8 = false
diff --git a/tests/data/crazy.toml b/tests/data/crazy.toml

new file mode 100644 (file)

index 0000000..549b86c
--- /dev/null
+++ b/tests/data/crazy.toml
@@ -0,0 +1,302 @@
+# TODO: There are still a couple of manually written tests in crazy.rs.
+
+[[tests]]
+name = "ranges"
+regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
+input = "num: 255"
+matches = [[5, 8]]
+
+[[tests]]
+name = "ranges-not"
+regex = '(?-u)\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b'
+input = "num: 256"
+matches = []
+
+[[tests]]
+name = "float1"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "0.1"
+matches = [[0, 3]]
+
+[[tests]]
+name = "float2"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "0.1.2"
+matches = [[0, 3]]
+match_limit = 1
+
+[[tests]]
+name = "float3"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "a1.2"
+matches = [[1, 4]]
+
+[[tests]]
+name = "float4"
+regex = '[-+]?[0-9]*\.?[0-9]+'
+input = "1.a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "float5"
+regex = '^[-+]?[0-9]*\.?[0-9]+$'
+input = "1.a"
+matches = []
+
+[[tests]]
+name = "email"
+regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
+input = "mine is jam.slam@gmail.com "
+matches = [[8, 26]]
+
+[[tests]]
+name = "email-not"
+regex = '(?i-u)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
+input = "mine is jam.slam@gmail "
+matches = []
+
+[[tests]]
+name = "email-big"
+regex = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
+input = "mine is jam.slam@gmail.com "
+matches = [[8, 26]]
+
+[[tests]]
+name = "date1"
+regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
+input = "1900-01-01"
+matches = [[0, 10]]
+
+[[tests]]
+name = "date2"
+regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
+input = "1900-00-01"
+matches = []
+
+[[tests]]
+name = "date3"
+regex = '(?-u)^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$'
+input = "1900-13-01"
+matches = []
+
+[[tests]]
+name = "start-end-empty"
+regex = '^$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-rev"
+regex = '$^'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-many-1"
+regex = '^$^$^$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-many-2"
+regex = '^^^$$$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "start-end-empty-rep"
+regex = '(?:^$)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "start-end-empty-rep-rev"
+regex = '(?:$^)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "neg-class-letter"
+regex = '[^ac]'
+input = "acx"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-letter-comma"
+regex = '[^a,]'
+input = "a,x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-letter-space"
+regex = '[^a[:space:]]'
+input = "a x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-comma"
+regex = '[^,]'
+input = ",,x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-space"
+regex = '[^[:space:]]'
+input = " a"
+matches = [[1, 2]]
+
+[[tests]]
+name = "neg-class-space-comma"
+regex = '[^,[:space:]]'
+input = ", a"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-comma-space"
+regex = '[^[:space:],]'
+input = " ,a"
+matches = [[2, 3]]
+
+[[tests]]
+name = "neg-class-ascii"
+regex = '[^[:alpha:]Z]'
+input = "A1"
+matches = [[1, 2]]
+
+[[tests]]
+name = "lazy-many-many"
+regex = '((?:.*)*?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-many-optional"
+regex = '((?:.?)*?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-one-many-many"
+regex = '((?:.*)+?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-one-many-optional"
+regex = '((?:.?)+?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-range-min-many"
+regex = '((?:.*){1,}?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "lazy-range-many"
+regex = '((?:.*){1,2}?)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-many-many"
+regex = '((?:.*)*)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-many-optional"
+regex = '((?:.?)*)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-one-many-many"
+regex = '((?:.*)+)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-one-many-optional"
+regex = '((?:.?)+)='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-range-min-many"
+regex = '((?:.*){1,})='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "greedy-range-many"
+regex = '((?:.*){1,2})='
+input = "a=b"
+matches = [[0, 2]]
+
+[[tests]]
+name = "empty1"
+regex = ''
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "empty2"
+regex = ''
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty3"
+regex = '()'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty4"
+regex = '()*'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty5"
+regex = '()+'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty6"
+regex = '()?'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty7"
+regex = '()()'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty8"
+regex = '()+|z'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty9"
+regex = 'z|()+'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty10"
+regex = '()+|b'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty11"
+regex = 'b|()+'
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
diff --git a/tests/data/earliest.toml b/tests/data/earliest.toml

new file mode 100644 (file)

index 0000000..6714a85
--- /dev/null
+++ b/tests/data/earliest.toml
@@ -0,0 +1,48 @@
+[[tests]]
+name = "no-greedy-100"
+regex = 'a+'
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+search_kind = "earliest"
+
+[[tests]]
+name = "no-greedy-200"
+regex = 'abc+'
+input = "zzzabccc"
+matches = [[3, 6]]
+search_kind = "earliest"
+
+[[tests]]
+name = "is-ungreedy"
+regex = 'a+?'
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+search_kind = "earliest"
+
+[[tests]]
+name = "look-start-test"
+regex = '^(abc|a)'
+input = "abc"
+matches = [[0, 1]]
+search_kind = "earliest"
+
+[[tests]]
+name = "look-end-test"
+regex = '(abc|a)$'
+input = "abc"
+matches = [[0, 3]]
+search_kind = "earliest"
+
+[[tests]]
+name = "no-leftmost-first-100"
+regex = 'abc|a'
+input = "abc"
+matches = [[0, 1]]
+search_kind = "earliest"
+
+[[tests]]
+name = "no-leftmost-first-200"
+regex = 'aba|a'
+input = "aba"
+matches = [[0, 1], [2, 3]]
+search_kind = "earliest"
diff --git a/tests/data/empty.toml b/tests/data/empty.toml

new file mode 100644 (file)

index 0000000..ad703e6
--- /dev/null
+++ b/tests/data/empty.toml
@@ -0,0 +1,113 @@
+[[tests]]
+name = "100"
+regex = "|b"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "110"
+regex = "b|"
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "120"
+regex = "|z"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "130"
+regex = "z|"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "200"
+regex = "|"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "210"
+regex = "||"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "220"
+regex = "||b"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "230"
+regex = "b||"
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "240"
+regex = "||z"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "300"
+regex = "(?:)|b"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "310"
+regex = "b|(?:)"
+input = "abc"
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "320"
+regex = "(?:|)"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "330"
+regex = "(?:|)|z"
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "400"
+regex = "a(?:)|b"
+input = "abc"
+matches = [[0, 1], [1, 2]]
+
+[[tests]]
+name = "500"
+regex = ""
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "510"
+regex = ""
+input = "a"
+matches = [[0, 0], [1, 1]]
+
+[[tests]]
+name = "520"
+regex = ""
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "600"
+regex = '(|a)*'
+input = "aaa"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "610"
+regex = '(|a)+'
+input = "aaa"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
diff --git a/tests/data/expensive.toml b/tests/data/expensive.toml

new file mode 100644 (file)

index 0000000..e062e39
--- /dev/null
+++ b/tests/data/expensive.toml
@@ -0,0 +1,12 @@
+# These represent tests that may be expensive to run on some regex engines. For
+# example, tests that build a full DFA ahead of time and minimize it can take a
+# horrendously long time on regexes that are large (or result in an explosion
+# in the number of states). We group these tests together so that such engines
+# can simply skip these tests.
+
+# See: https://github.com/rust-lang/regex/issues/98
+[[tests]]
+name = "regression-many-repeat-no-stack-overflow"
+regex = '^.{1,2500}'
+input = "a"
+matches = [[0, 1]]
diff --git a/tests/data/flags.toml b/tests/data/flags.toml

new file mode 100644 (file)

index 0000000..2b631ef
--- /dev/null
+++ b/tests/data/flags.toml
@@ -0,0 +1,67 @@
+[[tests]]
+name = "1"
+regex = "(?i)abc"
+input = "ABC"
+matches = [[0, 3]]
+
+[[tests]]
+name = "2"
+regex = "(?i)a(?-i)bc"
+input = "Abc"
+matches = [[0, 3]]
+
+[[tests]]
+name = "3"
+regex = "(?i)a(?-i)bc"
+input = "ABC"
+matches = []
+
+[[tests]]
+name = "4"
+regex = "(?is)a."
+input = "A\n"
+matches = [[0, 2]]
+
+[[tests]]
+name = "5"
+regex = "(?is)a.(?-is)a."
+input = "A\nab"
+matches = [[0, 4]]
+
+[[tests]]
+name = "6"
+regex = "(?is)a.(?-is)a."
+input = "A\na\n"
+matches = []
+
+[[tests]]
+name = "7"
+regex = "(?is)a.(?-is:a.)?"
+input = "A\na\n"
+matches = [[0, 2]]
+match_limit = 1
+
+[[tests]]
+name = "8"
+regex = "(?U)a+"
+input = "aa"
+matches = [[0, 1]]
+match_limit = 1
+
+[[tests]]
+name = "9"
+regex = "(?U)a+?"
+input = "aa"
+matches = [[0, 2]]
+
+[[tests]]
+name = "10"
+regex = "(?U)(?-U)a+"
+input = "aa"
+matches = [[0, 2]]
+
+[[tests]]
+name = "11"
+regex = '(?m)(?:^\d+$\n?)+'
+input = "123\n456\n789"
+matches = [[0, 11]]
diff --git a/tests/data/fowler/basic.toml b/tests/data/fowler/basic.toml

new file mode 100644 (file)

index 0000000..c965f26
--- /dev/null
+++ b/tests/data/fowler/basic.toml
@@ -0,0 +1,1638 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "basic3"
+regex = '''abracadabra$'''
+input = '''abracadabracadabra'''
+captures = [[[7, 18]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic4"
+regex = '''a...b'''
+input = '''abababbb'''
+captures = [[[2, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic5"
+regex = '''XXXXXX'''
+input = '''..XXXXXX'''
+captures = [[[2, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic6"
+regex = '''\)'''
+input = '''()'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic7"
+regex = '''a]'''
+input = '''a]a'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic9"
+regex = '''\}'''
+input = '''}'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic10"
+regex = '''\]'''
+input = ''']'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic12"
+regex = ''']'''
+input = ''']'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic15"
+regex = '''^a'''
+input = '''ax'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic16"
+regex = '''\^a'''
+input = '''a^a'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic17"
+regex = '''a\^'''
+input = '''a^'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic18"
+regex = '''a$'''
+input = '''aa'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic19"
+regex = '''a\$'''
+input = '''a$'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic20"
+regex = '''^$'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic21"
+regex = '''$^'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic22"
+regex = '''a($)'''
+input = '''aa'''
+captures = [[[1, 2], [2, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic23"
+regex = '''a*(^a)'''
+input = '''aa'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic24"
+regex = '''(..)*(...)*'''
+input = '''a'''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic25"
+regex = '''(..)*(...)*'''
+input = '''abcd'''
+captures = [[[0, 4], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic26"
+regex = '''(ab|a)(bc|c)'''
+input = '''abc'''
+captures = [[[0, 3], [0, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic27"
+regex = '''(ab)c|abc'''
+input = '''abc'''
+captures = [[[0, 3], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic28"
+regex = '''a{0}b'''
+input = '''ab'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic29"
+regex = '''(a*)(b?)(b+)b{3}'''
+input = '''aaabbbbbbb'''
+captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic30"
+regex = '''(a*)(b{0,1})(b{1,})b{3}'''
+input = '''aaabbbbbbb'''
+captures = [[[0, 10], [0, 3], [3, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic32"
+regex = '''((a|a)|a)'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic33"
+regex = '''(a*)(a|aa)'''
+input = '''aaaa'''
+captures = [[[0, 4], [0, 3], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic34"
+regex = '''a*(a.|aa)'''
+input = '''aaaa'''
+captures = [[[0, 4], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic35"
+regex = '''a(b)|c(d)|a(e)f'''
+input = '''aef'''
+captures = [[[0, 3], [], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic36"
+regex = '''(a|b)?.*'''
+input = '''b'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic37"
+regex = '''(a|b)c|a(b|c)'''
+input = '''ac'''
+captures = [[[0, 2], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic38"
+regex = '''(a|b)c|a(b|c)'''
+input = '''ab'''
+captures = [[[0, 2], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic39"
+regex = '''(a|b)*c|(a|ab)*c'''
+input = '''abc'''
+captures = [[[0, 3], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic40"
+regex = '''(a|b)*c|(a|ab)*c'''
+input = '''xc'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic41"
+regex = '''(.a|.b).*|.*(.a|.b)'''
+input = '''xa'''
+captures = [[[0, 2], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic42"
+regex = '''a?(ab|ba)ab'''
+input = '''abab'''
+captures = [[[0, 4], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic43"
+regex = '''a?(ac{0}b|ba)ab'''
+input = '''abab'''
+captures = [[[0, 4], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic44"
+regex = '''ab|abab'''
+input = '''abbabab'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic45"
+regex = '''aba|bab|bba'''
+input = '''baaabbbaba'''
+captures = [[[5, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic46"
+regex = '''aba|bab'''
+input = '''baaabbbaba'''
+captures = [[[6, 9]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic47"
+regex = '''(aa|aaa)*|(a|aaaaa)'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic48"
+regex = '''(a.|.a.)*|(a|.a...)'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic49"
+regex = '''ab|a'''
+input = '''xabc'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic50"
+regex = '''ab|a'''
+input = '''xxabc'''
+captures = [[[2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic51"
+regex = '''(Ab|cD)*'''
+input = '''aBcD'''
+captures = [[[0, 4], [2, 4]]]
+match_limit = 1
+unescape = true
+case_insensitive = true
+
+[[tests]]
+name = "basic52"
+regex = '''[^-]'''
+input = '''--a'''
+captures = [[[2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic53"
+regex = '''[a-]*'''
+input = '''--a'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic54"
+regex = '''[a-m-]*'''
+input = '''--amoma--'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic55"
+regex = ''':::1:::0:|:::1:1:0:'''
+input = ''':::0:::1:::1:::0:'''
+captures = [[[8, 17]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic56"
+regex = ''':::1:::0:|:::1:1:1:'''
+input = ''':::0:::1:::1:::0:'''
+captures = [[[8, 17]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic57"
+regex = '''[[:upper:]]'''
+input = '''A'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic58"
+regex = '''[[:lower:]]+'''
+input = '''`az{'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic59"
+regex = '''[[:upper:]]+'''
+input = '''@AZ['''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic65"
+regex = '''\n'''
+input = '''\n'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic66"
+regex = '''\n'''
+input = '''\n'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic67"
+regex = '''[^a]'''
+input = '''\n'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic68"
+regex = '''\na'''
+input = '''\na'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic69"
+regex = '''(a)(b)(c)'''
+input = '''abc'''
+captures = [[[0, 3], [0, 1], [1, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic70"
+regex = '''xxx'''
+input = '''xxx'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic71"
+regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''feb 6,'''
+captures = [[[0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic72"
+regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''2/7'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic73"
+regex = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)'''
+input = '''feb 1,Feb 6'''
+captures = [[[5, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic74"
+regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))'''
+input = '''x'''
+captures = [[[0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic75"
+regex = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*'''
+input = '''xx'''
+captures = [[[0, 2], [1, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic76"
+regex = '''a?(ab|ba)*'''
+input = '''ababababababababababababababababababababababababababababababababababababababababa'''
+captures = [[[0, 81], [79, 81]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic77"
+regex = '''abaa|abbaa|abbbaa|abbbbaa'''
+input = '''ababbabbbabbbabbbbabbbbaa'''
+captures = [[[18, 25]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic78"
+regex = '''abaa|abbaa|abbbaa|abbbbaa'''
+input = '''ababbabbbabbbabbbbabaa'''
+captures = [[[18, 22]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic79"
+regex = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc'''
+input = '''baaabbbabac'''
+captures = [[[7, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic80"
+regex = '''.*'''
+input = '''\x01\x7f'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic81"
+regex = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll'''
+input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa'''
+captures = [[[53, 57]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic83"
+regex = '''a*a*a*a*a*b'''
+input = '''aaaaaaaaab'''
+captures = [[[0, 10]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic84"
+regex = '''^'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic85"
+regex = '''$'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic86"
+regex = '''^$'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic87"
+regex = '''^a$'''
+input = '''a'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic88"
+regex = '''abc'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic89"
+regex = '''abc'''
+input = '''xabcy'''
+captures = [[[1, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic90"
+regex = '''abc'''
+input = '''ababc'''
+captures = [[[2, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic91"
+regex = '''ab*c'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic92"
+regex = '''ab*bc'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic93"
+regex = '''ab*bc'''
+input = '''abbc'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic94"
+regex = '''ab*bc'''
+input = '''abbbbc'''
+captures = [[[0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic95"
+regex = '''ab+bc'''
+input = '''abbc'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic96"
+regex = '''ab+bc'''
+input = '''abbbbc'''
+captures = [[[0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic97"
+regex = '''ab?bc'''
+input = '''abbc'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic98"
+regex = '''ab?bc'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic99"
+regex = '''ab?c'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic100"
+regex = '''^abc$'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic101"
+regex = '''^abc'''
+input = '''abcc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic102"
+regex = '''abc$'''
+input = '''aabc'''
+captures = [[[1, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic103"
+regex = '''^'''
+input = '''abc'''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic104"
+regex = '''$'''
+input = '''abc'''
+captures = [[[3, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic105"
+regex = '''a.c'''
+input = '''abc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic106"
+regex = '''a.c'''
+input = '''axc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic107"
+regex = '''a.*c'''
+input = '''axyzc'''
+captures = [[[0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic108"
+regex = '''a[bc]d'''
+input = '''abd'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic109"
+regex = '''a[b-d]e'''
+input = '''ace'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic110"
+regex = '''a[b-d]'''
+input = '''aac'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic111"
+regex = '''a[-b]'''
+input = '''a-'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic112"
+regex = '''a[b-]'''
+input = '''a-'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic113"
+regex = '''a]'''
+input = '''a]'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic114"
+regex = '''a[]]b'''
+input = '''a]b'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic115"
+regex = '''a[^bc]d'''
+input = '''aed'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic116"
+regex = '''a[^-b]c'''
+input = '''adc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic117"
+regex = '''a[^]b]c'''
+input = '''adc'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic118"
+regex = '''ab|cd'''
+input = '''abc'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic119"
+regex = '''ab|cd'''
+input = '''abcd'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic120"
+regex = '''a\(b'''
+input = '''a(b'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic121"
+regex = '''a\(*b'''
+input = '''ab'''
+captures = [[[0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic122"
+regex = '''a\(*b'''
+input = '''a((b'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic123"
+regex = '''((a))'''
+input = '''abc'''
+captures = [[[0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic124"
+regex = '''(a)b(c)'''
+input = '''abc'''
+captures = [[[0, 3], [0, 1], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic125"
+regex = '''a+b+c'''
+input = '''aabbabc'''
+captures = [[[4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic126"
+regex = '''a*'''
+input = '''aaa'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic128"
+regex = '''(a*)*'''
+input = '''-'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic129"
+regex = '''(a*)+'''
+input = '''-'''
+captures = [[[0, 0], [0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic131"
+regex = '''(a*|b)*'''
+input = '''-'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic132"
+regex = '''(a+|b)*'''
+input = '''ab'''
+captures = [[[0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic133"
+regex = '''(a+|b)+'''
+input = '''ab'''
+captures = [[[0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic134"
+regex = '''(a+|b)?'''
+input = '''ab'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic135"
+regex = '''[^ab]*'''
+input = '''cde'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic137"
+regex = '''(^)*'''
+input = '''-'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic138"
+regex = '''a*'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic139"
+regex = '''([abc])*d'''
+input = '''abbbcd'''
+captures = [[[0, 6], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic140"
+regex = '''([abc])*bcd'''
+input = '''abcd'''
+captures = [[[0, 4], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic141"
+regex = '''a|b|c|d|e'''
+input = '''e'''
+captures = [[[0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic142"
+regex = '''(a|b|c|d|e)f'''
+input = '''ef'''
+captures = [[[0, 2], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic144"
+regex = '''((a*|b))*'''
+input = '''-'''
+captures = [[[0, 0], [], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic145"
+regex = '''abcd*efg'''
+input = '''abcdefg'''
+captures = [[[0, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic146"
+regex = '''ab*'''
+input = '''xabyabbbz'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic147"
+regex = '''ab*'''
+input = '''xayabbbz'''
+captures = [[[1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic148"
+regex = '''(ab|cd)e'''
+input = '''abcde'''
+captures = [[[2, 5], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic149"
+regex = '''[abhgefdc]ij'''
+input = '''hij'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic150"
+regex = '''(a|b)c*d'''
+input = '''abcd'''
+captures = [[[1, 4], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic151"
+regex = '''(ab|ab*)bc'''
+input = '''abc'''
+captures = [[[0, 3], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic152"
+regex = '''a([bc]*)c*'''
+input = '''abc'''
+captures = [[[0, 3], [1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic153"
+regex = '''a([bc]*)(c*d)'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 3], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic154"
+regex = '''a([bc]+)(c*d)'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 3], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic155"
+regex = '''a([bc]*)(c+d)'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 2], [2, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic156"
+regex = '''a[bcd]*dcdcde'''
+input = '''adcdcde'''
+captures = [[[0, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic157"
+regex = '''(ab|a)b*c'''
+input = '''abc'''
+captures = [[[0, 3], [0, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic158"
+regex = '''((a)(b)c)(d)'''
+input = '''abcd'''
+captures = [[[0, 4], [0, 3], [0, 1], [1, 2], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic159"
+regex = '''[A-Za-z_][A-Za-z0-9_]*'''
+input = '''alpha'''
+captures = [[[0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic160"
+regex = '''^a(bc+|b[eh])g|.h$'''
+input = '''abh'''
+captures = [[[1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic161"
+regex = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''effgz'''
+captures = [[[0, 5], [0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic162"
+regex = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''ij'''
+captures = [[[0, 2], [0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic163"
+regex = '''(bc+d$|ef*g.|h?i(j|k))'''
+input = '''reffgz'''
+captures = [[[1, 6], [1, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic164"
+regex = '''(((((((((a)))))))))'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic165"
+regex = '''multiple words'''
+input = '''multiple words yeah'''
+captures = [[[0, 14]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic166"
+regex = '''(.*)c(.*)'''
+input = '''abcde'''
+captures = [[[0, 5], [0, 2], [3, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic167"
+regex = '''abcd'''
+input = '''abcd'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic168"
+regex = '''a(bc)d'''
+input = '''abcd'''
+captures = [[[0, 4], [1, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic169"
+regex = '''a[\x01-\x03]?c'''
+input = '''a\x02c'''
+captures = [[[0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic170"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Qaddafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic171"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mo'ammar Gadhafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic172"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Kaddafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic173"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Qadhafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic174"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Gadafi'''
+captures = [[[0, 14], [], [10, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic175"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mu'ammar Qadafi'''
+captures = [[[0, 15], [], [11, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic176"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moamar Gaddafi'''
+captures = [[[0, 14], [], [9, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic177"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Mu'ammar Qadhdhafi'''
+captures = [[[0, 18], [], [13, 15]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic178"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Khaddafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic179"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghaddafy'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic180"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghadafi'''
+captures = [[[0, 15], [], [11, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic181"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Ghaddafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic182"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muamar Kaddafi'''
+captures = [[[0, 14], [], [9, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic183"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Quathafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic184"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Muammar Gheddafi'''
+captures = [[[0, 16], [], [11, 13]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic185"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moammar Khadafy'''
+captures = [[[0, 15], [], [11, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic186"
+regex = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]'''
+input = '''Moammar Qudhafi'''
+captures = [[[0, 15], [], [10, 12]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic187"
+regex = '''a+(b|c)*d+'''
+input = '''aabcdd'''
+captures = [[[0, 6], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic188"
+regex = '''^.+$'''
+input = '''vivi'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic189"
+regex = '''^(.+)$'''
+input = '''vivi'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic190"
+regex = '''^([^!.]+).att.com!(.+)$'''
+input = '''gryphon.att.com!eby'''
+captures = [[[0, 19], [0, 7], [16, 19]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic191"
+regex = '''^([^!]+!)?([^!]+)$'''
+input = '''bas'''
+captures = [[[0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic192"
+regex = '''^([^!]+!)?([^!]+)$'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic193"
+regex = '''^([^!]+!)?([^!]+)$'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic194"
+regex = '''^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic195"
+regex = '''((foo)|(bar))!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic196"
+regex = '''((foo)|(bar))!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7], [], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic197"
+regex = '''((foo)|(bar))!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic198"
+regex = '''((foo)|bar)!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic199"
+regex = '''((foo)|bar)!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic200"
+regex = '''((foo)|bar)!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic201"
+regex = '''(foo|(bar))!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic202"
+regex = '''(foo|(bar))!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic203"
+regex = '''(foo|(bar))!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic204"
+regex = '''(foo|bar)!bas'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic205"
+regex = '''(foo|bar)!bas'''
+input = '''foo!bar!bas'''
+captures = [[[4, 11], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic206"
+regex = '''(foo|bar)!bas'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic207"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic208"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''bas'''
+captures = [[[0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic209"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic210"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [], [], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic211"
+regex = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic212"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''bas'''
+captures = [[[0, 3], [0, 3], [], [0, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic213"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''bar!bas'''
+captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic214"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bar!bas'''
+captures = [[[0, 11], [0, 11], [], [], [4, 8], [8, 11]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic215"
+regex = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$'''
+input = '''foo!bas'''
+captures = [[[0, 7], [0, 7], [0, 4], [4, 7]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic216"
+regex = '''.*(/XXX).*'''
+input = '''/XXX'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic217"
+regex = '''.*(\\XXX).*'''
+input = '''\\XXX'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic218"
+regex = '''\\XXX'''
+input = '''\\XXX'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic219"
+regex = '''.*(/000).*'''
+input = '''/000'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic220"
+regex = '''.*(\\000).*'''
+input = '''\\000'''
+captures = [[[0, 4], [0, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "basic221"
+regex = '''\\000'''
+input = '''\\000'''
+captures = [[[0, 4]]]
+match_limit = 1
+unescape = true
+
diff --git a/tests/data/fowler/dat/README b/tests/data/fowler/dat/README

new file mode 100644 (file)

index 0000000..e700725
--- /dev/null
+++ b/tests/data/fowler/dat/README
@@ -0,0 +1,24 @@
+Test data was taken from the Go distribution, which was in turn taken from the
+testregex test suite:
+
+  http://www2.research.att.com/~astopen/testregex/testregex.html
+
+Unfortunately, the above link is now dead, but the test data lives on.
+
+The LICENSE in this directory corresponds to the LICENSE that the data was
+originally released under.
+
+The tests themselves were modified for RE2/Go. A couple were modified further
+by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
+(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
+have been a bad idea, but I think being consistent with an established Regex
+library is worth something.
+
+After some number of years, these tests were transformed into a TOML format
+using the fowler-to-toml script in the 'scripts' directory. To re-generate the
+TOML files, run the following from the root of this repository:
+
+  ./scripts/fowler-to-toml tests/data/fowler tests/data/fowler/dat/*.dat
+
+which brings them into a sensible structured format in which other tests can
+be written.
diff --git a/tests/data/fowler/dat/basic.dat b/tests/data/fowler/dat/basic.dat

new file mode 100644 (file)

index 0000000..e55efae
--- /dev/null
+++ b/tests/data/fowler/dat/basic.dat
@@ -0,0 +1,221 @@
+NOTE   all standard compliant implementations should pass these : 2002-05-31
+
+BE     abracadabra$    abracadabracadabra      (7,18)
+BE     a...b           abababbb                (2,7)
+BE     XXXXXX          ..XXXXXX                (2,8)
+E      \)              ()      (1,2)
+BE     a]              a]a     (0,2)
+B      }               }       (0,1)
+E      \}              }       (0,1)
+BE     \]              ]       (0,1)
+B      ]               ]       (0,1)
+E      ]               ]       (0,1)
+B      {               {       (0,1)
+B      }               }       (0,1)
+BE     ^a              ax      (0,1)
+BE     \^a             a^a     (1,3)
+BE     a\^             a^      (0,2)
+BE     a$              aa      (1,2)
+BE     a\$             a$      (0,2)
+BE     ^$              NULL    (0,0)
+E      $^              NULL    (0,0)
+E      a($)            aa      (1,2)(2,2)
+E      a*(^a)          aa      (0,1)(0,1)
+E      (..)*(...)*             a       (0,0)
+E      (..)*(...)*             abcd    (0,4)(2,4)
+E      (ab|a)(bc|c)            abc     (0,3)(0,2)(2,3)
+E      (ab)c|abc               abc     (0,3)(0,2)
+E      a{0}b           ab                      (1,2)
+E      (a*)(b?)(b+)b{3}        aaabbbbbbb      (0,10)(0,3)(3,4)(4,7)
+E      (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb      (0,10)(0,3)(3,4)(4,7)
+E      a{9876543210}   NULL    BADBR
+E      ((a|a)|a)                       a       (0,1)(0,1)(0,1)
+E      (a*)(a|aa)                      aaaa    (0,4)(0,3)(3,4)
+E      a*(a.|aa)                       aaaa    (0,4)(2,4)
+E      a(b)|c(d)|a(e)f                 aef     (0,3)(?,?)(?,?)(1,2)
+E      (a|b)?.*                        b       (0,1)(0,1)
+E      (a|b)c|a(b|c)                   ac      (0,2)(0,1)
+E      (a|b)c|a(b|c)                   ab      (0,2)(?,?)(1,2)
+E      (a|b)*c|(a|ab)*c                abc     (0,3)(1,2)
+E      (a|b)*c|(a|ab)*c                xc      (1,2)
+E      (.a|.b).*|.*(.a|.b)             xa      (0,2)(0,2)
+E      a?(ab|ba)ab                     abab    (0,4)(0,2)
+E      a?(ac{0}b|ba)ab                 abab    (0,4)(0,2)
+E      ab|abab                         abbabab (0,2)
+E      aba|bab|bba                     baaabbbaba      (5,8)
+E      aba|bab                         baaabbbaba      (6,9)
+E      (aa|aaa)*|(a|aaaaa)             aa      (0,2)(0,2)
+E      (a.|.a.)*|(a|.a...)             aa      (0,2)(0,2)
+E      ab|a                            xabc    (1,3)
+E      ab|a                            xxabc   (2,4)
+Ei     (Ab|cD)*                        aBcD    (0,4)(2,4)
+BE     [^-]                    --a             (2,3)
+BE     [a-]*                   --a             (0,3)
+BE     [a-m-]*                 --amoma--       (0,4)
+E      :::1:::0:|:::1:1:0:     :::0:::1:::1:::0:       (8,17)
+E      :::1:::0:|:::1:1:1:     :::0:::1:::1:::0:       (8,17)
+{E     [[:upper:]]             A               (0,1)   [[<element>]] not supported
+E      [[:lower:]]+            `az{            (1,3)
+E      [[:upper:]]+            @AZ[            (1,3)
+# No collation in Go
+#BE    [[-]]                   [[-]]           (2,4)
+#BE    [[.NIL.]]       NULL    ECOLLATE
+#BE    [[=aleph=]]     NULL    ECOLLATE
+}
+BE$    \n              \n      (0,1)
+BEn$   \n              \n      (0,1)
+BE$    [^a]            \n      (0,1)
+BE$    \na             \na     (0,2)
+E      (a)(b)(c)       abc     (0,3)(0,1)(1,2)(2,3)
+BE     xxx             xxx     (0,3)
+E1     (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)      feb 6,  (0,6)
+E1     (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)      2/7     (0,3)
+E1     (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)      feb 1,Feb 6     (5,11)
+E3     ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))   x       (0,1)(0,1)(0,1)
+E3     ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*  xx      (0,2)(1,2)(1,2)
+E      a?(ab|ba)*      ababababababababababababababababababababababababababababababababababababababababa       (0,81)(79,81)
+E      abaa|abbaa|abbbaa|abbbbaa       ababbabbbabbbabbbbabbbbaa       (18,25)
+E      abaa|abbaa|abbbaa|abbbbaa       ababbabbbabbbabbbbabaa  (18,22)
+E      aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac     (7,11)
+BE$    .*                      \x01\x7f        (0,2)
+E      aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll            XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa       (53,57)
+L      aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll         XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa       NOMATCH
+E      a*a*a*a*a*b             aaaaaaaaab      (0,10)
+BE     ^                       NULL            (0,0)
+BE     $                       NULL            (0,0)
+BE     ^$                      NULL            (0,0)
+BE     ^a$                     a               (0,1)
+BE     abc                     abc             (0,3)
+BE     abc                     xabcy           (1,4)
+BE     abc                     ababc           (2,5)
+BE     ab*c                    abc             (0,3)
+BE     ab*bc                   abc             (0,3)
+BE     ab*bc                   abbc            (0,4)
+BE     ab*bc                   abbbbc          (0,6)
+E      ab+bc                   abbc            (0,4)
+E      ab+bc                   abbbbc          (0,6)
+E      ab?bc                   abbc            (0,4)
+E      ab?bc                   abc             (0,3)
+E      ab?c                    abc             (0,3)
+BE     ^abc$                   abc             (0,3)
+BE     ^abc                    abcc            (0,3)
+BE     abc$                    aabc            (1,4)
+BE     ^                       abc             (0,0)
+BE     $                       abc             (3,3)
+BE     a.c                     abc             (0,3)
+BE     a.c                     axc             (0,3)
+BE     a.*c                    axyzc           (0,5)
+BE     a[bc]d                  abd             (0,3)
+BE     a[b-d]e                 ace             (0,3)
+BE     a[b-d]                  aac             (1,3)
+BE     a[-b]                   a-              (0,2)
+BE     a[b-]                   a-              (0,2)
+BE     a]                      a]              (0,2)
+BE     a[]]b                   a]b             (0,3)
+BE     a[^bc]d                 aed             (0,3)
+BE     a[^-b]c                 adc             (0,3)
+BE     a[^]b]c                 adc             (0,3)
+E      ab|cd                   abc             (0,2)
+E      ab|cd                   abcd            (0,2)
+E      a\(b                    a(b             (0,3)
+E      a\(*b                   ab              (0,2)
+E      a\(*b                   a((b            (0,4)
+E      ((a))                   abc             (0,1)(0,1)(0,1)
+E      (a)b(c)                 abc             (0,3)(0,1)(2,3)
+E      a+b+c                   aabbabc         (4,7)
+E      a*                      aaa             (0,3)
+#E     (a*)*                   -               (0,0)(0,0)
+E      (a*)*                   -               (0,0)(?,?)      RE2/Go
+E      (a*)+                   -               (0,0)(0,0)
+#E     (a*|b)*                 -               (0,0)(0,0)
+E      (a*|b)*                 -               (0,0)(?,?)      RE2/Go
+E      (a+|b)*                 ab              (0,2)(1,2)
+E      (a+|b)+                 ab              (0,2)(1,2)
+E      (a+|b)?                 ab              (0,1)(0,1)
+BE     [^ab]*                  cde             (0,3)
+#E     (^)*                    -               (0,0)(0,0)
+E      (^)*                    -               (0,0)(?,?)      RE2/Go
+BE     a*                      NULL            (0,0)
+E      ([abc])*d               abbbcd          (0,6)(4,5)
+E      ([abc])*bcd             abcd            (0,4)(0,1)
+E      a|b|c|d|e               e               (0,1)
+E      (a|b|c|d|e)f            ef              (0,2)(0,1)
+#E     ((a*|b))*               -               (0,0)(0,0)(0,0)
+E      ((a*|b))*               -               (0,0)(?,?)(?,?) RE2/Go
+BE     abcd*efg                abcdefg         (0,7)
+BE     ab*                     xabyabbbz       (1,3)
+BE     ab*                     xayabbbz        (1,2)
+E      (ab|cd)e                abcde           (2,5)(2,4)
+BE     [abhgefdc]ij            hij             (0,3)
+E      (a|b)c*d                abcd            (1,4)(1,2)
+E      (ab|ab*)bc              abc             (0,3)(0,1)
+E      a([bc]*)c*              abc             (0,3)(1,3)
+E      a([bc]*)(c*d)           abcd            (0,4)(1,3)(3,4)
+E      a([bc]+)(c*d)           abcd            (0,4)(1,3)(3,4)
+E      a([bc]*)(c+d)           abcd            (0,4)(1,2)(2,4)
+E      a[bcd]*dcdcde           adcdcde         (0,7)
+E      (ab|a)b*c               abc             (0,3)(0,2)
+E      ((a)(b)c)(d)            abcd            (0,4)(0,3)(0,1)(1,2)(3,4)
+BE     [A-Za-z_][A-Za-z0-9_]*  alpha           (0,5)
+E      ^a(bc+|b[eh])g|.h$      abh             (1,3)
+E      (bc+d$|ef*g.|h?i(j|k))  effgz           (0,5)(0,5)
+E      (bc+d$|ef*g.|h?i(j|k))  ij              (0,2)(0,2)(1,2)
+E      (bc+d$|ef*g.|h?i(j|k))  reffgz          (1,6)(1,6)
+E      (((((((((a)))))))))     a               (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
+BE     multiple words          multiple words yeah     (0,14)
+E      (.*)c(.*)               abcde           (0,5)(0,2)(3,5)
+BE     abcd                    abcd            (0,4)
+E      a(bc)d                  abcd            (0,4)(1,3)
+E      a[\ 1-\ 3]?c                a\ 2c             (0,3)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Qaddafi (0,15)(?,?)(10,12)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Mo'ammar Gadhafi        (0,16)(?,?)(11,13)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Kaddafi (0,15)(?,?)(10,12)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Qadhafi (0,15)(?,?)(10,12)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Gadafi  (0,14)(?,?)(10,11)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Mu'ammar Qadafi (0,15)(?,?)(11,12)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Moamar Gaddafi  (0,14)(?,?)(9,11)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Mu'ammar Qadhdhafi      (0,18)(?,?)(13,15)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Khaddafi        (0,16)(?,?)(11,13)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Ghaddafy        (0,16)(?,?)(11,13)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Ghadafi (0,15)(?,?)(11,12)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Ghaddafi        (0,16)(?,?)(11,13)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muamar Kaddafi  (0,14)(?,?)(9,11)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Quathafi        (0,16)(?,?)(11,13)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Muammar Gheddafi        (0,16)(?,?)(11,13)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Moammar Khadafy (0,15)(?,?)(11,12)
+E      M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]       Moammar Qudhafi (0,15)(?,?)(10,12)
+E      a+(b|c)*d+              aabcdd                  (0,6)(3,4)
+E      ^.+$                    vivi                    (0,4)
+E      ^(.+)$                  vivi                    (0,4)(0,4)
+E      ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby     (0,19)(0,7)(16,19)
+E      ^([^!]+!)?([^!]+)$      bas                     (0,3)(?,?)(0,3)
+E      ^([^!]+!)?([^!]+)$      bar!bas                 (0,7)(0,4)(4,7)
+E      ^([^!]+!)?([^!]+)$      foo!bas                 (0,7)(0,4)(4,7)
+E      ^.+!([^!]+!)([^!]+)$    foo!bar!bas             (0,11)(4,8)(8,11)
+E      ((foo)|(bar))!bas       bar!bas                 (0,7)(0,3)(?,?)(0,3)
+E      ((foo)|(bar))!bas       foo!bar!bas             (4,11)(4,7)(?,?)(4,7)
+E      ((foo)|(bar))!bas       foo!bas                 (0,7)(0,3)(0,3)
+E      ((foo)|bar)!bas         bar!bas                 (0,7)(0,3)
+E      ((foo)|bar)!bas         foo!bar!bas             (4,11)(4,7)
+E      ((foo)|bar)!bas         foo!bas                 (0,7)(0,3)(0,3)
+E      (foo|(bar))!bas         bar!bas                 (0,7)(0,3)(0,3)
+E      (foo|(bar))!bas         foo!bar!bas             (4,11)(4,7)(4,7)
+E      (foo|(bar))!bas         foo!bas                 (0,7)(0,3)
+E      (foo|bar)!bas           bar!bas                 (0,7)(0,3)
+E      (foo|bar)!bas           foo!bar!bas             (4,11)(4,7)
+E      (foo|bar)!bas           foo!bas                 (0,7)(0,3)
+E      ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas     (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E      ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas             (0,3)(?,?)(0,3)
+E      ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas         (0,7)(0,4)(4,7)
+E      ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas     (0,11)(?,?)(?,?)(4,8)(8,11)
+E      ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas         (0,7)(0,4)(4,7)
+E      ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas             (0,3)(0,3)(?,?)(0,3)
+E      ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas         (0,7)(0,7)(0,4)(4,7)
+E      ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas     (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
+E      ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas         (0,7)(0,7)(0,4)(4,7)
+E      .*(/XXX).*                      /XXX                    (0,4)(0,4)
+E      .*(\\XXX).*                     \XXX                    (0,4)(0,4)
+E      \\XXX                           \XXX                    (0,4)
+E      .*(/000).*                      /000                    (0,4)(0,4)
+E      .*(\\000).*                     \000                    (0,4)(0,4)
+E      \\000                           \000                    (0,4)
diff --git a/tests/data/fowler/dat/nullsubexpr.dat b/tests/data/fowler/dat/nullsubexpr.dat

new file mode 100644 (file)

index 0000000..2e18fbb
--- /dev/null
+++ b/tests/data/fowler/dat/nullsubexpr.dat
@@ -0,0 +1,79 @@
+NOTE   null subexpression matches : 2002-06-06
+
+E      (a*)*           a               (0,1)(0,1)
+#E     SAME            x               (0,0)(0,0)
+E      SAME            x               (0,0)(?,?)      RE2/Go
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaax         (0,6)(0,6)
+E      (a*)+           a               (0,1)(0,1)
+E      SAME            x               (0,0)(0,0)
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaax         (0,6)(0,6)
+E      (a+)*           a               (0,1)(0,1)
+E      SAME            x               (0,0)
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaax         (0,6)(0,6)
+E      (a+)+           a               (0,1)(0,1)
+E      SAME            x               NOMATCH
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaax         (0,6)(0,6)
+
+E      ([a]*)*         a               (0,1)(0,1)
+#E     SAME            x               (0,0)(0,0)
+E      SAME            x               (0,0)(?,?)      RE2/Go
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaax         (0,6)(0,6)
+E      ([a]*)+         a               (0,1)(0,1)
+E      SAME            x               (0,0)(0,0)
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaax         (0,6)(0,6)
+E      ([^b]*)*        a               (0,1)(0,1)
+#E     SAME            b               (0,0)(0,0)
+E      SAME            b               (0,0)(?,?)      RE2/Go
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            aaaaaab         (0,6)(0,6)
+E      ([ab]*)*        a               (0,1)(0,1)
+E      SAME            aaaaaa          (0,6)(0,6)
+E      SAME            ababab          (0,6)(0,6)
+E      SAME            bababa          (0,6)(0,6)
+E      SAME            b               (0,1)(0,1)
+E      SAME            bbbbbb          (0,6)(0,6)
+E      SAME            aaaabcde        (0,5)(0,5)
+E      ([^a]*)*        b               (0,1)(0,1)
+E      SAME            bbbbbb          (0,6)(0,6)
+#E     SAME            aaaaaa          (0,0)(0,0)
+E      SAME            aaaaaa          (0,0)(?,?)      RE2/Go
+E      ([^ab]*)*       ccccxx          (0,6)(0,6)
+#E     SAME            ababab          (0,0)(0,0)
+E      SAME            ababab          (0,0)(?,?)      RE2/Go
+
+E      ((z)+|a)*       zabcde          (0,2)(1,2)
+
+#{E    a+?             aaaaaa          (0,1)   no *? +? mimimal match ops
+#E     (a)             aaa             (0,1)(0,1)
+#E     (a*?)           aaa             (0,0)(0,0)
+#E     (a)*?           aaa             (0,0)
+#E     (a*?)*?         aaa             (0,0)
+#}
+
+B      \(a*\)*\(x\)            x       (0,1)(0,0)(0,1)
+B      \(a*\)*\(x\)            ax      (0,2)(0,1)(1,2)
+B      \(a*\)*\(x\)            axa     (0,2)(0,1)(1,2)
+B      \(a*\)*\(x\)\(\1\)      x       (0,1)(0,0)(0,1)(1,1)
+B      \(a*\)*\(x\)\(\1\)      ax      (0,2)(1,1)(1,2)(2,2)
+B      \(a*\)*\(x\)\(\1\)      axa     (0,3)(0,1)(1,2)(2,3)
+B      \(a*\)*\(x\)\(\1\)\(x\) axax    (0,4)(0,1)(1,2)(2,3)(3,4)
+B      \(a*\)*\(x\)\(\1\)\(x\) axxa    (0,3)(1,1)(1,2)(2,2)(2,3)
+
+#E     (a*)*(x)                x       (0,1)(0,0)(0,1)
+E      (a*)*(x)                x       (0,1)(?,?)(0,1) RE2/Go
+E      (a*)*(x)                ax      (0,2)(0,1)(1,2)
+E      (a*)*(x)                axa     (0,2)(0,1)(1,2)
+
+E      (a*)+(x)                x       (0,1)(0,0)(0,1)
+E      (a*)+(x)                ax      (0,2)(0,1)(1,2)
+E      (a*)+(x)                axa     (0,2)(0,1)(1,2)
+
+E      (a*){2}(x)              x       (0,1)(0,0)(0,1)
+E      (a*){2}(x)              ax      (0,2)(1,1)(1,2)
+E      (a*){2}(x)              axa     (0,2)(1,1)(1,2)
diff --git a/tests/data/fowler/dat/repetition-expensive.dat b/tests/data/fowler/dat/repetition-expensive.dat

new file mode 100644 (file)

index 0000000..c915802
--- /dev/null
+++ b/tests/data/fowler/dat/repetition-expensive.dat
@@ -0,0 +1,85 @@
+NOTE   implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+#      NOMATCH
+#      (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+#      (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+NOTE   additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
+
+:HA#100:E      X(.?){0,}Y      X1234567Y       (0,9)(7,8)
+:HA#101:E      X(.?){1,}Y      X1234567Y       (0,9)(7,8)
+:HA#102:E      X(.?){2,}Y      X1234567Y       (0,9)(7,8)
+:HA#103:E      X(.?){3,}Y      X1234567Y       (0,9)(7,8)
+:HA#104:E      X(.?){4,}Y      X1234567Y       (0,9)(7,8)
+:HA#105:E      X(.?){5,}Y      X1234567Y       (0,9)(7,8)
+:HA#106:E      X(.?){6,}Y      X1234567Y       (0,9)(7,8)
+:HA#107:E      X(.?){7,}Y      X1234567Y       (0,9)(7,8)
+:HA#108:E      X(.?){8,}Y      X1234567Y       (0,9)(8,8)
+#:HA#110:E     X(.?){0,8}Y     X1234567Y       (0,9)(7,8)
+:HA#110:E      X(.?){0,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#111:E     X(.?){1,8}Y     X1234567Y       (0,9)(7,8)
+:HA#111:E      X(.?){1,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#112:E     X(.?){2,8}Y     X1234567Y       (0,9)(7,8)
+:HA#112:E      X(.?){2,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#113:E     X(.?){3,8}Y     X1234567Y       (0,9)(7,8)
+:HA#113:E      X(.?){3,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#114:E     X(.?){4,8}Y     X1234567Y       (0,9)(7,8)
+:HA#114:E      X(.?){4,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#115:E     X(.?){5,8}Y     X1234567Y       (0,9)(7,8)
+:HA#115:E      X(.?){5,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#116:E     X(.?){6,8}Y     X1234567Y       (0,9)(7,8)
+:HA#116:E      X(.?){6,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+#:HA#117:E     X(.?){7,8}Y     X1234567Y       (0,9)(7,8)
+:HA#117:E      X(.?){7,8}Y     X1234567Y       (0,9)(8,8)      RE2/Go
+:HA#118:E      X(.?){8,8}Y     X1234567Y       (0,9)(8,8)
+
+# These test a fixed bug in my regex-tdfa that did not keep the expanded
+# form properly grouped, so right association did the wrong thing with
+# these ambiguous patterns (crafted just to test my code when I became
+# suspicious of my implementation).  The first subexpression should use
+# "ab" then "a" then "bcd".
+
+# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
+# results like (0,6)(4,5)(6,6).
+
+:HA#260:E      (a|ab|c|bcd){0,}(d*)    ababcd  (0,1)(0,1)(1,1)
+:HA#261:E      (a|ab|c|bcd){1,}(d*)    ababcd  (0,1)(0,1)(1,1)
+:HA#262:E      (a|ab|c|bcd){2,}(d*)    ababcd  (0,6)(3,6)(6,6)
+:HA#263:E      (a|ab|c|bcd){3,}(d*)    ababcd  (0,6)(3,6)(6,6)
+:HA#264:E      (a|ab|c|bcd){4,}(d*)    ababcd  NOMATCH
+:HA#265:E      (a|ab|c|bcd){0,10}(d*)  ababcd  (0,1)(0,1)(1,1)
+:HA#266:E      (a|ab|c|bcd){1,10}(d*)  ababcd  (0,1)(0,1)(1,1)
+:HA#267:E      (a|ab|c|bcd){2,10}(d*)  ababcd  (0,6)(3,6)(6,6)
+:HA#268:E      (a|ab|c|bcd){3,10}(d*)  ababcd  (0,6)(3,6)(6,6)
+:HA#269:E      (a|ab|c|bcd){4,10}(d*)  ababcd  NOMATCH
+:HA#270:E      (a|ab|c|bcd)*(d*)       ababcd  (0,1)(0,1)(1,1)
+:HA#271:E      (a|ab|c|bcd)+(d*)       ababcd  (0,1)(0,1)(1,1)
+
+# The above worked on Linux/GLIBC but the following often fail.
+# They also trip up OS X / FreeBSD / NetBSD:
+
+#:HA#280:E     (ab|a|c|bcd){0,}(d*)    ababcd  (0,6)(3,6)(6,6)
+:HA#280:E      (ab|a|c|bcd){0,}(d*)    ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#281:E     (ab|a|c|bcd){1,}(d*)    ababcd  (0,6)(3,6)(6,6)
+:HA#281:E      (ab|a|c|bcd){1,}(d*)    ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#282:E     (ab|a|c|bcd){2,}(d*)    ababcd  (0,6)(3,6)(6,6)
+:HA#282:E      (ab|a|c|bcd){2,}(d*)    ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#283:E     (ab|a|c|bcd){3,}(d*)    ababcd  (0,6)(3,6)(6,6)
+:HA#283:E      (ab|a|c|bcd){3,}(d*)    ababcd  (0,6)(4,5)(5,6) RE2/Go
+:HA#284:E      (ab|a|c|bcd){4,}(d*)    ababcd  NOMATCH
+#:HA#285:E     (ab|a|c|bcd){0,10}(d*)  ababcd  (0,6)(3,6)(6,6)
+:HA#285:E      (ab|a|c|bcd){0,10}(d*)  ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#286:E     (ab|a|c|bcd){1,10}(d*)  ababcd  (0,6)(3,6)(6,6)
+:HA#286:E      (ab|a|c|bcd){1,10}(d*)  ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#287:E     (ab|a|c|bcd){2,10}(d*)  ababcd  (0,6)(3,6)(6,6)
+:HA#287:E      (ab|a|c|bcd){2,10}(d*)  ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#288:E     (ab|a|c|bcd){3,10}(d*)  ababcd  (0,6)(3,6)(6,6)
+:HA#288:E      (ab|a|c|bcd){3,10}(d*)  ababcd  (0,6)(4,5)(5,6) RE2/Go
+:HA#289:E      (ab|a|c|bcd){4,10}(d*)  ababcd  NOMATCH
+#:HA#290:E     (ab|a|c|bcd)*(d*)       ababcd  (0,6)(3,6)(6,6)
+:HA#290:E      (ab|a|c|bcd)*(d*)       ababcd  (0,6)(4,5)(5,6) RE2/Go
+#:HA#291:E     (ab|a|c|bcd)+(d*)       ababcd  (0,6)(3,6)(6,6)
+:HA#291:E      (ab|a|c|bcd)+(d*)       ababcd  (0,6)(4,5)(5,6) RE2/Go
diff --git a/tests/data/fowler/dat/repetition.dat b/tests/data/fowler/dat/repetition.dat

new file mode 100644 (file)

index 0000000..2dac082
--- /dev/null
+++ b/tests/data/fowler/dat/repetition.dat
@@ -0,0 +1,83 @@
+NOTE   implicit vs. explicit repetitions : 2009-02-02
+
+# Glenn Fowler <gsf@research.att.com>
+# conforming matches (column 4) must match one of the following BREs
+#      NOMATCH
+#      (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
+#      (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
+# i.e., each 3-tuple has two identical elements and one (?,?)
+
+E      ((..)|(.))                              NULL            NOMATCH
+E      ((..)|(.))((..)|(.))                    NULL            NOMATCH
+E      ((..)|(.))((..)|(.))((..)|(.))          NULL            NOMATCH
+
+E      ((..)|(.)){1}                           NULL            NOMATCH
+E      ((..)|(.)){2}                           NULL            NOMATCH
+E      ((..)|(.)){3}                           NULL            NOMATCH
+
+E      ((..)|(.))*                             NULL            (0,0)
+
+E      ((..)|(.))                              a               (0,1)(0,1)(?,?)(0,1)
+E      ((..)|(.))((..)|(.))                    a               NOMATCH
+E      ((..)|(.))((..)|(.))((..)|(.))          a               NOMATCH
+
+E      ((..)|(.)){1}                           a               (0,1)(0,1)(?,?)(0,1)
+E      ((..)|(.)){2}                           a               NOMATCH
+E      ((..)|(.)){3}                           a               NOMATCH
+
+E      ((..)|(.))*                             a               (0,1)(0,1)(?,?)(0,1)
+
+E      ((..)|(.))                              aa              (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.))((..)|(.))                    aa              (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
+E      ((..)|(.))((..)|(.))((..)|(.))          aa              NOMATCH
+
+E      ((..)|(.)){1}                           aa              (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.)){2}                           aa              (0,2)(1,2)(?,?)(1,2)
+E      ((..)|(.)){3}                           aa              NOMATCH
+
+E      ((..)|(.))*                             aa              (0,2)(0,2)(0,2)(?,?)
+
+E      ((..)|(.))                              aaa             (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.))((..)|(.))                    aaa             (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
+E      ((..)|(.))((..)|(.))((..)|(.))          aaa             (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
+
+E      ((..)|(.)){1}                           aaa             (0,2)(0,2)(0,2)(?,?)
+#E     ((..)|(.)){2}                           aaa             (0,3)(2,3)(?,?)(2,3)
+E      ((..)|(.)){2}                           aaa             (0,3)(2,3)(0,2)(2,3)    RE2/Go
+E      ((..)|(.)){3}                           aaa             (0,3)(2,3)(?,?)(2,3)
+
+#E     ((..)|(.))*                             aaa             (0,3)(2,3)(?,?)(2,3)
+E      ((..)|(.))*                             aaa             (0,3)(2,3)(0,2)(2,3)    RE2/Go
+
+E      ((..)|(.))                              aaaa            (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.))((..)|(.))                    aaaa            (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E      ((..)|(.))((..)|(.))((..)|(.))          aaaa            (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
+
+E      ((..)|(.)){1}                           aaaa            (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.)){2}                           aaaa            (0,4)(2,4)(2,4)(?,?)
+#E     ((..)|(.)){3}                           aaaa            (0,4)(3,4)(?,?)(3,4)
+E      ((..)|(.)){3}                           aaaa            (0,4)(3,4)(0,2)(3,4)    RE2/Go
+
+E      ((..)|(.))*                             aaaa            (0,4)(2,4)(2,4)(?,?)
+
+E      ((..)|(.))                              aaaaa           (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.))((..)|(.))                    aaaaa           (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E      ((..)|(.))((..)|(.))((..)|(.))          aaaaa           (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
+
+E      ((..)|(.)){1}                           aaaaa           (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.)){2}                           aaaaa           (0,4)(2,4)(2,4)(?,?)
+#E     ((..)|(.)){3}                           aaaaa           (0,5)(4,5)(?,?)(4,5)
+E      ((..)|(.)){3}                           aaaaa           (0,5)(4,5)(2,4)(4,5)    RE2/Go
+
+#E     ((..)|(.))*                             aaaaa           (0,5)(4,5)(?,?)(4,5)
+E      ((..)|(.))*                             aaaaa           (0,5)(4,5)(2,4)(4,5)    RE2/Go
+
+E      ((..)|(.))                              aaaaaa          (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.))((..)|(.))                    aaaaaa          (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
+E      ((..)|(.))((..)|(.))((..)|(.))          aaaaaa          (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
+
+E      ((..)|(.)){1}                           aaaaaa          (0,2)(0,2)(0,2)(?,?)
+E      ((..)|(.)){2}                           aaaaaa          (0,4)(2,4)(2,4)(?,?)
+E      ((..)|(.)){3}                           aaaaaa          (0,6)(4,6)(4,6)(?,?)
+
+E      ((..)|(.))*                             aaaaaa          (0,6)(4,6)(4,6)(?,?)
diff --git a/tests/data/fowler/nullsubexpr.toml b/tests/data/fowler/nullsubexpr.toml

new file mode 100644 (file)

index 0000000..55d1d5b
--- /dev/null
+++ b/tests/data/fowler/nullsubexpr.toml
@@ -0,0 +1,405 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "nullsubexpr3"
+regex = '''(a*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr5"
+regex = '''(a*)*'''
+input = '''x'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr6"
+regex = '''(a*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr7"
+regex = '''(a*)*'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr8"
+regex = '''(a*)+'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr9"
+regex = '''(a*)+'''
+input = '''x'''
+captures = [[[0, 0], [0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr10"
+regex = '''(a*)+'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr11"
+regex = '''(a*)+'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr12"
+regex = '''(a+)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr13"
+regex = '''(a+)*'''
+input = '''x'''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr14"
+regex = '''(a+)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr15"
+regex = '''(a+)*'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr16"
+regex = '''(a+)+'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr17"
+regex = '''(a+)+'''
+input = '''x'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr18"
+regex = '''(a+)+'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr19"
+regex = '''(a+)+'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr21"
+regex = '''([a]*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr23"
+regex = '''([a]*)*'''
+input = '''x'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr24"
+regex = '''([a]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr25"
+regex = '''([a]*)*'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr26"
+regex = '''([a]*)+'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr27"
+regex = '''([a]*)+'''
+input = '''x'''
+captures = [[[0, 0], [0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr28"
+regex = '''([a]*)+'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr29"
+regex = '''([a]*)+'''
+input = '''aaaaaax'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr30"
+regex = '''([^b]*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr32"
+regex = '''([^b]*)*'''
+input = '''b'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr33"
+regex = '''([^b]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr34"
+regex = '''([^b]*)*'''
+input = '''aaaaaab'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr35"
+regex = '''([ab]*)*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr36"
+regex = '''([ab]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr37"
+regex = '''([ab]*)*'''
+input = '''ababab'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr38"
+regex = '''([ab]*)*'''
+input = '''bababa'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr39"
+regex = '''([ab]*)*'''
+input = '''b'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr40"
+regex = '''([ab]*)*'''
+input = '''bbbbbb'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr41"
+regex = '''([ab]*)*'''
+input = '''aaaabcde'''
+captures = [[[0, 5], [0, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr42"
+regex = '''([^a]*)*'''
+input = '''b'''
+captures = [[[0, 1], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr43"
+regex = '''([^a]*)*'''
+input = '''bbbbbb'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr45"
+regex = '''([^a]*)*'''
+input = '''aaaaaa'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr46"
+regex = '''([^ab]*)*'''
+input = '''ccccxx'''
+captures = [[[0, 6], [0, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr48"
+regex = '''([^ab]*)*'''
+input = '''ababab'''
+captures = [[[0, 0], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr50"
+regex = '''((z)+|a)*'''
+input = '''zabcde'''
+captures = [[[0, 2], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr69"
+regex = '''(a*)*(x)'''
+input = '''x'''
+captures = [[[0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr70"
+regex = '''(a*)*(x)'''
+input = '''ax'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr71"
+regex = '''(a*)*(x)'''
+input = '''axa'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr73"
+regex = '''(a*)+(x)'''
+input = '''x'''
+captures = [[[0, 1], [0, 0], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr74"
+regex = '''(a*)+(x)'''
+input = '''ax'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr75"
+regex = '''(a*)+(x)'''
+input = '''axa'''
+captures = [[[0, 2], [0, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr77"
+regex = '''(a*){2}(x)'''
+input = '''x'''
+captures = [[[0, 1], [0, 0], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr78"
+regex = '''(a*){2}(x)'''
+input = '''ax'''
+captures = [[[0, 2], [1, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "nullsubexpr79"
+regex = '''(a*){2}(x)'''
+input = '''axa'''
+captures = [[[0, 2], [1, 1], [1, 2]]]
+match_limit = 1
+unescape = true
+
diff --git a/tests/data/fowler/repetition-expensive.toml b/tests/data/fowler/repetition-expensive.toml

new file mode 100644 (file)

index 0000000..81a8964
--- /dev/null
+++ b/tests/data/fowler/repetition-expensive.toml
@@ -0,0 +1,341 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "repetition-expensive12"
+regex = '''X(.?){0,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive13"
+regex = '''X(.?){1,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive14"
+regex = '''X(.?){2,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive15"
+regex = '''X(.?){3,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive16"
+regex = '''X(.?){4,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive17"
+regex = '''X(.?){5,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive18"
+regex = '''X(.?){6,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive19"
+regex = '''X(.?){7,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive20"
+regex = '''X(.?){8,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive22"
+regex = '''X(.?){0,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive24"
+regex = '''X(.?){1,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive26"
+regex = '''X(.?){2,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive28"
+regex = '''X(.?){3,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive30"
+regex = '''X(.?){4,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive32"
+regex = '''X(.?){5,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive34"
+regex = '''X(.?){6,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive36"
+regex = '''X(.?){7,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive37"
+regex = '''X(.?){8,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive48"
+regex = '''(a|ab|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive49"
+regex = '''(a|ab|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive50"
+regex = '''(a|ab|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive51"
+regex = '''(a|ab|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive52"
+regex = '''(a|ab|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive53"
+regex = '''(a|ab|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive54"
+regex = '''(a|ab|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive55"
+regex = '''(a|ab|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive56"
+regex = '''(a|ab|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive57"
+regex = '''(a|ab|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive58"
+regex = '''(a|ab|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive59"
+regex = '''(a|ab|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive65"
+regex = '''(ab|a|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive67"
+regex = '''(ab|a|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive69"
+regex = '''(ab|a|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive71"
+regex = '''(ab|a|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive72"
+regex = '''(ab|a|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive74"
+regex = '''(ab|a|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive76"
+regex = '''(ab|a|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive78"
+regex = '''(ab|a|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive80"
+regex = '''(ab|a|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive81"
+regex = '''(ab|a|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive83"
+regex = '''(ab|a|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-expensive85"
+regex = '''(ab|a|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
diff --git a/tests/data/fowler/repetition-long.toml b/tests/data/fowler/repetition-long.toml

new file mode 100644 (file)

index 0000000..fa24c83
--- /dev/null
+++ b/tests/data/fowler/repetition-long.toml
@@ -0,0 +1,341 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "repetition-long12"
+regex = '''X(.?){0,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long13"
+regex = '''X(.?){1,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long14"
+regex = '''X(.?){2,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long15"
+regex = '''X(.?){3,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long16"
+regex = '''X(.?){4,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long17"
+regex = '''X(.?){5,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long18"
+regex = '''X(.?){6,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long19"
+regex = '''X(.?){7,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [7, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long20"
+regex = '''X(.?){8,}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long22"
+regex = '''X(.?){0,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long24"
+regex = '''X(.?){1,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long26"
+regex = '''X(.?){2,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long28"
+regex = '''X(.?){3,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long30"
+regex = '''X(.?){4,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long32"
+regex = '''X(.?){5,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long34"
+regex = '''X(.?){6,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long36"
+regex = '''X(.?){7,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long37"
+regex = '''X(.?){8,8}Y'''
+input = '''X1234567Y'''
+captures = [[[0, 9], [8, 8]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long48"
+regex = '''(a|ab|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long49"
+regex = '''(a|ab|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long50"
+regex = '''(a|ab|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long51"
+regex = '''(a|ab|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long52"
+regex = '''(a|ab|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long53"
+regex = '''(a|ab|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long54"
+regex = '''(a|ab|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long55"
+regex = '''(a|ab|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long56"
+regex = '''(a|ab|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [3, 6], [6, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long57"
+regex = '''(a|ab|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long58"
+regex = '''(a|ab|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long59"
+regex = '''(a|ab|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 1], [0, 1], [1, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long65"
+regex = '''(ab|a|c|bcd){0,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long67"
+regex = '''(ab|a|c|bcd){1,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long69"
+regex = '''(ab|a|c|bcd){2,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long71"
+regex = '''(ab|a|c|bcd){3,}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long72"
+regex = '''(ab|a|c|bcd){4,}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long74"
+regex = '''(ab|a|c|bcd){0,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long76"
+regex = '''(ab|a|c|bcd){1,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long78"
+regex = '''(ab|a|c|bcd){2,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long80"
+regex = '''(ab|a|c|bcd){3,10}(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long81"
+regex = '''(ab|a|c|bcd){4,10}(d*)'''
+input = '''ababcd'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long83"
+regex = '''(ab|a|c|bcd)*(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition-long85"
+regex = '''(ab|a|c|bcd)+(d*)'''
+input = '''ababcd'''
+captures = [[[0, 6], [4, 5], [5, 6]]]
+match_limit = 1
+unescape = true
+
diff --git a/tests/data/fowler/repetition.toml b/tests/data/fowler/repetition.toml

new file mode 100644 (file)

index 0000000..fc8da8d
--- /dev/null
+++ b/tests/data/fowler/repetition.toml
@@ -0,0 +1,397 @@
+# !!! DO NOT EDIT !!!
+# Automatically generated by scripts/fowler-to-toml.
+# Numbers in the test names correspond to the line number of the test from
+# the original dat file.
+
+[[tests]]
+name = "repetition10"
+regex = '''((..)|(.))'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition11"
+regex = '''((..)|(.))((..)|(.))'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition12"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition14"
+regex = '''((..)|(.)){1}'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition15"
+regex = '''((..)|(.)){2}'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition16"
+regex = '''((..)|(.)){3}'''
+input = ''''''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition18"
+regex = '''((..)|(.))*'''
+input = ''''''
+captures = [[[0, 0]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition20"
+regex = '''((..)|(.))'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition21"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition22"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition24"
+regex = '''((..)|(.)){1}'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition25"
+regex = '''((..)|(.)){2}'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition26"
+regex = '''((..)|(.)){3}'''
+input = '''a'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition28"
+regex = '''((..)|(.))*'''
+input = '''a'''
+captures = [[[0, 1], [0, 1], [], [0, 1]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition30"
+regex = '''((..)|(.))'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition31"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aa'''
+captures = [[[0, 2], [0, 1], [], [0, 1], [1, 2], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition32"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aa'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition34"
+regex = '''((..)|(.)){1}'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition35"
+regex = '''((..)|(.)){2}'''
+input = '''aa'''
+captures = [[[0, 2], [1, 2], [], [1, 2]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition36"
+regex = '''((..)|(.)){3}'''
+input = '''aa'''
+captures = []
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition38"
+regex = '''((..)|(.))*'''
+input = '''aa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition40"
+regex = '''((..)|(.))'''
+input = '''aaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition41"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaa'''
+captures = [[[0, 3], [0, 2], [0, 2], [], [2, 3], [], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition42"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaa'''
+captures = [[[0, 3], [0, 1], [], [0, 1], [1, 2], [], [1, 2], [2, 3], [], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition44"
+regex = '''((..)|(.)){1}'''
+input = '''aaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition46"
+regex = '''((..)|(.)){2}'''
+input = '''aaa'''
+captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition47"
+regex = '''((..)|(.)){3}'''
+input = '''aaa'''
+captures = [[[0, 3], [2, 3], [], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition50"
+regex = '''((..)|(.))*'''
+input = '''aaa'''
+captures = [[[0, 3], [2, 3], [0, 2], [2, 3]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition52"
+regex = '''((..)|(.))'''
+input = '''aaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition53"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition54"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 3], [], [2, 3], [3, 4], [], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition56"
+regex = '''((..)|(.)){1}'''
+input = '''aaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition57"
+regex = '''((..)|(.)){2}'''
+input = '''aaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition59"
+regex = '''((..)|(.)){3}'''
+input = '''aaaa'''
+captures = [[[0, 4], [3, 4], [0, 2], [3, 4]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition61"
+regex = '''((..)|(.))*'''
+input = '''aaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition63"
+regex = '''((..)|(.))'''
+input = '''aaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition64"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition65"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaaa'''
+captures = [[[0, 5], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 5], [], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition67"
+regex = '''((..)|(.)){1}'''
+input = '''aaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition68"
+regex = '''((..)|(.)){2}'''
+input = '''aaaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition70"
+regex = '''((..)|(.)){3}'''
+input = '''aaaaa'''
+captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition73"
+regex = '''((..)|(.))*'''
+input = '''aaaaa'''
+captures = [[[0, 5], [4, 5], [2, 4], [4, 5]]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition75"
+regex = '''((..)|(.))'''
+input = '''aaaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition76"
+regex = '''((..)|(.))((..)|(.))'''
+input = '''aaaaaa'''
+captures = [[[0, 4], [0, 2], [0, 2], [], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition77"
+regex = '''((..)|(.))((..)|(.))((..)|(.))'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [0, 2], [0, 2], [], [2, 4], [2, 4], [], [4, 6], [4, 6], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition79"
+regex = '''((..)|(.)){1}'''
+input = '''aaaaaa'''
+captures = [[[0, 2], [0, 2], [0, 2], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition80"
+regex = '''((..)|(.)){2}'''
+input = '''aaaaaa'''
+captures = [[[0, 4], [2, 4], [2, 4], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition81"
+regex = '''((..)|(.)){3}'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [4, 6], [4, 6], []]]
+match_limit = 1
+unescape = true
+
+[[tests]]
+name = "repetition83"
+regex = '''((..)|(.))*'''
+input = '''aaaaaa'''
+captures = [[[0, 6], [4, 6], [4, 6], []]]
+match_limit = 1
+unescape = true
+
diff --git a/tests/data/iter.toml b/tests/data/iter.toml

new file mode 100644 (file)

index 0000000..6c0539f
--- /dev/null
+++ b/tests/data/iter.toml
@@ -0,0 +1,119 @@
+[[tests]]
+name = "1"
+regex = "a"
+input = "aaa"
+matches = [[0, 1], [1, 2], [2, 3]]
+
+[[tests]]
+name = "2"
+regex = "a"
+input = "aba"
+matches = [[0, 1], [2, 3]]
+
+[[tests]]
+name = "empty1"
+regex = ''
+input = ''
+matches = [[0, 0]]
+
+[[tests]]
+name = "empty2"
+regex = ''
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty3"
+regex = '()'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty4"
+regex = '()*'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty5"
+regex = '()+'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty6"
+regex = '()?'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty7"
+regex = '()()'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty8"
+regex = '()+|z'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty9"
+regex = 'z|()+'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty10"
+regex = '()+|b'
+input = 'abc'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+
+[[tests]]
+name = "empty11"
+regex = 'b|()+'
+input = 'abc'
+matches = [[0, 0], [1, 2], [3, 3]]
+
+[[tests]]
+name = "start1"
+regex = "^a"
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "start2"
+regex = "^a"
+input = "aa"
+matches = [[0, 1]]
+
+[[tests]]
+name = "anchored1"
+regex = "a"
+input = "a"
+matches = [[0, 1]]
+anchored = true
+
+# This test is pretty subtle. It demonstrates the crucial difference between
+# '^a' and 'a' compiled in 'anchored' mode. The former regex exclusively
+# matches at the start of a haystack and nowhere else. The latter regex has
+# no such restriction, but its automaton is constructed such that it lacks a
+# `.*?` prefix. So it can actually produce matches at multiple locations.
+# The anchored3 test drives this point home.
+[[tests]]
+name = "anchored2"
+regex = "a"
+input = "aa"
+matches = [[0, 1], [1, 2]]
+anchored = true
+
+# Unlike anchored2, this test stops matching anything after it sees `b`
+# since it lacks a `.*?` prefix. Since it is looking for 'a' but sees 'b', it
+# determines that there are no remaining matches.
+[[tests]]
+name = "anchored3"
+regex = "a"
+input = "aaba"
+matches = [[0, 1], [1, 2]]
+anchored = true
diff --git a/tests/data/misc.toml b/tests/data/misc.toml

new file mode 100644 (file)

index 0000000..c05418d
--- /dev/null
+++ b/tests/data/misc.toml
@@ -0,0 +1,99 @@
+[[tests]]
+name = "ascii-literal"
+regex = "a"
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "ascii-literal-not"
+regex = "a"
+input = "z"
+matches = []
+
+[[tests]]
+name = "ascii-literal-anchored"
+regex = "a"
+input = "a"
+matches = [[0, 1]]
+anchored = true
+
+[[tests]]
+name = "ascii-literal-anchored-not"
+regex = "a"
+input = "z"
+matches = []
+anchored = true
+
+[[tests]]
+name = "anchor-start-end-line"
+regex = '(?m)^bar$'
+input = "foo\nbar\nbaz"
+matches = [[4, 7]]
+
+[[tests]]
+name = "prefix-literal-match"
+regex = '^abc'
+input = "abc"
+matches = [[0, 3]]
+
+[[tests]]
+name = "prefix-literal-match-ascii"
+regex = '^abc'
+input = "abc"
+matches = [[0, 3]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "prefix-literal-no-match"
+regex = '^abc'
+input = "zabc"
+matches = []
+
+[[tests]]
+name = "one-literal-edge"
+regex = 'abc'
+input = "xxxxxab"
+matches = []
+
+[[tests]]
+name = "terminates"
+regex = 'a$'
+input = "a"
+matches = [[0, 1]]
+
+[[tests]]
+name = "suffix-100"
+regex = '.*abcd'
+input = "abcd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "suffix-200"
+regex = '.*(?:abcd)+'
+input = "abcd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "suffix-300"
+regex = '.*(?:abcd)+'
+input = "abcdabcd"
+matches = [[0, 8]]
+
+[[tests]]
+name = "suffix-400"
+regex = '.*(?:abcd)+'
+input = "abcdxabcd"
+matches = [[0, 9]]
+
+[[tests]]
+name = "suffix-500"
+regex = '.*x(?:abcd)+'
+input = "abcdxabcd"
+matches = [[0, 9]]
+
+[[tests]]
+name = "suffix-600"
+regex = '[^abcd]*x(?:abcd)+'
+input = "abcdxabcd"
+matches = [[4, 9]]
diff --git a/tests/data/multiline.toml b/tests/data/multiline.toml

new file mode 100644 (file)

index 0000000..cefdb26
--- /dev/null
+++ b/tests/data/multiline.toml
@@ -0,0 +1,275 @@
+[[tests]]
+name = "basic1"
+regex = '(?m)^[a-z]+$'
+input = "abc\ndef\nxyz"
+matches = [[0, 3], [4, 7], [8, 11]]
+
+[[tests]]
+name = "basic2"
+regex = '(?m)^$'
+input = "abc\ndef\nxyz"
+matches = []
+
+[[tests]]
+name = "basic3"
+regex = '(?m)^'
+input = "abc\ndef\nxyz"
+matches = [[0, 0], [4, 4], [8, 8]]
+
+[[tests]]
+name = "basic4"
+regex = '(?m)$'
+input = "abc\ndef\nxyz"
+matches = [[3, 3], [7, 7], [11, 11]]
+
+[[tests]]
+name = "basic5"
+regex = '(?m)^[a-z]'
+input = "abc\ndef\nxyz"
+matches = [[0, 1], [4, 5], [8, 9]]
+
+[[tests]]
+name = "basic6"
+regex = '(?m)[a-z]^'
+input = "abc\ndef\nxyz"
+matches = []
+
+[[tests]]
+name = "basic7"
+regex = '(?m)[a-z]$'
+input = "abc\ndef\nxyz"
+matches = [[2, 3], [6, 7], [10, 11]]
+
+[[tests]]
+name = "basic8"
+regex = '(?m)$[a-z]'
+input = "abc\ndef\nxyz"
+matches = []
+
+[[tests]]
+name = "basic9"
+regex = '(?m)^$'
+input = ""
+matches = [[0, 0]]
+
+[[tests]]
+name = "repeat1"
+regex = '(?m)(?:^$)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "repeat1-no-multi"
+regex = '(?:^$)*'
+input = "a\nb\nc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+[[tests]]
+name = "repeat2"
+regex = '(?m)(?:^|a)+'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat100"
+regex = '(?m)(?:^|a)+'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat2-no-multi"
+regex = '(?:^|a)+'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 5]]
+
+[[tests]]
+name = "repeat3"
+regex = '(?m)(?:^|a)*'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat3-no-multi"
+regex = '(?:^|a)*'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
+
+[[tests]]
+name = "repeat4"
+regex = '(?m)(?:^|a+)'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat4-no-multi"
+regex = '(?:^|a+)'
+input = "a\naaa\n"
+matches = [[0, 0], [2, 5]]
+
+[[tests]]
+name = "repeat5"
+regex = '(?m)(?:^|a*)'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 5], [6, 6]]
+
+[[tests]]
+name = "repeat5-no-multi"
+regex = '(?:^|a*)'
+input = "a\naaa\n"
+matches = [[0, 0], [1, 1], [2, 5], [6, 6]]
+
+[[tests]]
+name = "repeat6"
+regex = '(?m)(?:^[a-z])+'
+input = "abc\ndef\nxyz"
+matches = [[0, 1], [4, 5], [8, 9]]
+
+[[tests]]
+name = "repeat6-no-multi"
+regex = '(?:^[a-z])+'
+input = "abc\ndef\nxyz"
+matches = [[0, 1]]
+
+[[tests]]
+name = "repeat7"
+regex = '(?m)(?:^[a-z]{3}\n?)+'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat7-no-multi"
+regex = '(?:^[a-z]{3}\n?)+'
+input = "abc\ndef\nxyz"
+matches = [[0, 4]]
+
+[[tests]]
+name = "repeat8"
+regex = '(?m)(?:^[a-z]{3}\n?)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat8-no-multi"
+regex = '(?:^[a-z]{3}\n?)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]]
+
+[[tests]]
+name = "repeat9"
+regex = '(?m)(?:\n?[a-z]{3}$)+'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat9-no-multi"
+regex = '(?:\n?[a-z]{3}$)+'
+input = "abc\ndef\nxyz"
+matches = [[7, 11]]
+
+[[tests]]
+name = "repeat10"
+regex = '(?m)(?:\n?[a-z]{3}$)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 11]]
+
+[[tests]]
+name = "repeat10-no-multi"
+regex = '(?:\n?[a-z]{3}$)*'
+input = "abc\ndef\nxyz"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 11]]
+
+[[tests]]
+name = "repeat11"
+regex = '(?m)^*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat11-no-multi"
+regex = '^*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat12"
+regex = '(?m)^+'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [4, 4]]
+
+[[tests]]
+name = "repeat12-no-multi"
+regex = '^+'
+input = "\naa\n"
+matches = [[0, 0]]
+
+[[tests]]
+name = "repeat13"
+regex = '(?m)$*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat13-no-multi"
+regex = '$*'
+input = "\naa\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat14"
+regex = '(?m)$+'
+input = "\naa\n"
+matches = [[0, 0], [3, 3], [4, 4]]
+
+[[tests]]
+name = "repeat14-no-multi"
+regex = '$+'
+input = "\naa\n"
+matches = [[4, 4]]
+
+[[tests]]
+name = "repeat15"
+regex = '(?m)(?:$\n)+'
+input = "\n\naaa\n\n"
+matches = [[0, 2], [5, 7]]
+
+[[tests]]
+name = "repeat15-no-multi"
+regex = '(?:$\n)+'
+input = "\n\naaa\n\n"
+matches = []
+
+[[tests]]
+name = "repeat16"
+regex = '(?m)(?:$\n)*'
+input = "\n\naaa\n\n"
+matches = [[0, 2], [3, 3], [4, 4], [5, 7]]
+
+[[tests]]
+name = "repeat16-no-multi"
+regex = '(?:$\n)*'
+input = "\n\naaa\n\n"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]]
+
+[[tests]]
+name = "repeat17"
+regex = '(?m)(?:$\n^)+'
+input = "\n\naaa\n\n"
+matches = [[0, 2], [5, 7]]
+
+[[tests]]
+name = "repeat17-no-multi"
+regex = '(?:$\n^)+'
+input = "\n\naaa\n\n"
+matches = []
+
+[[tests]]
+name = "repeat18"
+regex = '(?m)(?:^|$)+'
+input = "\n\naaa\n\n"
+matches = [[0, 0], [1, 1], [2, 2], [5, 5], [6, 6], [7, 7]]
+
+[[tests]]
+name = "repeat18-no-multi"
+regex = '(?:^|$)+'
+input = "\n\naaa\n\n"
+matches = [[0, 0], [7, 7]]
diff --git a/tests/data/no-unicode.toml b/tests/data/no-unicode.toml

new file mode 100644 (file)

index 0000000..c7fc966
--- /dev/null
+++ b/tests/data/no-unicode.toml
@@ -0,0 +1,158 @@
+[[tests]]
+name = "invalid-utf8-literal1"
+regex = '\xFF'
+input = '\xFF'
+matches = [[0, 1]]
+unicode = false
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "mixed"
+regex = '(.+)(?-u)(.+)'
+input = '\xCE\x93\xCE\x94\xFF'
+matches = [[0, 5]]
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "case1"
+regex = "a"
+input = "A"
+matches = [[0, 1]]
+case_insensitive = true
+unicode = false
+
+[[tests]]
+name = "case2"
+regex = "[a-z]+"
+input = "AaAaA"
+matches = [[0, 5]]
+case_insensitive = true
+unicode = false
+
+[[tests]]
+name = "case3"
+regex = "[a-z]+"
+input = "aA\u212AaA"
+matches = [[0, 7]]
+case_insensitive = true
+
+[[tests]]
+name = "case4"
+regex = "[a-z]+"
+input = "aA\u212AaA"
+matches = [[0, 2], [5, 7]]
+case_insensitive = true
+unicode = false
+
+
+[[tests]]
+name = "negate1"
+regex = "[^a]"
+input = "δ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "negate2"
+regex = "[^a]"
+input = "δ"
+matches = [[0, 1], [1, 2]]
+unicode = false
+utf8 = false
+
+
+[[tests]]
+name = "dotstar-prefix1"
+regex = "a"
+input = '\xFFa'
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+unescape = true
+
+[[tests]]
+name = "dotstar-prefix2"
+regex = "a"
+input = '\xFFa'
+matches = [[1, 2]]
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "null-bytes1"
+regex = '[^\x00]+\x00'
+input = 'foo\x00'
+matches = [[0, 4]]
+unicode = false
+utf8 = false
+unescape = true
+
+
+[[tests]]
+name = "word-ascii"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 1]]
+unicode = false
+
+[[tests]]
+name = "word-unicode"
+regex = '\w+'
+input = "aδ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "decimal-ascii"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 1], [7, 8]]
+unicode = false
+
+[[tests]]
+name = "decimal-unicode"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+
+[[tests]]
+name = "space-ascii"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 1]]
+unicode = false
+
+[[tests]]
+name = "space-unicode"
+regex = '\s+'
+input = " \u1680"
+matches = [[0, 4]]
+
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "iter1-bytes"
+regex = ''
+input = "☃"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+utf8 = false
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+name = "iter1-utf8"
+regex = ''
+input = "☃"
+matches = [[0, 0], [3, 3]]
+
+[[tests]]
+# See: https://github.com/rust-lang/regex/issues/484
+# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
+name = "iter2-bytes"
+regex = ''
+input = 'b\xFFr'
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+unescape = true
+utf8 = false
diff --git a/tests/data/overlapping.toml b/tests/data/overlapping.toml

new file mode 100644 (file)

index 0000000..6662876
--- /dev/null
+++ b/tests/data/overlapping.toml
@@ -0,0 +1,126 @@
+[[tests]]
+name = "repetition-plus-leftmost-first-100"
+regex = 'a+'
+input = "aaa"
+matches = [[0, 1], [0, 2], [0, 3]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-plus-all-100"
+regex = 'a+'
+input = "aaa"
+matches = [[0, 1], [0, 2], [0, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-plus-leftmost-first-200"
+regex = '(abc)+'
+input = "zzabcabczzabc"
+matches = [[2, 5], [2, 8]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-plus-all-200"
+regex = '(abc)+'
+input = "zzabcabczzabc"
+matches = [[2, 5], [2, 8], [10, 13]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-leftmost-first-100"
+regex = 'a*'
+input = "aaa"
+matches = [[0, 0], [0, 1], [0, 2], [0, 3]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-all-100"
+regex = 'a*'
+input = "aaa"
+matches = [[0, 0], [0, 1], [0, 2], [0, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-leftmost-first-200"
+regex = '(abc)*'
+input = "zzabcabczzabc"
+matches = [[0, 0]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "repetition-star-all-200"
+regex = '(abc)*'
+input = "zzabcabczzabc"
+matches = [
+  [0, 0], [1, 1], [2, 2], [3, 3], [4, 4],
+  [2, 5],
+  [6, 6], [7, 7],
+  [2, 8],
+  [9, 9], [10, 10], [11, 11], [12, 12],
+  [10, 13],
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "start-end-rep-leftmost-first"
+regex = '(^$)*'
+input = "abc"
+matches = [[0, 0]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "start-end-rep-all"
+regex = '(^$)*'
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "alt-leftmost-first-100"
+regex = 'abc|a'
+input = "zzabcazzaabc"
+matches = [[2, 3], [2, 5]]
+match_kind = "leftmost-first"
+search_kind = "overlapping"
+
+[[tests]]
+name = "alt-all-100"
+regex = 'abc|a'
+input = "zzabcazzaabc"
+matches = [[2, 3], [2, 5], [5, 6], [8, 9], [9, 10], [9, 12]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty-000"
+regex = ""
+input = "abc"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty-alt-000"
+regex = "|b"
+input = "abc"
+matches = [[0, 0], [1, 1], [1, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty-alt-010"
+regex = "b|"
+input = "abc"
+matches = [[0, 0], [1, 1], [1, 2], [3, 3]]
+match_kind = "all"
+search_kind = "overlapping"
diff --git a/tests/data/regression.toml b/tests/data/regression.toml

new file mode 100644 (file)

index 0000000..6a4dbb1
--- /dev/null
+++ b/tests/data/regression.toml
@@ -0,0 +1,423 @@
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-100"
+regex = '(*)'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-200"
+regex = '(?:?)'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-300"
+regex = '(?)'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/48
+[[tests]]
+name = "invalid-regex-no-crash-400"
+regex = '*'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/75
+[[tests]]
+name = "unsorted-binary-search-100"
+regex = '(?i-u)[a_]+'
+input = "A_"
+matches = [[0, 2]]
+
+# See: https://github.com/rust-lang/regex/issues/75
+[[tests]]
+name = "unsorted-binary-search-200"
+regex = '(?i-u)[A_]+'
+input = "a_"
+matches = [[0, 2]]
+
+# See: https://github.com/rust-lang/regex/issues/76
+[[tests]]
+name = "unicode-case-lower-nocase-flag"
+regex = '(?i)\p{Ll}+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+# See: https://github.com/rust-lang/regex/issues/99
+[[tests]]
+name = "negated-char-class-100"
+regex = '(?i)[^x]'
+input = "x"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/99
+[[tests]]
+name = "negated-char-class-200"
+regex = '(?i)[^x]'
+input = "X"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/101
+[[tests]]
+name = "ascii-word-underscore"
+regex = '[[:word:]]'
+input = "_"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/129
+[[tests]]
+name = "captures-repeat"
+regex = '([a-f]){2}(?P<foo>[x-z])'
+input = "abx"
+captures = [
+  [[0, 3], [0, 2], [2, 3]],
+]
+
+# See: https://github.com/rust-lang/regex/issues/153
+[[tests]]
+name = "alt-in-alt-100"
+regex = 'ab?|$'
+input = "az"
+matches = [[0, 1], [2, 2]]
+
+# See: https://github.com/rust-lang/regex/issues/153
+[[tests]]
+name = "alt-in-alt-200"
+regex = '^(.*?)(\n|\r\n?|$)'
+input = "ab\rcd"
+matches = [[0, 3]]
+
+# See: https://github.com/rust-lang/regex/issues/169
+[[tests]]
+name = "leftmost-first-prefix"
+regex = 'z*azb'
+input = "azb"
+matches = [[0, 3]]
+
+# See: https://github.com/rust-lang/regex/issues/191
+[[tests]]
+name = "many-alternates"
+regex = '1|2|3|4|5|6|7|8|9|10|int'
+input = "int"
+matches = [[0, 3]]
+
+# See: https://github.com/rust-lang/regex/issues/204
+[[tests]]
+name = "word-boundary-alone-100"
+regex = '\b'
+input = "Should this (work?)"
+matches = [[0, 0], [6, 6], [7, 7], [11, 11], [13, 13], [17, 17]]
+
+# See: https://github.com/rust-lang/regex/issues/204
+[[tests]]
+name = "word-boundary-alone-200"
+regex = '\b'
+input = "a b c"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+
+# See: https://github.com/rust-lang/regex/issues/264
+[[tests]]
+name = "word-boundary-ascii-no-capture"
+regex = '\B'
+input = "\U00028F3E"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/264
+[[tests]]
+name = "word-boundary-ascii-capture"
+regex = '(\B)'
+input = "\U00028F3E"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/268
+[[tests]]
+name = "partial-anchor"
+regex = '^a|b'
+input = "ba"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "endl-or-word-boundary"
+regex = '(?m:$)|(?-u:\b)'
+input = "\U0006084E"
+matches = [[4, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "zero-or-end"
+regex = '(?i-u:\x00)|$'
+input = "\U000E682F"
+matches = [[4, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "y-or-endl"
+regex = '(?i-u:y)|(?m:$)'
+input = "\U000B4331"
+matches = [[4, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "word-boundary-start-x"
+regex = '(?u:\b)^(?-u:X)'
+input = "X"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "word-boundary-ascii-start-x"
+regex = '(?-u:\b)^(?-u:X)'
+input = "X"
+matches = [[0, 1]]
+
+# See: https://github.com/rust-lang/regex/issues/271
+[[tests]]
+name = "end-not-word-boundary"
+regex = '$\B'
+input = "\U0005C124\U000B576C"
+matches = [[8, 8]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/280
+[[tests]]
+name = "partial-anchor-alternate-begin"
+regex = '^a|z'
+input = "yyyyya"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/280
+[[tests]]
+name = "partial-anchor-alternate-end"
+regex = 'a$|z'
+input = "ayyyyy"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/289
+[[tests]]
+name = "lits-unambiguous-100"
+regex = '(ABC|CDA|BC)X'
+input = "CDAX"
+matches = [[0, 4]]
+
+# See: https://github.com/rust-lang/regex/issues/291
+[[tests]]
+name = "lits-unambiguous-200"
+regex = '((IMG|CAM|MG|MB2)_|(DSCN|CIMG))(?P<n>[0-9]+)$'
+input = "CIMG2341"
+captures = [
+  [[0, 8], [0, 4], [], [0, 4], [4, 8]],
+]
+
+# See: https://github.com/rust-lang/regex/issues/303
+[[tests]]
+name = "negated-full-byte-range"
+regex = '[^\x00-\xFF]'
+input = ""
+matches = []
+compiles = false
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/321
+[[tests]]
+name = "strange-anchor-non-complete-prefix"
+regex = 'a^{2}'
+input = ""
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/321
+[[tests]]
+name = "strange-anchor-non-complete-suffix"
+regex = '${2}a'
+input = ""
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/334
+# See: https://github.com/rust-lang/regex/issues/557
+[[tests]]
+name = "captures-after-dfa-premature-end-100"
+regex = 'a(b*(X|$))?'
+input = "abcbX"
+captures = [
+  [[0, 1], [], []],
+]
+
+# See: https://github.com/rust-lang/regex/issues/334
+# See: https://github.com/rust-lang/regex/issues/557
+[[tests]]
+name = "captures-after-dfa-premature-end-200"
+regex = 'a(bc*(X|$))?'
+input = "abcbX"
+captures = [
+  [[0, 1], [], []],
+]
+
+# See: https://github.com/rust-lang/regex/issues/334
+# See: https://github.com/rust-lang/regex/issues/557
+[[tests]]
+name = "captures-after-dfa-premature-end-300"
+regex = '(aa$)?'
+input = "aaz"
+captures = [
+  [[0, 0]],
+  [[1, 1]],
+  [[2, 2]],
+  [[3, 3]],
+]
+
+# See: https://github.com/rust-lang/regex/issues/437
+[[tests]]
+name = "literal-panic"
+regex = 'typename type\-parameter\-[0-9]+\-[0-9]+::.+'
+input = "test"
+matches = []
+
+# See: https://github.com/rust-lang/regex/issues/527
+[[tests]]
+name = "empty-flag-expr"
+regex = '(((?x)))'
+input = ""
+matches = [[0, 0]]
+
+# See: https://github.com/rust-lang/regex/issues/533
+[[tests]]
+name = "blank-matches-nothing-between-space-and-tab"
+regex = '[[:blank:]]'
+input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
+match = false
+unescape = true
+
+# See: https://github.com/rust-lang/regex/issues/533
+[[tests]]
+name = "blank-matches-nothing-between-space-and-tab-inverted"
+regex = '^[[:^blank:]]+$'
+input = '\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
+match = true
+unescape = true
+
+# See: https://github.com/rust-lang/regex/issues/555
+[[tests]]
+name = "invalid-repetition"
+regex = '(?m){1,1}'
+input = ""
+matches = []
+compiles = false
+
+# See: https://github.com/rust-lang/regex/issues/640
+[[tests]]
+name = "flags-are-unset"
+regex = '((?i)foo)|Bar'
+input = "foo Foo bar Bar"
+matches = [[0, 3], [4, 7], [12, 15]]
+
+# Note that 'Ј' is not 'j', but the Cyrillic letter Je
+# https://en.wikipedia.org/wiki/Je_(Cyrillic)
+#
+# See: https://github.com/rust-lang/regex/issues/659
+[[tests]]
+name = "empty-group-with-unicode"
+regex = '()Ј01'
+input = 'zЈ01'
+matches = [[1, 5]]
+
+# See: https://github.com/rust-lang/regex/issues/579
+[[tests]]
+name = "word-boundary-weird"
+regex = '\b..\b'
+input = "I have 12, he has 2!"
+matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
+
+# See: https://github.com/rust-lang/regex/issues/579
+[[tests]]
+name = "word-boundary-weird-ascii"
+regex = '\b..\b'
+input = "I have 12, he has 2!"
+matches = [[0, 2], [7, 9], [9, 11], [11, 13], [17, 19]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/rust-lang/regex/issues/579
+[[tests]]
+name = "word-boundary-weird-minimal-ascii"
+regex = '\b..\b'
+input = "az,,b"
+matches = [[0, 2], [2, 4]]
+unicode = false
+utf8 = false
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1203
+[[tests]]
+name = "reverse-suffix-100"
+regex = '[0-4][0-4][0-4]000'
+input = "153.230000"
+matches = [[4, 10]]
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1203
+[[tests]]
+name = "reverse-suffix-200"
+regex = '[0-9][0-9][0-9]000'
+input = "153.230000\n"
+matches = [[4, 10]]
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1247
+[[tests]]
+name = "stops"
+regex = '\bs(?:[ab])'
+input = 's\xE4'
+matches = []
+unescape = true
+
+# See: https://github.com/BurntSushi/ripgrep/issues/1247
+[[tests]]
+name = "stops-ascii"
+regex = '(?-u:\b)s(?:[ab])'
+input = 's\xE4'
+matches = []
+unescape = true
+
+# There is no issue for this bug.
+[[tests]]
+name = "anchored-prefix-100"
+regex = '^a[[:^space:]]'
+input = "a "
+matches = []
+
+# There is no issue for this bug.
+[[tests]]
+name = "anchored-prefix-200"
+regex = '^a[[:^space:]]'
+input = "foo boo a"
+matches = []
+
+# There is no issue for this bug.
+[[tests]]
+name = "anchored-prefix-300"
+regex = '^-[a-z]'
+input = "r-f"
+matches = []
+
+# Tests that a possible Aho-Corasick optimization works correctly. It only
+# kicks in when we have a lot of literals. By "works correctly," we mean that
+# leftmost-first match semantics are properly respected. That is, samwise
+# should match, not sam.
+#
+# There is no issue for this bug.
+[[tests]]
+name = "aho-corasick-100"
+regex = 'samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z'
+input = "samwise"
+matches = [[0, 7]]
diff --git a/tests/data/set.toml b/tests/data/set.toml

new file mode 100644 (file)

index 0000000..e0eb058
--- /dev/null
+++ b/tests/data/set.toml
@@ -0,0 +1,523 @@
+[[tests]]
+name = "basic10"
+regexes = ["a", "a"]
+input = "a"
+matches = [
+  { id = 0, offsets = [0, 1] },
+  { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic10-leftmost-first"
+regexes = ["a", "a"]
+input = "a"
+matches = [
+  { id = 0, offsets = [0, 1] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic20"
+regexes = ["a", "a"]
+input = "ba"
+matches = [
+  { id = 0, offsets = [1, 2] },
+  { id = 1, offsets = [1, 2] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic30"
+regexes = ["a", "b"]
+input = "a"
+matches = [
+  { id = 0, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic40"
+regexes = ["a", "b"]
+input = "b"
+matches = [
+  { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic50"
+regexes = ["a|b", "b|a"]
+input = "b"
+matches = [
+  { id = 0, offsets = [0, 1] },
+  { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic60"
+regexes = ["foo", "oo"]
+input = "foo"
+matches = [
+  { id = 0, offsets = [0, 3] },
+  { id = 1, offsets = [1, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic60-leftmost-first"
+regexes = ["foo", "oo"]
+input = "foo"
+matches = [
+  { id = 0, offsets = [0, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic61"
+regexes = ["oo", "foo"]
+input = "foo"
+matches = [
+  { id = 1, offsets = [0, 3] },
+  { id = 0, offsets = [1, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic61-leftmost-first"
+regexes = ["oo", "foo"]
+input = "foo"
+matches = [
+  { id = 1, offsets = [0, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic70"
+regexes = ["abcd", "bcd", "cd", "d"]
+input = "abcd"
+matches = [
+  { id = 0, offsets = [0, 4] },
+  { id = 1, offsets = [1, 4] },
+  { id = 2, offsets = [2, 4] },
+  { id = 3, offsets = [3, 4] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic71"
+regexes = ["bcd", "cd", "d", "abcd"]
+input = "abcd"
+matches = [
+  { id = 3, offsets = [0, 4] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "basic80"
+regexes = ["^foo", "bar$"]
+input = "foo"
+matches = [
+  { id = 0, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic81"
+regexes = ["^foo", "bar$"]
+input = "foo bar"
+matches = [
+  { id = 0, offsets = [0, 3] },
+  { id = 1, offsets = [4, 7] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic82"
+regexes = ["^foo", "bar$"]
+input = "bar"
+matches = [
+  { id = 1, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic90"
+regexes = ["[a-z]+$", "foo"]
+input = "01234 foo"
+matches = [
+  { id = 0, offsets = [6, 9] },
+  { id = 1, offsets = [6, 9] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic91"
+regexes = ["[a-z]+$", "foo"]
+input = "foo 01234"
+matches = [
+  { id = 1, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic100"
+regexes = [".*?", "a"]
+input = "zzza"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [0, 1] },
+  { id = 0, offsets = [0, 2] },
+  { id = 0, offsets = [0, 3] },
+  { id = 0, offsets = [0, 4] },
+  { id = 1, offsets = [3, 4] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic101"
+regexes = [".*", "a"]
+input = "zzza"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [0, 1] },
+  { id = 0, offsets = [0, 2] },
+  { id = 0, offsets = [0, 3] },
+  { id = 0, offsets = [0, 4] },
+  { id = 1, offsets = [3, 4] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic102"
+regexes = [".*", "a"]
+input = "zzz"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [0, 1] },
+  { id = 0, offsets = [0, 2] },
+  { id = 0, offsets = [0, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic110"
+regexes = ['\ba\b']
+input = "hello a bye"
+matches = [
+  { id = 0, offsets = [6, 7] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic111"
+regexes = ['\ba\b', '\be\b']
+input = "hello a bye e"
+matches = [
+  { id = 0, offsets = [6, 7] },
+  { id = 1, offsets = [12, 13] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic120"
+regexes = ["a"]
+input = "a"
+matches = [
+  { id = 0, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic121"
+regexes = [".*a"]
+input = "a"
+matches = [
+  { id = 0, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic122"
+regexes = [".*a", "β"]
+input = "β"
+matches = [
+  { id = 1, offsets = [0, 2] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "basic130"
+regexes = ["ab", "b"]
+input = "ba"
+matches = [
+  { id = 1, offsets = [0, 1] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty10"
+regexes = ["", "a"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 1, offsets = [0, 1] },
+  { id = 0, offsets = [1, 1] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty10-leftmost-first"
+regexes = ["", "a"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [1, 1] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty11"
+regexes = ["a", ""]
+input = "abc"
+matches = [
+  { id = 1, offsets = [0, 0] },
+  { id = 0, offsets = [0, 1] },
+  { id = 1, offsets = [1, 1] },
+  { id = 1, offsets = [2, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty11-leftmost-first"
+regexes = ["a", ""]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 1] },
+  { id = 1, offsets = [2, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty20"
+regexes = ["", "b"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [1, 1] },
+  { id = 1, offsets = [1, 2] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty20-leftmost-first"
+regexes = ["", "b"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [1, 1] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty21"
+regexes = ["b", ""]
+input = "abc"
+matches = [
+  { id = 1, offsets = [0, 0] },
+  { id = 1, offsets = [1, 1] },
+  { id = 0, offsets = [1, 2] },
+  { id = 1, offsets = [2, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty21-leftmost-first"
+regexes = ["b", ""]
+input = "abc"
+matches = [
+  { id = 1, offsets = [0, 0] },
+  { id = 0, offsets = [1, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty22"
+regexes = ["(?:)", "b"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [1, 1] },
+  { id = 1, offsets = [1, 2] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty23"
+regexes = ["b", "(?:)"]
+input = "abc"
+matches = [
+  { id = 1, offsets = [0, 0] },
+  { id = 1, offsets = [1, 1] },
+  { id = 0, offsets = [1, 2] },
+  { id = 1, offsets = [2, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty30"
+regexes = ["", "z"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [1, 1] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty30-leftmost-first"
+regexes = ["", "z"]
+input = "abc"
+matches = [
+  { id = 0, offsets = [0, 0] },
+  { id = 0, offsets = [1, 1] },
+  { id = 0, offsets = [2, 2] },
+  { id = 0, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty31"
+regexes = ["z", ""]
+input = "abc"
+matches = [
+  { id = 1, offsets = [0, 0] },
+  { id = 1, offsets = [1, 1] },
+  { id = 1, offsets = [2, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty31-leftmost-first"
+regexes = ["z", ""]
+input = "abc"
+matches = [
+  { id = 1, offsets = [0, 0] },
+  { id = 1, offsets = [1, 1] },
+  { id = 1, offsets = [2, 2] },
+  { id = 1, offsets = [3, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "empty40"
+regexes = ["c(?:)", "b"]
+input = "abc"
+matches = [
+  { id = 1, offsets = [1, 2] },
+  { id = 0, offsets = [2, 3] },
+]
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "empty40-leftmost-first"
+regexes = ["c(?:)", "b"]
+input = "abc"
+matches = [
+  { id = 1, offsets = [1, 2] },
+  { id = 0, offsets = [2, 3] },
+]
+match_kind = "leftmost-first"
+search_kind = "leftmost"
+
+[[tests]]
+name = "nomatch10"
+regexes = ["a", "a"]
+input = "b"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "nomatch20"
+regexes = ["^foo", "bar$"]
+input = "bar foo"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "nomatch30"
+regexes = []
+input = "a"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
+
+[[tests]]
+name = "nomatch40"
+regexes = ["^rooted$", '\.log$']
+input = "notrooted"
+matches = []
+match_kind = "all"
+search_kind = "overlapping"
diff --git a/tests/data/unicode.toml b/tests/data/unicode.toml

new file mode 100644 (file)

index 0000000..016bbfd
--- /dev/null
+++ b/tests/data/unicode.toml
@@ -0,0 +1,514 @@
+# Basic Unicode literal support.
+[[tests]]
+name = "literal1"
+regex = '☃'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "literal2"
+regex = '☃+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "literal3"
+regex = '(?i)☃+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "literal4"
+regex = '(?i)Δ'
+input = "δ"
+matches = [[0, 2]]
+
+# Unicode word boundaries.
+[[tests]]
+name = "wb-100"
+regex = '\d\b'
+input = "6δ"
+matches = []
+
+[[tests]]
+name = "wb-200"
+regex = '\d\b'
+input = "6 "
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb-300"
+regex = '\d\B'
+input = "6δ"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb-400"
+regex = '\d\B'
+input = "6 "
+matches = []
+
+# Unicode character class support.
+[[tests]]
+name = "class1"
+regex = '[☃Ⅰ]+'
+input = "☃"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class2"
+regex = '\pN'
+input = "Ⅰ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class3"
+regex = '\pN+'
+input = "Ⅰ1Ⅱ2"
+matches = [[0, 8]]
+
+[[tests]]
+name = "class4"
+regex = '\PN+'
+input = "abⅠ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class5"
+regex = '[\PN]+'
+input = "abⅠ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class6"
+regex = '[^\PN]+'
+input = "abⅠ"
+matches = [[2, 5]]
+
+[[tests]]
+name = "class7"
+regex = '\p{Lu}+'
+input = "ΛΘΓΔα"
+matches = [[0, 8]]
+
+[[tests]]
+name = "class8"
+regex = '(?i)\p{Lu}+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+[[tests]]
+name = "class9"
+regex = '\pL+'
+input = "ΛΘΓΔα"
+matches = [[0, 10]]
+
+[[tests]]
+name = "class10"
+regex = '\p{Ll}+'
+input = "ΛΘΓΔα"
+matches = [[8, 10]]
+
+# Unicode aware "Perl" character classes.
+[[tests]]
+name = "perl1"
+regex = '\w+'
+input = "dδd"
+matches = [[0, 4]]
+
+[[tests]]
+name = "perl2"
+regex = '\w+'
+input = "⥡"
+matches = []
+
+[[tests]]
+name = "perl3"
+regex = '\W+'
+input = "⥡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "perl4"
+regex = '\d+'
+input = "1२३9"
+matches = [[0, 8]]
+
+[[tests]]
+name = "perl5"
+regex = '\d+'
+input = "Ⅱ"
+matches = []
+
+[[tests]]
+name = "perl6"
+regex = '\D+'
+input = "Ⅱ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "perl7"
+regex = '\s+'
+input = " "
+matches = [[0, 3]]
+
+[[tests]]
+name = "perl8"
+regex = '\s+'
+input = "☃"
+matches = []
+
+[[tests]]
+name = "perl9"
+regex = '\S+'
+input = "☃"
+matches = [[0, 3]]
+
+# Specific tests for Unicode general category classes.
+[[tests]]
+name = "class-gencat1"
+regex = '\p{Cased_Letter}'
+input = "Ａ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat2"
+regex = '\p{Close_Punctuation}'
+input = "❯"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat3"
+regex = '\p{Connector_Punctuation}'
+input = "⁀"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat4"
+regex = '\p{Control}'
+input = "\u009F"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-gencat5"
+regex = '\p{Currency_Symbol}'
+input = "￡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat6"
+regex = '\p{Dash_Punctuation}'
+input = "〰"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat7"
+regex = '\p{Decimal_Number}'
+input = "𑓙"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat8"
+regex = '\p{Enclosing_Mark}'
+input = "\uA672"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat9"
+regex = '\p{Final_Punctuation}'
+input = "⸡"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat10"
+regex = '\p{Format}'
+input = "\U000E007F"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat11"
+regex = '\p{Initial_Punctuation}'
+input = "⸜"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat12"
+regex = '\p{Letter}'
+input = "Έ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-gencat13"
+regex = '\p{Letter_Number}'
+input = "ↂ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat14"
+regex = '\p{Line_Separator}'
+input = "\u2028"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat15"
+regex = '\p{Lowercase_Letter}'
+input = "ϛ"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-gencat16"
+regex = '\p{Mark}'
+input = "\U000E01EF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat17"
+regex = '\p{Math}'
+input = "⋿"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat18"
+regex = '\p{Modifier_Letter}'
+input = "𖭃"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat19"
+regex = '\p{Modifier_Symbol}'
+input = "🏿"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat20"
+regex = '\p{Nonspacing_Mark}'
+input = "\U0001E94A"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat21"
+regex = '\p{Number}'
+input = "⓿"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat22"
+regex = '\p{Open_Punctuation}'
+input = "｟"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat23"
+regex = '\p{Other}'
+input = "\u0BC9"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat24"
+regex = '\p{Other_Letter}'
+input = "ꓷ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat25"
+regex = '\p{Other_Number}'
+input = "㉏"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat26"
+regex = '\p{Other_Punctuation}'
+input = "𞥞"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat27"
+regex = '\p{Other_Symbol}'
+input = "⅌"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat28"
+regex = '\p{Paragraph_Separator}'
+input = "\u2029"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat29"
+regex = '\p{Private_Use}'
+input = "\U0010FFFD"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat30"
+regex = '\p{Punctuation}'
+input = "𑁍"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat31"
+regex = '\p{Separator}'
+input = "\u3000"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat32"
+regex = '\p{Space_Separator}'
+input = "\u205F"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat33"
+regex = '\p{Spacing_Mark}'
+input = "\U00016F7E"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat34"
+regex = '\p{Symbol}'
+input = "⯈"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat35"
+regex = '\p{Titlecase_Letter}'
+input = "ῼ"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gencat36"
+regex = '\p{Unassigned}'
+input = "\U0010FFFF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gencat37"
+regex = '\p{Uppercase_Letter}'
+input = "Ꝋ"
+matches = [[0, 3]]
+
+
+# Tests for Unicode emoji properties.
+[[tests]]
+name = "class-emoji1"
+regex = '\p{Emoji}'
+input = "\u23E9"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-emoji2"
+regex = '\p{emoji}'
+input = "\U0001F21A"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-emoji3"
+regex = '\p{extendedpictographic}'
+input = "\U0001FA6E"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-emoji4"
+regex = '\p{extendedpictographic}'
+input = "\U0001FFFD"
+matches = [[0, 4]]
+
+
+# Tests for Unicode grapheme cluster properties.
+[[tests]]
+name = "class-gcb1"
+regex = '\p{grapheme_cluster_break=prepend}'
+input = "\U00011D46"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb2"
+regex = '\p{gcb=regional_indicator}'
+input = "\U0001F1E6"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb3"
+regex = '\p{gcb=ri}'
+input = "\U0001F1E7"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb4"
+regex = '\p{regionalindicator}'
+input = "\U0001F1FF"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-gcb5"
+regex = '\p{gcb=lvt}'
+input = "\uC989"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-gcb6"
+regex = '\p{gcb=zwj}'
+input = "\u200D"
+matches = [[0, 3]]
+
+# Tests for Unicode word boundary properties.
+[[tests]]
+name = "class-word-break1"
+regex = '\p{word_break=Hebrew_Letter}'
+input = "\uFB46"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break2"
+regex = '\p{wb=hebrewletter}'
+input = "\uFB46"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break3"
+regex = '\p{wb=ExtendNumLet}'
+input = "\uFF3F"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break4"
+regex = '\p{wb=WSegSpace}'
+input = "\u3000"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-word-break5"
+regex = '\p{wb=numeric}'
+input = "\U0001E950"
+matches = [[0, 4]]
+
+# Tests for Unicode sentence boundary properties.
+[[tests]]
+name = "class-sentence-break1"
+regex = '\p{sentence_break=Lower}'
+input = "\u0469"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-sentence-break2"
+regex = '\p{sb=lower}'
+input = "\u0469"
+matches = [[0, 2]]
+
+[[tests]]
+name = "class-sentence-break3"
+regex = '\p{sb=Close}'
+input = "\uFF60"
+matches = [[0, 3]]
+
+[[tests]]
+name = "class-sentence-break4"
+regex = '\p{sb=Close}'
+input = "\U0001F677"
+matches = [[0, 4]]
+
+[[tests]]
+name = "class-sentence-break5"
+regex = '\p{sb=SContinue}'
+input = "\uFF64"
+matches = [[0, 3]]
diff --git a/tests/data/word-boundary.toml b/tests/data/word-boundary.toml

new file mode 100644 (file)

index 0000000..e84b25c
--- /dev/null
+++ b/tests/data/word-boundary.toml
@@ -0,0 +1,771 @@
+# Some of these are cribbed from RE2's test suite.
+
+# These test \b. Below are tests for \B.
+[[tests]]
+name = "wb1"
+regex = '\b'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb2"
+regex = '\b'
+input = "a"
+matches = [[0, 0], [1, 1]]
+unicode = false
+
+[[tests]]
+name = "wb3"
+regex = '\b'
+input = "ab"
+matches = [[0, 0], [2, 2]]
+unicode = false
+
+[[tests]]
+name = "wb4"
+regex = '^\b'
+input = "ab"
+matches = [[0, 0]]
+unicode = false
+
+[[tests]]
+name = "wb5"
+regex = '\b$'
+input = "ab"
+matches = [[2, 2]]
+unicode = false
+
+[[tests]]
+name = "wb6"
+regex = '^\b$'
+input = "ab"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb7"
+regex = '\bbar\b'
+input = "nobar bar foo bar"
+matches = [[6, 9], [14, 17]]
+unicode = false
+
+[[tests]]
+name = "wb8"
+regex = 'a\b'
+input = "faoa x"
+matches = [[3, 4]]
+unicode = false
+
+[[tests]]
+name = "wb9"
+regex = '\bbar'
+input = "bar x"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb10"
+regex = '\bbar'
+input = "foo\nbar x"
+matches = [[4, 7]]
+unicode = false
+
+[[tests]]
+name = "wb11"
+regex = 'bar\b'
+input = "foobar"
+matches = [[3, 6]]
+unicode = false
+
+[[tests]]
+name = "wb12"
+regex = 'bar\b'
+input = "foobar\nxxx"
+matches = [[3, 6]]
+unicode = false
+
+[[tests]]
+name = "wb13"
+regex = '(foo|bar|[A-Z])\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb14"
+regex = '(foo|bar|[A-Z])\b'
+input = "foo\n"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb15"
+regex = '\b(foo|bar|[A-Z])'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb16"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "X"
+matches = [[0, 1]]
+unicode = false
+
+[[tests]]
+name = "wb17"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "XY"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb18"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "bar"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb19"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb20"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "foo\n"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb21"
+regex = '\b(foo|bar|[A-Z])\b'
+input = "ffoo bbar N x"
+matches = [[10, 11]]
+unicode = false
+
+[[tests]]
+name = "wb22"
+regex = '\b(fo|foo)\b'
+input = "fo"
+matches = [[0, 2]]
+unicode = false
+
+[[tests]]
+name = "wb23"
+regex = '\b(fo|foo)\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb24"
+regex = '\b\b'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb25"
+regex = '\b\b'
+input = "a"
+matches = [[0, 0], [1, 1]]
+unicode = false
+
+[[tests]]
+name = "wb26"
+regex = '\b$'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb27"
+regex = '\b$'
+input = "x"
+matches = [[1, 1]]
+unicode = false
+
+[[tests]]
+name = "wb28"
+regex = '\b$'
+input = "y x"
+matches = [[3, 3]]
+unicode = false
+
+[[tests]]
+name = "wb29"
+regex = '(?-u:\b).$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb30"
+regex = '^\b(fo|foo)\b'
+input = "fo"
+matches = [[0, 2]]
+unicode = false
+
+[[tests]]
+name = "wb31"
+regex = '^\b(fo|foo)\b'
+input = "foo"
+matches = [[0, 3]]
+unicode = false
+
+[[tests]]
+name = "wb32"
+regex = '^\b$'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb33"
+regex = '^\b$'
+input = "x"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb34"
+regex = '^(?-u:\b).$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb35"
+regex = '^(?-u:\b).(?-u:\b)$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb36"
+regex = '^^^^^\b$$$$$'
+input = ""
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb37"
+regex = '^^^^^(?-u:\b).$$$$$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb38"
+regex = '^^^^^\b$$$$$'
+input = "x"
+matches = []
+unicode = false
+
+[[tests]]
+name = "wb39"
+regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
+input = "x"
+matches = [[0, 1]]
+
+[[tests]]
+name = "wb40"
+regex = '(?-u:\b).+(?-u:\b)'
+input = "$$abc$$"
+matches = [[2, 5]]
+
+[[tests]]
+name = "wb41"
+regex = '\b'
+input = "a b c"
+matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
+unicode = false
+
+[[tests]]
+name = "wb42"
+regex = '\bfoo\b'
+input = "zzz foo zzz"
+matches = [[4, 7]]
+unicode = false
+
+[[tests]]
+name = "wb43"
+regex = '\b^'
+input = "ab"
+matches = [[0, 0]]
+unicode = false
+
+[[tests]]
+name = "wb44"
+regex = '$\b'
+input = "ab"
+matches = [[2, 2]]
+unicode = false
+
+
+# Tests for \B. Note that ASCII \B is not allowed if UTF-8 mode is enabled, so
+# we have to disable UTF-8 mode for most of these tests. This is because ASCII
+# \B can match between the bytes of a multi-byte UTF-8 sequence.
+[[tests]]
+name = "nb1"
+regex = '\Bfoo\B'
+input = "n foo xfoox that"
+matches = [[7, 10]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb2"
+regex = 'a\B'
+input = "faoa x"
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb3"
+regex = '\Bbar'
+input = "bar x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb4"
+regex = '\Bbar'
+input = "foo\nbar x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb5"
+regex = 'bar\B'
+input = "foobar"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb6"
+regex = 'bar\B'
+input = "foobar\nxxx"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb7"
+regex = '(foo|bar|[A-Z])\B'
+input = "foox"
+matches = [[0, 3]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb8"
+regex = '(foo|bar|[A-Z])\B'
+input = "foo\n"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb9"
+regex = '\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb10"
+regex = '\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb11"
+regex = '\B(foo|bar|[A-Z])'
+input = "foo"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb12"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "xXy"
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb13"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "XY"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb14"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "XYZ"
+matches = [[1, 2]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb15"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "abara"
+matches = [[1, 4]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb16"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "xfoo_"
+matches = [[1, 4]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb17"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "xfoo\n"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb18"
+regex = '\B(foo|bar|[A-Z])\B'
+input = "foo bar vNX"
+matches = [[9, 10]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb19"
+regex = '\B(fo|foo)\B'
+input = "xfoo"
+matches = [[1, 3]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb20"
+regex = '\B(foo|fo)\B'
+input = "xfooo"
+matches = [[1, 4]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb21"
+regex = '\B\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb22"
+regex = '\B\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb23"
+regex = '\B$'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb24"
+regex = '\B$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb25"
+regex = '\B$'
+input = "y x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb26"
+regex = '\B.$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb27"
+regex = '^\B(fo|foo)\B'
+input = "fo"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb28"
+regex = '^\B(fo|foo)\B'
+input = "foo"  # nb27 already covers "fo"; mirrors the wb30/wb31 pair
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb29"
+regex = '^\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb30"
+regex = '^\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb31"
+regex = '^\B\B'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb32"
+regex = '^\B\B'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb33"
+regex = '^\B$'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb34"
+regex = '^\B$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb35"
+regex = '^\B.$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb36"
+regex = '^\B.\B$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb37"
+regex = '^^^^^\B$$$$$'
+input = ""
+matches = [[0, 0]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb38"
+regex = '^^^^^\B.$$$$$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "nb39"
+regex = '^^^^^\B$$$$$'
+input = "x"
+matches = []
+unicode = false
+utf8 = false
+
+
+# unicode1* and unicode2* work for both Unicode and ASCII because all matches
+# are reported as byte offsets, and « and » do not correspond to word
+# boundaries at either the character or byte level.
+[[tests]]
+name = "unicode1"
+regex = '\bx\b'
+input = "«x"
+matches = [[2, 3]]
+
+[[tests]]
+name = "unicode1-only-ascii"
+regex = '\bx\b'
+input = "«x"
+matches = [[2, 3]]
+unicode = false
+
+[[tests]]
+name = "unicode2"
+regex = '\bx\b'
+input = "x»"
+matches = [[0, 1]]
+
+[[tests]]
+name = "unicode2-only-ascii"
+regex = '\bx\b'
+input = "x»"
+matches = [[0, 1]]
+unicode = false
+
+# ASCII word boundaries are completely oblivious to Unicode characters, so
+# even though β is a word character, an ASCII \b treats it as a word boundary
+# when it is adjacent to an ASCII word character. (The ASCII \b only looks
+# at the leading byte of β.) For Unicode \b, the tests are precisely inverted.
+[[tests]]
+name = "unicode3"
+regex = '\bx\b'
+input = 'áxβ'
+matches = []
+
+[[tests]]
+name = "unicode3-only-ascii"
+regex = '\bx\b'
+input = 'áxβ'
+matches = [[2, 3]]
+unicode = false
+
+[[tests]]
+name = "unicode4"
+regex = '\Bx\B'
+input = 'áxβ'
+matches = [[2, 3]]
+
+[[tests]]
+name = "unicode4-only-ascii"
+regex = '\Bx\B'
+input = 'áxβ'
+matches = []
+unicode = false
+utf8 = false
+
+# The same as the unicode5-not tests below, but with \b instead of \B as a sanity check.
+[[tests]]
+name = "unicode5"
+regex = '\b'
+input = "0\U0007EF5E"
+matches = [[0, 0], [1, 1]]
+
+[[tests]]
+name = "unicode5-only-ascii"
+regex = '\b'
+input = "0\U0007EF5E"
+matches = [[0, 0], [1, 1]]
+unicode = false
+utf8 = false
+
+[[tests]]
+name = "unicode5-noutf8"
+regex = '\b'
+input = '0\xFF\xFF\xFF\xFF'
+matches = [[0, 0], [1, 1]]
+unescape = true
+utf8 = false
+
+[[tests]]
+name = "unicode5-noutf8-only-ascii"
+regex = '\b'
+input = '0\xFF\xFF\xFF\xFF'
+matches = [[0, 0], [1, 1]]
+unescape = true
+unicode = false
+utf8 = false
+
+# Weird special case to ensure that ASCII \B treats each individual code unit
+# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary
+# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the
+# \w character class.)
+[[tests]]
+name = "unicode5-not"
+regex = '\B'
+input = "0\U0007EF5E"
+matches = [[5, 5]]
+
+[[tests]]
+name = "unicode5-not-only-ascii"
+regex = '\B'
+input = "0\U0007EF5E"
+matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
+unicode = false
+utf8 = false
+
+# This gets no matches since \B only matches in the presence of valid UTF-8
+# when Unicode is enabled, even when UTF-8 mode is disabled.
+[[tests]]
+name = "unicode5-not-noutf8"
+regex = '\B'
+input = '0\xFF\xFF\xFF\xFF'
+matches = []
+unescape = true
+utf8 = false
+
+# But this DOES get matches since \B in ASCII mode only looks at individual
+# bytes.
+[[tests]]
+name = "unicode5-not-noutf8-only-ascii"
+regex = '\B'
+input = '0\xFF\xFF\xFF\xFF'
+matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
+unescape = true
+unicode = false
+utf8 = false
+
+# Some tests of no particular significance.
+[[tests]]
+name = "unicode6"
+regex = '\b[0-9]+\b'
+input = "foo 123 bar 456 quux 789"
+matches = [[4, 7], [12, 15], [21, 24]]
+
+[[tests]]
+name = "unicode7"
+regex = '\b[0-9]+\b'
+input = "foo 123 bar a456 quux 789"
+matches = [[4, 7], [22, 25]]
+
+[[tests]]
+name = "unicode8"
+regex = '\b[0-9]+\b'
+input = "foo 123 bar 456a quux 789"
+matches = [[4, 7], [22, 25]]
diff --git a/tests/dfa/api.rs b/tests/dfa/api.rs

new file mode 100644 (file)

index 0000000..80d7d70
--- /dev/null
+++ b/tests/dfa/api.rs
@@ -0,0 +1,133 @@
+use std::error::Error;
+
+use regex_automata::{
+    dfa::{dense, regex::Regex, Automaton, OverlappingState},
+    nfa::thompson,
+    HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that quit bytes in the forward direction work correctly.
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+    let dfa = dense::Builder::new()
+        .configure(dense::Config::new().quit(b'x', true)) // b'x' is a quit byte
+        .build("[[:word:]]+$")?;
+
+    assert_eq!(
+        dfa.find_earliest_fwd(b"abcxyz"), // the b'x' sits at offset 3
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_leftmost_fwd(b"abcxyz"), // same error for leftmost search
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_overlapping_fwd(b"abcxyz", &mut OverlappingState::start()),
+        Err(MatchError::Quit { byte: b'x', offset: 3 }) // and for overlapping
+    );
+
+    Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+    let dfa = dense::Builder::new()
+        .configure(dense::Config::new().quit(b'x', true)) // b'x' is a quit byte
+        .thompson(thompson::Config::new().reverse(true)) // build a reverse DFA
+        .build("^[[:word:]]+")?;
+
+    assert_eq!(
+        dfa.find_earliest_rev(b"abcxyz"), // the b'x' sits at offset 3
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_leftmost_rev(b"abcxyz"), // same error for leftmost search
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+
+    Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then building the
+// configuration itself panics (no DFA builder is involved in this test).
+#[test]
+#[should_panic]
+fn quit_panics() {
+    dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// Tests that if we attempt an overlapping search using a regex without a
+// reverse DFA compiled with 'starts_for_each_pattern', then we get a panic.
+#[test]
+#[should_panic]
+fn incorrect_config_overlapping_search_panics() {
+    let forward = dense::DFA::new(r"abca").unwrap();
+    let reverse = dense::Builder::new()
+        .configure(
+            dense::Config::new()
+                .anchored(true)
+                .match_kind(MatchKind::All)
+                .starts_for_each_pattern(false), // the deliberately wrong setting
+        )
+        .thompson(thompson::Config::new().reverse(true))
+        .build(r"abca")
+        .unwrap();
+
+    let re = Regex::builder().build_from_dfas(forward, reverse);
+    let haystack = "bar abcabcabca abca foo".as_bytes();
+    re.find_overlapping(haystack, &mut OverlappingState::start()); // result unused: only the panic matters
+}
+
+// This tests an interesting case where even if the Unicode word boundary option
+// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
+// word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+    let mut config = dense::Config::new();
+    for b in 0x80..=0xFF { // every non-ASCII byte value
+        config = config.quit(b, true);
+    }
+    let dfa = dense::Builder::new().configure(config).build(r"\b")?; // would fail via `?` if \b were rejected
+    let expected = HalfMatch::must(0, 1);
+    assert_eq!(dfa.find_leftmost_fwd(b" a"), Ok(Some(expected)));
+    Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results (every `a<digits>` occurrence, as (start, end) byte offsets).
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+    let re = Regex::new(r"a[0-9]+")
+        .unwrap()
+        .with_prefilter(SubstringPrefilter::new("a"));
+    let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+    let matches: Vec<(usize, usize)> =
+        re.find_leftmost_iter(text).map(|m| (m.start(), m.end())).collect();
+    assert_eq!(
+        matches,
+        vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),] // a1, a2, a3, a123, a456
+    );
+    Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+    let text = b"za123";
+    let re = Regex::new(r"a[0-9]+")
+        .unwrap()
+        .with_prefilter(SubstringPrefilter::new("a"));
+    assert_eq!(re.find_leftmost(b"za123"), Some(MultiMatch::must(0, 1, 5)));
+    assert_eq!(re.find_leftmost(b"a123"), Some(MultiMatch::must(0, 0, 4)));
+    let re = re.with_prefilter(BunkPrefilter::new());
+    assert_eq!(re.find_leftmost(b"za123"), None);
+    // This checks that the prefilter is used when first starting the search,
+    // instead of waiting until at least one transition has occurred.
+    assert_eq!(re.find_leftmost(b"a123"), None);
+    Ok(())
+}
diff --git a/tests/dfa/mod.rs b/tests/dfa/mod.rs

new file mode 100644 (file)

index 0000000..f429951
--- /dev/null
+++ b/tests/dfa/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/tests/dfa/suite.rs b/tests/dfa/suite.rs

new file mode 100644 (file)

index 0000000..426ae34
--- /dev/null
+++ b/tests/dfa/suite.rs
@@ -0,0 +1,280 @@
+use regex_automata::{
+    dfa::{self, dense, regex::Regex, sparse, Automaton},
+    nfa::thompson,
+    MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+    bstr::{BString, ByteSlice},
+    CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+    SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Runs the test suite with the default configuration (an untouched `Regex::builder()`).
+#[test]
+fn unminimized_default() -> Result<()> {
+    let builder = Regex::builder();
+    TestRunner::new()?
+        .test_iter(suite()?.iter(), dense_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Runs the test suite with byte classes disabled.
+#[test]
+fn unminimized_no_byte_class() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.dense(dense::Config::new().byte_classes(false)); // the only non-default knob
+
+    TestRunner::new()?
+        .test_iter(suite()?.iter(), dense_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Runs the test suite with NFA shrinking disabled.
+#[test]
+fn unminimized_no_nfa_shrink() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.thompson(thompson::Config::new().shrink(false)); // the only non-default knob
+
+    TestRunner::new()?
+        .test_iter(suite()?.iter(), dense_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Runs the test suite on a minimized DFA with an otherwise default
+/// configuration, minus whatever the "expensive" blacklist entry excludes.
+#[test]
+fn minimized_default() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.dense(dense::Config::new().minimize(true));
+    TestRunner::new()?
+        // These regexes tend to be too big. Minimization takes... forever.
+        .blacklist("expensive")
+        .test_iter(suite()?.iter(), dense_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Runs the test suite on a minimized DFA with byte classes disabled (same "expensive" blacklist as `minimized_default`).
+#[test]
+fn minimized_no_byte_class() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.dense(dense::Config::new().minimize(true).byte_classes(false));
+
+    TestRunner::new()?
+        // These regexes tend to be too big. Minimization takes... forever.
+        .blacklist("expensive")
+        .test_iter(suite()?.iter(), dense_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Runs the test suite on a sparse unminimized DFA (default builder; `sparse_compiler` does the conversion).
+#[test]
+fn sparse_unminimized_default() -> Result<()> {
+    let builder = Regex::builder();
+    TestRunner::new()?
+        .test_iter(suite()?.iter(), sparse_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Another basic sanity test that checks we can serialize and then deserialize
+/// a regex (as dense DFAs), and that the resulting regex can be used for searching correctly.
+#[test]
+fn serialization_unminimized_default() -> Result<()> {
+    let builder = Regex::builder();
+    let my_compiler = |builder| {
+        compiler(builder, |builder, re| {
+            let builder = builder.clone(); // owned copy for the `move` closure
+            let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
+            let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
+            Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+                let fwd: dense::DFA<&[u32]> = // borrows from `fwd_bytes`
+                    dense::DFA::from_bytes(&fwd_bytes).unwrap().0;
+                let rev: dense::DFA<&[u32]> = // borrows from `rev_bytes`
+                    dense::DFA::from_bytes(&rev_bytes).unwrap().0;
+                let re = builder.build_from_dfas(fwd, rev);
+
+                run_test(&re, test)
+            }))
+        })
+    };
+    TestRunner::new()?
+        .test_iter(suite()?.iter(), my_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// A basic sanity test that checks we can serialize and then deserialize a
+/// regex using sparse DFAs, and that the resulting regex can be used for
+/// searching correctly. Both DFAs are deserialized inside the search closure.
+#[test]
+fn sparse_serialization_unminimized_default() -> Result<()> {
+    let builder = Regex::builder();
+    let my_compiler = |builder| {
+        compiler(builder, |builder, re| {
+            let builder = builder.clone(); // owned copy for the `move` closure
+            let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian();
+            let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian();
+            Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+                let fwd: sparse::DFA<&[u8]> = // borrows from `fwd_bytes`
+                    sparse::DFA::from_bytes(&fwd_bytes).unwrap().0;
+                let rev: sparse::DFA<&[u8]> = // borrows from `rev_bytes`
+                    sparse::DFA::from_bytes(&rev_bytes).unwrap().0;
+                let re = builder.build_from_dfas(fwd, rev);
+                run_test(&re, test)
+            }))
+        })
+    };
+    TestRunner::new()?
+        .test_iter(suite()?.iter(), my_compiler(builder))
+        .assert();
+    Ok(())
+}
+
+fn dense_compiler(
+    builder: dfa::regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+    compiler(builder, |_, re| { // the regex is searched exactly as it was built
+        Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+            run_test(&re, test)
+        }))
+    })
+}
+
+fn sparse_compiler(
+    builder: dfa::regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+    compiler(builder, |builder, re| {
+        let fwd = re.forward().to_sparse()?; // convert both directions to sparse
+        let rev = re.reverse().to_sparse()?;
+        let re = builder.build_from_dfas(fwd, rev); // shadows `re` with the sparse regex
+        Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+            run_test(&re, test)
+        }))
+    })
+}
+
+fn compiler(
+    mut builder: dfa::regex::Builder,
+    mut create_matcher: impl FnMut(
+        &dfa::regex::Builder,
+        Regex,
+    ) -> Result<CompiledRegex>,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+    move |test, regexes| { // the per-test compile step handed to the runner
+        let regexes = regexes
+            .iter()
+            .map(|r| r.to_str().map(|s| s.to_string())) // byte strings -> String
+            .collect::<std::result::Result<Vec<String>, _>>()?;
+
+        // Skip (rather than fail) tests using things that DFAs can't support.
+        // That is, Unicode word boundaries when searching non-ASCII text.
+        let mut thompson = thompson::Builder::new();
+        thompson.configure(config_thompson(test));
+        // TODO: Modify Hir to report facts like this, instead of needing to
+        // build an NFA to do it.
+        if let Ok(nfa) = thompson.build_many(&regexes) { // NFA build errors are ignored here
+            let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
+            if nfa.has_word_boundary_unicode() && non_ascii {
+                return Ok(CompiledRegex::skip());
+            }
+        }
+        if !configure_regex_builder(test, &mut builder) { // false => skip this test
+            return Ok(CompiledRegex::skip());
+        }
+        create_matcher(&builder, builder.build_many(&regexes)?)
+    }
+}
+
+fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> Vec<TestResult> {
+    let is_match = if re.is_match(test.input()) {
+        TestResult::matched()
+    } else {
+        TestResult::no_match()
+    };
+    let is_match = is_match.name("is_match");
+
+    let find_matches = match test.search_kind() {
+        TestSearchKind::Earliest => {
+            let it = re
+                .find_earliest_iter(test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_earliest_iter")
+        }
+        TestSearchKind::Leftmost => {
+            let it = re
+                .find_leftmost_iter(test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_leftmost_iter")
+        }
+        TestSearchKind::Overlapping => {
+            let it = re
+                .find_overlapping_iter(test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_overlapping_iter")
+        }
+    };
+
+    vec![is_match, find_matches]
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_regex_builder(
+    test: &RegexTest,
+    builder: &mut dfa::regex::Builder,
+) -> bool {
+    let match_kind = match test.match_kind() {
+        TestMatchKind::All => MatchKind::All,
+        TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
+        TestMatchKind::LeftmostLongest => return false,
+    };
+
+    let syntax_config = SyntaxConfig::new()
+        .case_insensitive(test.case_insensitive())
+        .unicode(test.unicode())
+        .utf8(test.utf8());
+    let dense_config = dense::Config::new()
+        .anchored(test.anchored())
+        .match_kind(match_kind)
+        .unicode_word_boundary(true);
+    let regex_config = Regex::config().utf8(test.utf8());
+
+    builder
+        .configure(regex_config)
+        .syntax(syntax_config)
+        .thompson(config_thompson(test))
+        .dense(dense_config);
+    true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+    thompson::Config::new().utf8(test.utf8())
+}
diff --git a/tests/hybrid/api.rs b/tests/hybrid/api.rs

new file mode 100644 (file)

index 0000000..9a834db
--- /dev/null
+++ b/tests/hybrid/api.rs
@@ -0,0 +1,195 @@
+use std::error::Error;
+
+use regex_automata::{
+    hybrid::{
+        dfa::{self, DFA},
+        regex::Regex,
+        OverlappingState,
+    },
+    nfa::thompson,
+    HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that too many cache resets cause the lazy DFA to quit.
+//
+// We only test this on 64-bit because the test is gingerly crafted based on
+// implementation details of cache sizes. It's not a great test because of
+// that, but it does check some interesting properties around how positions are
+// reported when a search "gives up."
+#[test]
+#[cfg(target_pointer_width = "64")]
+fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
+    // This is a carefully chosen regex. The idea is to pick one that requires
+    // some decent number of states (hence the bounded repetition). But we
+    // specifically choose to create a class with an ASCII letter and a
+    // non-ASCII letter so that we can check that no new states are created
+    // once the cache is full. Namely, if we fill up the cache on a haystack
+    // of 'a's, then in order to match one 'β', a new state will need to be
+    // created since a 'β' is encoded with multiple bytes. Since there's no
+    // room for this state, the search should quit at the very first position.
+    let pattern = r"[aβ]{100}";
+    let dfa = DFA::builder()
+        .configure(
+            // Configure it so that we have the minimum cache capacity
+            // possible. And that if any resets occur, the search quits.
+            DFA::config()
+                .skip_cache_capacity_check(true)
+                .cache_capacity(0)
+                .minimum_cache_clear_count(Some(0)),
+        )
+        .build(pattern)?;
+    let mut cache = dfa.create_cache();
+
+    // 101 repetitions of 'a': one more than the 100 the pattern asks for.
+    // All three forward search routines share the same cache and are
+    // expected to give up at the same offset.
+    let haystack = "a".repeat(101).into_bytes();
+    let err = MatchError::GaveUp { offset: 25 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
+    assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
+    assert_eq!(
+        dfa.find_overlapping_fwd(
+            &mut cache,
+            &haystack,
+            &mut OverlappingState::start()
+        ),
+        Err(err.clone())
+    );
+
+    // Same cache, but now a haystack made only of the non-ASCII letter.
+    let haystack = "β".repeat(101).into_bytes();
+    let err = MatchError::GaveUp { offset: 0 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+    // no need to test that other find routines quit, since we did that above
+
+    // OK, if we reset the cache, then we should be able to create more states
+    // and make more progress with searching for betas.
+    cache.reset(&dfa);
+    let err = MatchError::GaveUp { offset: 26 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+    // ... switching back to ASCII still makes progress since it just needs to
+    // set transitions on existing states!
+    let haystack = "a".repeat(101).into_bytes();
+    let err = MatchError::GaveUp { offset: 13 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+    Ok(())
+}
+
+// Tests that quit bytes in the forward direction work correctly.
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+    let dfa = DFA::builder()
+        .configure(DFA::config().quit(b'x', true))
+        .build("[[:word:]]+$")?;
+    let mut cache = dfa.create_cache();
+
+    assert_eq!(
+        dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_overlapping_fwd(
+            &mut cache,
+            b"abcxyz",
+            &mut OverlappingState::start()
+        ),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+
+    Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+    let dfa = DFA::builder()
+        .configure(DFA::config().quit(b'x', true))
+        .thompson(thompson::Config::new().reverse(true))
+        .build("^[[:word:]]+")?;
+    let mut cache = dfa.create_cache();
+
+    assert_eq!(
+        dfa.find_earliest_rev(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+
+    Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then the
+// configuration call itself panics. Note that no DFA is built here; only a
+// config value is constructed.
+#[test]
+#[should_panic]
+fn quit_panics() {
+    DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// This tests an interesting case where even if the Unicode word boundary
+// option is disabled, setting all non-ASCII bytes to be quit bytes will cause
+// Unicode word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+    let mut config = DFA::config();
+    // Mark every byte in 0x80..=0xFF as a quit byte. The
+    // `unicode_word_boundary` option is never set on this config.
+    for b in 0x80..=0xFF {
+        config = config.quit(b, true);
+    }
+    // Building a pattern that contains `\b` must succeed with this config.
+    let dfa = DFA::builder().configure(config).build(r"\b")?;
+    let mut cache = dfa.create_cache();
+    let expected = HalfMatch::must(0, 1);
+    assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
+    Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results.
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+    let mut re = Regex::new(r"a[0-9]+").unwrap();
+    re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+    let mut cache = re.create_cache();
+
+    let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+    let matches: Vec<(usize, usize)> = re
+        .find_leftmost_iter(&mut cache, text)
+        .map(|m| (m.start(), m.end()))
+        .collect();
+    assert_eq!(
+        matches,
+        vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
+    );
+    Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+    let text = b"za123";
+    let mut re = Regex::new(r"a[0-9]+").unwrap();
+    let mut cache = re.create_cache();
+
+    re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+    assert_eq!(
+        re.find_leftmost(&mut cache, b"za123"),
+        Some(MultiMatch::must(0, 1, 5))
+    );
+    assert_eq!(
+        re.find_leftmost(&mut cache, b"a123"),
+        Some(MultiMatch::must(0, 0, 4))
+    );
+    re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
+    assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
+    // This checks that the prefilter is used when first starting the search,
+    // instead of waiting until at least one transition has occurred.
+    assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
+    Ok(())
+}
diff --git a/tests/hybrid/mod.rs b/tests/hybrid/mod.rs

new file mode 100644 (file)

index 0000000..f429951
--- /dev/null
+++ b/tests/hybrid/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/tests/hybrid/suite.rs b/tests/hybrid/suite.rs

new file mode 100644 (file)

index 0000000..d60570d
--- /dev/null
+++ b/tests/hybrid/suite.rs
@@ -0,0 +1,212 @@
+use regex_automata::{
+    hybrid::{
+        dfa::DFA,
+        regex::{self, Regex},
+    },
+    nfa::thompson,
+    MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+    bstr::{BString, ByteSlice},
+    CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+    SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Tests the default configuration of the hybrid NFA/DFA.
+#[test]
+fn default() -> Result<()> {
+    let builder = Regex::builder();
+    TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+    Ok(())
+}
+
+/// Tests the hybrid NFA/DFA with NFA shrinking disabled.
+///
+/// This is actually the typical configuration one wants for a lazy DFA. NFA
+/// shrinking is mostly only advantageous when building a full DFA since it
+/// can sharply decrease the amount of time determinization takes. But NFA
+/// shrinking is itself otherwise fairly expensive. Since a lazy DFA has
+/// no compilation time (other than for building the NFA of course) before
+/// executing a search, it's usually worth it to forgo NFA shrinking.
+#[test]
+fn no_nfa_shrink() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.thompson(thompson::Config::new().shrink(false));
+    TestRunner::new()?
+        // Without NFA shrinking, this test blows the default cache capacity.
+        .blacklist("expensive/regression-many-repeat-no-stack-overflow")
+        .test_iter(suite()?.iter(), compiler(builder))
+        .assert();
+    Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled.
+#[test]
+fn starts_for_each_pattern() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.dfa(DFA::config().starts_for_each_pattern(true));
+    TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+    Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when byte classes are disabled.
+///
+/// N.B. Disabling byte classes doesn't avoid any indirection at search time.
+/// All it does is cause every byte value to be its own distinct equivalence
+/// class.
+#[test]
+fn no_byte_classes() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.dfa(DFA::config().byte_classes(false));
+    TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+    Ok(())
+}
+
+/// Tests that hybrid NFA/DFA never clears its cache for any test with the
+/// default capacity.
+///
+/// N.B. If a regex suite test is added that causes the cache to be cleared,
+/// then this should just skip that test. (Which can be done by calling the
+/// 'blacklist' method on 'TestRunner'.)
+#[test]
+fn no_cache_clearing() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder.dfa(DFA::config().minimum_cache_clear_count(Some(0)));
+    TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+    Ok(())
+}
+
+/// Tests the hybrid NFA/DFA when the minimum cache capacity is set.
+#[test]
+fn min_cache_capacity() -> Result<()> {
+    let mut builder = Regex::builder();
+    builder
+        .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true));
+    TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+    Ok(())
+}
+
+fn compiler(
+    mut builder: regex::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+    move |test, regexes| {
+        let regexes = regexes
+            .iter()
+            .map(|r| r.to_str().map(|s| s.to_string()))
+            .collect::<std::result::Result<Vec<String>, _>>()?;
+
+        // Check if our regex contains things that aren't supported by DFAs.
+        // That is, Unicode word boundaries when searching non-ASCII text.
+        let mut thompson = thompson::Builder::new();
+        thompson.syntax(config_syntax(test)).configure(config_thompson(test));
+        if let Ok(nfa) = thompson.build_many(&regexes) {
+            let non_ascii = test.input().iter().any(|&b| !b.is_ascii());
+            if nfa.has_word_boundary_unicode() && non_ascii {
+                return Ok(CompiledRegex::skip());
+            }
+        }
+        if !configure_regex_builder(test, &mut builder) {
+            return Ok(CompiledRegex::skip());
+        }
+        let re = builder.build_many(&regexes)?;
+        let mut cache = re.create_cache();
+        Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+            run_test(&re, &mut cache, test)
+        }))
+    }
+}
+
+fn run_test(
+    re: &Regex,
+    cache: &mut regex::Cache,
+    test: &RegexTest,
+) -> Vec<TestResult> {
+    let is_match = if re.is_match(cache, test.input()) {
+        TestResult::matched()
+    } else {
+        TestResult::no_match()
+    };
+    let is_match = is_match.name("is_match");
+
+    let find_matches = match test.search_kind() {
+        TestSearchKind::Earliest => {
+            let it = re
+                .find_earliest_iter(cache, test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_earliest_iter")
+        }
+        TestSearchKind::Leftmost => {
+            let it = re
+                .find_leftmost_iter(cache, test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_leftmost_iter")
+        }
+        TestSearchKind::Overlapping => {
+            let it = re
+                .find_overlapping_iter(cache, test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_overlapping_iter")
+        }
+    };
+    vec![is_match, find_matches]
+}
+
+/// Configures the given regex builder with all relevant settings on the given
+/// regex test.
+///
+/// If the regex test has a setting that is unsupported, then this returns
+/// false (implying the test should be skipped).
+fn configure_regex_builder(
+    test: &RegexTest,
+    builder: &mut regex::Builder,
+) -> bool {
+    let match_kind = match test.match_kind() {
+        TestMatchKind::All => MatchKind::All,
+        TestMatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
+        TestMatchKind::LeftmostLongest => return false,
+    };
+
+    let dense_config = DFA::config()
+        .anchored(test.anchored())
+        .match_kind(match_kind)
+        .unicode_word_boundary(true);
+    let regex_config = Regex::config().utf8(test.utf8());
+    builder
+        .configure(regex_config)
+        .syntax(config_syntax(test))
+        .thompson(config_thompson(test))
+        .dfa(dense_config);
+    true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+    thompson::Config::new().utf8(test.utf8())
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> SyntaxConfig {
+    SyntaxConfig::new()
+        .case_insensitive(test.case_insensitive())
+        .unicode(test.unicode())
+        .utf8(test.utf8())
+}
diff --git a/tests/nfa/mod.rs b/tests/nfa/mod.rs

new file mode 100644 (file)

index 0000000..3268621
--- /dev/null
+++ b/tests/nfa/mod.rs
@@ -0,0 +1 @@
+mod thompson;
diff --git a/tests/nfa/thompson/mod.rs b/tests/nfa/thompson/mod.rs

new file mode 100644 (file)

index 0000000..3a03f52
--- /dev/null
+++ b/tests/nfa/thompson/mod.rs
@@ -0,0 +1 @@
+mod pikevm;
diff --git a/tests/nfa/thompson/pikevm/api.rs b/tests/nfa/thompson/pikevm/api.rs

new file mode 100644 (file)

index 0000000..c8199f7
--- /dev/null
+++ b/tests/nfa/thompson/pikevm/api.rs
@@ -0,0 +1,191 @@
+/*
+use std::error::Error;
+
+use regex_automata::{
+    hybrid::{
+        dfa::{self, DFA},
+        regex::Regex,
+        OverlappingState,
+    },
+    nfa::thompson,
+    HalfMatch, MatchError, MatchKind, MultiMatch,
+};
+
+use crate::util::{BunkPrefilter, SubstringPrefilter};
+
+// Tests that too many cache resets cause the lazy DFA to quit.
+#[test]
+fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
+    // This is a carefully chosen regex. The idea is to pick one that requires
+    // some decent number of states (hence the bounded repetition). But we
+    // specifically choose to create a class with an ASCII letter and a
+    // non-ASCII letter so that we can check that no new states are created
+    // once the cache is full. Namely, if we fill up the cache on a haystack
+    // of 'a's, then in order to match one 'β', a new state will need to be
+    // created since a 'β' is encoded with multiple bytes. Since there's no
+    // room for this state, the search should quit at the very first position.
+    let pattern = r"[aβ]{100}";
+    let dfa = DFA::builder()
+        .configure(
+            // Configure it so that we have the minimum cache capacity
+            // possible. And that if any resets occur, the search quits.
+            DFA::config()
+                .skip_cache_capacity_check(true)
+                .cache_capacity(0)
+                .minimum_cache_clear_count(Some(0)),
+        )
+        .build(pattern)?;
+    let mut cache = dfa.create_cache();
+
+    let haystack = "a".repeat(101).into_bytes();
+    let err = MatchError::GaveUp { offset: 25 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
+    assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
+    assert_eq!(
+        dfa.find_overlapping_fwd(
+            &mut cache,
+            &haystack,
+            &mut OverlappingState::start()
+        ),
+        Err(err.clone())
+    );
+
+    let haystack = "β".repeat(101).into_bytes();
+    let err = MatchError::GaveUp { offset: 0 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+    // no need to test that other find routines quit, since we did that above
+
+    // OK, if we reset the cache, then we should be able to create more states
+    // and make more progress with searching for betas.
+    cache.reset(&dfa);
+    let err = MatchError::GaveUp { offset: 26 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+    // ... switching back to ASCII still makes progress since it just needs to
+    // set transitions on existing states!
+    let haystack = "a".repeat(101).into_bytes();
+    let err = MatchError::GaveUp { offset: 13 };
+    assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
+
+    Ok(())
+}
+
+// Tests that quit bytes in the forward direction work correctly.
+#[test]
+fn quit_fwd() -> Result<(), Box<dyn Error>> {
+    let dfa = DFA::builder()
+        .configure(DFA::config().quit(b'x', true))
+        .build("[[:word:]]+$")?;
+    let mut cache = dfa.create_cache();
+
+    assert_eq!(
+        dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_overlapping_fwd(
+            &mut cache,
+            b"abcxyz",
+            &mut OverlappingState::start()
+        ),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+
+    Ok(())
+}
+
+// Tests that quit bytes in the reverse direction work correctly.
+#[test]
+fn quit_rev() -> Result<(), Box<dyn Error>> {
+    let dfa = DFA::builder()
+        .configure(DFA::config().quit(b'x', true))
+        .thompson(thompson::Config::new().reverse(true))
+        .build("^[[:word:]]+")?;
+    let mut cache = dfa.create_cache();
+
+    assert_eq!(
+        dfa.find_earliest_rev(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+    assert_eq!(
+        dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
+        Err(MatchError::Quit { byte: b'x', offset: 3 })
+    );
+
+    Ok(())
+}
+
+// Tests that if we heuristically enable Unicode word boundaries but then
+// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
+// will panic.
+#[test]
+#[should_panic]
+fn quit_panics() {
+    DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
+}
+
+// This tests an interesting case where even if the Unicode word boundary option
+// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
+// word boundaries to be enabled.
+#[test]
+fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
+    let mut config = DFA::config();
+    for b in 0x80..=0xFF {
+        config = config.quit(b, true);
+    }
+    let dfa = DFA::builder().configure(config).build(r"\b")?;
+    let mut cache = dfa.create_cache();
+    let expected = HalfMatch::must(0, 1);
+    assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
+    Ok(())
+}
+
+// Tests that we can provide a prefilter to a Regex, and the search reports
+// correct results.
+#[test]
+fn prefilter_works() -> Result<(), Box<dyn Error>> {
+    let mut re = Regex::new(r"a[0-9]+").unwrap();
+    re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+    let mut cache = re.create_cache();
+
+    let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
+    let matches: Vec<(usize, usize)> = re
+        .find_leftmost_iter(&mut cache, text)
+        .map(|m| (m.start(), m.end()))
+        .collect();
+    assert_eq!(
+        matches,
+        vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
+    );
+    Ok(())
+}
+
+// This test confirms that a prefilter is active by using a prefilter that
+// reports false negatives.
+#[test]
+fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
+    let text = b"za123";
+    let mut re = Regex::new(r"a[0-9]+").unwrap();
+    let mut cache = re.create_cache();
+
+    re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
+    assert_eq!(
+        re.find_leftmost(&mut cache, b"za123"),
+        Some(MultiMatch::must(0, 1, 5))
+    );
+    assert_eq!(
+        re.find_leftmost(&mut cache, b"a123"),
+        Some(MultiMatch::must(0, 0, 4))
+    );
+    re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
+    assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
+    // This checks that the prefilter is used when first starting the search,
+    // instead of waiting until at least one transition has occurred.
+    assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
+    Ok(())
+}
+*/
diff --git a/tests/nfa/thompson/pikevm/mod.rs b/tests/nfa/thompson/pikevm/mod.rs

new file mode 100644 (file)

index 0000000..f429951
--- /dev/null
+++ b/tests/nfa/thompson/pikevm/mod.rs
@@ -0,0 +1,2 @@
+mod api;
+mod suite;
diff --git a/tests/nfa/thompson/pikevm/suite.rs b/tests/nfa/thompson/pikevm/suite.rs

new file mode 100644 (file)

index 0000000..e5505d5
--- /dev/null
+++ b/tests/nfa/thompson/pikevm/suite.rs
@@ -0,0 +1,109 @@
+use regex_automata::{
+    nfa::thompson::{
+        self,
+        pikevm::{self, PikeVM},
+    },
+    MatchKind, SyntaxConfig,
+};
+use regex_syntax as syntax;
+
+use regex_test::{
+    bstr::{BString, ByteSlice},
+    CompiledRegex, Match, MatchKind as TestMatchKind, RegexTest, RegexTests,
+    SearchKind as TestSearchKind, TestResult, TestRunner,
+};
+
+use crate::{suite, Result};
+
+/// Tests the default configuration of the PikeVM.
+#[test]
+fn default() -> Result<()> {
+    let builder = PikeVM::builder();
+    TestRunner::new()?.test_iter(suite()?.iter(), compiler(builder)).assert();
+    Ok(())
+}
+
+fn compiler(
+    mut builder: pikevm::Builder,
+) -> impl FnMut(&RegexTest, &[BString]) -> Result<CompiledRegex> {
+    move |test, regexes| {
+        let regexes = regexes
+            .iter()
+            .map(|r| r.to_str().map(|s| s.to_string()))
+            .collect::<std::result::Result<Vec<String>, _>>()?;
+        if !configure_pikevm_builder(test, &mut builder) {
+            return Ok(CompiledRegex::skip());
+        }
+        let re = builder.build_many(&regexes)?;
+        let mut cache = re.create_cache();
+        Ok(CompiledRegex::compiled(move |test| -> Vec<TestResult> {
+            run_test(&re, &mut cache, test)
+        }))
+    }
+}
+
+/// Runs one regex test against the PikeVM.
+///
+/// Only the leftmost search kind is exercised: earliest and overlapping
+/// tests are reported as skipped, and the `is_match` check is commented out,
+/// so the returned vector holds a single result.
+fn run_test(
+    re: &PikeVM,
+    cache: &mut pikevm::Cache,
+    test: &RegexTest,
+) -> Vec<TestResult> {
+    // let is_match = if re.is_match(cache, test.input()) {
+    // TestResult::matched()
+    // } else {
+    // TestResult::no_match()
+    // };
+    // let is_match = is_match.name("is_match");
+
+    let find_matches = match test.search_kind() {
+        TestSearchKind::Earliest => {
+            TestResult::skip().name("find_earliest_iter")
+        }
+        TestSearchKind::Leftmost => {
+            let it = re
+                .find_leftmost_iter(cache, test.input())
+                .take(test.match_limit().unwrap_or(std::usize::MAX))
+                .map(|m| Match {
+                    id: m.pattern().as_usize(),
+                    start: m.start(),
+                    end: m.end(),
+                });
+            TestResult::matches(it).name("find_leftmost_iter")
+        }
+        TestSearchKind::Overlapping => {
+            TestResult::skip().name("find_overlapping_iter")
+        }
+    };
+    // vec![is_match, find_matches]
+    vec![find_matches]
+}
+
+/// Configures the given PikeVM builder with all relevant settings on the
+/// given regex test.
+///
+/// The boolean return mirrors the other suites, where false means the test
+/// has an unsupported setting and should be skipped. As written, no setting
+/// is treated as unsupported here, so this always returns true.
+fn configure_pikevm_builder(
+    test: &RegexTest,
+    builder: &mut pikevm::Builder,
+) -> bool {
+    let pikevm_config =
+        PikeVM::config().anchored(test.anchored()).utf8(test.utf8());
+    builder
+        .configure(pikevm_config)
+        .syntax(config_syntax(test))
+        .thompson(config_thompson(test));
+    true
+}
+
+/// Configuration of a Thompson NFA compiler from a regex test.
+fn config_thompson(test: &RegexTest) -> thompson::Config {
+    thompson::Config::new().utf8(test.utf8())
+}
+
+/// Configuration of the regex parser from a regex test.
+fn config_syntax(test: &RegexTest) -> SyntaxConfig {
+    SyntaxConfig::new()
+        .case_insensitive(test.case_insensitive())
+        .unicode(test.unicode())
+        .utf8(test.utf8())
+}
diff --git a/tests/regression.rs b/tests/regression.rs

new file mode 100644 (file)

index 0000000..e5355fe
--- /dev/null
+++ b/tests/regression.rs
@@ -0,0 +1,44 @@
+use regex_automata::{
+    dfa::{dense, Automaton},
+    MatchError,
+};
+
+// A regression test for checking that minimization correctly translates
+// whether a state is a match state or not. Previously, it was possible for
+// minimization to mark a non-matching state as matching.
+#[test]
+fn minimize_sets_correct_match_states() {
+    let pattern =
+        // This is a subset of the grapheme matching regex. I couldn't seem
+        // to get a repro any smaller than this unfortunately.
+        r"(?x)
+            (?:
+                \p{gcb=Prepend}*
+                (?:
+                    (?:
+                        (?:
+                            \p{gcb=L}*
+                            (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT})
+                            \p{gcb=T}*
+                        )
+                        |
+                        \p{gcb=L}+
+                        |
+                        \p{gcb=T}+
+                    )
+                    |
+                    \p{Extended_Pictographic}
+                    (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})*
+                    |
+                    [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
+                )
+                [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]*
+            )
+        ";
+
+    let dfa = dense::Builder::new()
+        .configure(dense::Config::new().anchored(true).minimize(true))
+        .build(pattern)
+        .unwrap();
+    assert_eq!(Ok(None), dfa.find_leftmost_fwd(b"\xE2"));
+}
diff --git a/tests/tests.rs b/tests/tests.rs

new file mode 100644 (file)

index 0000000..e472847
--- /dev/null
+++ b/tests/tests.rs
@@ -0,0 +1,44 @@
+#![allow(warnings)]
+
+use regex_test::RegexTests;
+
+mod dfa;
+mod hybrid;
+mod nfa;
+mod regression;
+mod util;
+
+type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
+
+fn suite() -> Result<RegexTests> {
+    let mut tests = RegexTests::new();
+    macro_rules! load {
+        ($name:expr) => {{
+            const DATA: &[u8] =
+                include_bytes!(concat!("data/", $name, ".toml"));
+            tests.load_slice($name, DATA)?;
+        }};
+    }
+
+    load!("bytes");
+    load!("crazy");
+    load!("earliest");
+    load!("empty");
+    load!("expensive");
+    load!("flags");
+    load!("iter");
+    load!("misc");
+    load!("multiline");
+    load!("no-unicode");
+    load!("overlapping");
+    load!("regression");
+    load!("set");
+    load!("unicode");
+    load!("word-boundary");
+    load!("fowler/basic");
+    load!("fowler/nullsubexpr");
+    load!("fowler/repetition");
+    load!("fowler/repetition-expensive");
+
+    Ok(tests)
+}
diff --git a/tests/util.rs b/tests/util.rs

new file mode 100644 (file)

index 0000000..499aa8c
--- /dev/null
+++ b/tests/util.rs
@@ -0,0 +1,57 @@
+use regex_automata::util::prefilter::{self, Candidate, Prefilter};
+
+#[derive(Clone, Debug)]
+pub struct SubstringPrefilter(bstr::Finder<'static>);
+
+impl SubstringPrefilter {
+    pub fn new<B: AsRef<[u8]>>(needle: B) -> SubstringPrefilter {
+        SubstringPrefilter(bstr::Finder::new(needle.as_ref()).into_owned())
+    }
+}
+
+impl Prefilter for SubstringPrefilter {
+    #[inline]
+    fn next_candidate(
+        &self,
+        state: &mut prefilter::State,
+        haystack: &[u8],
+        at: usize,
+    ) -> Candidate {
+        self.0
+            .find(&haystack[at..])
+            .map(|i| Candidate::PossibleStartOfMatch(at + i))
+            .unwrap_or(Candidate::None)
+    }
+
+    fn heap_bytes(&self) -> usize {
+        self.0.needle().len()
+    }
+}
+
+/// A prefilter that always returns `Candidate::None`, even if it's a false
+/// negative. This is useful for confirming that a prefilter is actually
+/// active by asserting an incorrect result.
+#[derive(Clone, Debug)]
+pub struct BunkPrefilter(());
+
+impl BunkPrefilter {
+    pub fn new() -> BunkPrefilter {
+        BunkPrefilter(())
+    }
+}
+
+impl Prefilter for BunkPrefilter {
+    #[inline]
+    fn next_candidate(
+        &self,
+        _state: &mut prefilter::State,
+        _haystack: &[u8],
+        _at: usize,
+    ) -> Candidate {
+        Candidate::None
+    }
+
+    fn heap_bytes(&self) -> usize {
+        0
+    }
+}
author	Woohyun Jung <wh0705.jung@samsung.com>
	Tue, 14 Mar 2023 03:57:19 +0000 (12:57 +0900)
committer	Woohyun Jung <wh0705.jung@samsung.com>
	Tue, 14 Mar 2023 03:57:19 +0000 (12:57 +0900)
.cargo_vcs_info.json	[new file with mode: 0644]	patch \| blob
.gitignore	[new file with mode: 0644]	patch \| blob
COPYING	[new file with mode: 0644]	patch \| blob
Cargo.toml	[new file with mode: 0644]	patch \| blob
Cargo.toml.orig	[new file with mode: 0644]	patch \| blob
LICENSE-MIT	[new file with mode: 0644]	patch \| blob
PLANS.md	[new file with mode: 0644]	patch \| blob
README.md	[new file with mode: 0644]	patch \| blob
TODO	[new file with mode: 0644]	patch \| blob
UNLICENSE	[new file with mode: 0644]	patch \| blob
rustfmt.toml	[new file with mode: 0644]	patch \| blob
src/dfa/accel.rs	[new file with mode: 0644]	patch \| blob
src/dfa/automaton.rs	[new file with mode: 0644]	patch \| blob
src/dfa/dense.rs	[new file with mode: 0644]	patch \| blob
src/dfa/determinize.rs	[new file with mode: 0644]	patch \| blob
src/dfa/error.rs	[new file with mode: 0644]	patch \| blob
src/dfa/minimize.rs	[new file with mode: 0644]	patch \| blob
src/dfa/mod.rs	[new file with mode: 0644]	patch \| blob
src/dfa/regex.rs	[new file with mode: 0644]	patch \| blob
src/dfa/search.rs	[new file with mode: 0644]	patch \| blob
src/dfa/search_unsafe.rs	[new file with mode: 0644]	patch \| blob
src/dfa/sparse.rs	[new file with mode: 0644]	patch \| blob
src/dfa/special.rs	[new file with mode: 0644]	patch \| blob
src/dfa/transducer.rs	[new file with mode: 0644]	patch \| blob
src/hybrid/dfa.rs	[new file with mode: 0644]	patch \| blob
src/hybrid/error.rs	[new file with mode: 0644]	patch \| blob
src/hybrid/id.rs	[new file with mode: 0644]	patch \| blob
src/hybrid/mod.rs	[new file with mode: 0644]	patch \| blob
src/hybrid/regex.rs	[new file with mode: 0644]	patch \| blob
src/hybrid/search.rs	[new file with mode: 0644]	patch \| blob
src/lib.rs	[new file with mode: 0644]	patch \| blob
src/macros.rs	[new file with mode: 0644]	patch \| blob
src/nfa/mod.rs	[new file with mode: 0644]	patch \| blob
src/nfa/thompson/compiler.rs	[new file with mode: 0644]	patch \| blob
src/nfa/thompson/error.rs	[new file with mode: 0644]	patch \| blob
src/nfa/thompson/map.rs	[new file with mode: 0644]	patch \| blob
src/nfa/thompson/mod.rs	[new file with mode: 0644]	patch \| blob
src/nfa/thompson/pikevm.rs	[new file with mode: 0644]	patch \| blob
src/nfa/thompson/range_trie.rs	[new file with mode: 0644]	patch \| blob
src/util/alphabet.rs	[new file with mode: 0644]	patch \| blob
src/util/bytes.rs	[new file with mode: 0644]	patch \| blob
src/util/determinize/mod.rs	[new file with mode: 0644]	patch \| blob
src/util/determinize/state.rs	[new file with mode: 0644]	patch \| blob
src/util/id.rs	[new file with mode: 0644]	patch \| blob
src/util/lazy.rs	[new file with mode: 0644]	patch \| blob
src/util/matchtypes.rs	[new file with mode: 0644]	patch \| blob
src/util/mod.rs	[new file with mode: 0644]	patch \| blob
src/util/prefilter.rs	[new file with mode: 0644]	patch \| blob
src/util/sparse_set.rs	[new file with mode: 0644]	patch \| blob
src/util/start.rs	[new file with mode: 0644]	patch \| blob
src/util/syntax.rs	[new file with mode: 0644]	patch \| blob
tests/data/bytes.toml	[new file with mode: 0644]	patch \| blob
tests/data/crazy.toml	[new file with mode: 0644]	patch \| blob
tests/data/earliest.toml	[new file with mode: 0644]	patch \| blob
tests/data/empty.toml	[new file with mode: 0644]	patch \| blob
tests/data/expensive.toml	[new file with mode: 0644]	patch \| blob
tests/data/flags.toml	[new file with mode: 0644]	patch \| blob
tests/data/fowler/basic.toml	[new file with mode: 0644]	patch \| blob
tests/data/fowler/dat/README	[new file with mode: 0644]	patch \| blob
tests/data/fowler/dat/basic.dat	[new file with mode: 0644]	patch \| blob
tests/data/fowler/dat/nullsubexpr.dat	[new file with mode: 0644]	patch \| blob
tests/data/fowler/dat/repetition-expensive.dat	[new file with mode: 0644]	patch \| blob
tests/data/fowler/dat/repetition.dat	[new file with mode: 0644]	patch \| blob
tests/data/fowler/nullsubexpr.toml	[new file with mode: 0644]	patch \| blob
tests/data/fowler/repetition-expensive.toml	[new file with mode: 0644]	patch \| blob
tests/data/fowler/repetition-long.toml	[new file with mode: 0644]	patch \| blob
tests/data/fowler/repetition.toml	[new file with mode: 0644]	patch \| blob
tests/data/iter.toml	[new file with mode: 0644]	patch \| blob
tests/data/misc.toml	[new file with mode: 0644]	patch \| blob
tests/data/multiline.toml	[new file with mode: 0644]	patch \| blob
tests/data/no-unicode.toml	[new file with mode: 0644]	patch \| blob
tests/data/overlapping.toml	[new file with mode: 0644]	patch \| blob
tests/data/regression.toml	[new file with mode: 0644]	patch \| blob
tests/data/set.toml	[new file with mode: 0644]	patch \| blob
tests/data/unicode.toml	[new file with mode: 0644]	patch \| blob
tests/data/word-boundary.toml	[new file with mode: 0644]	patch \| blob
tests/dfa/api.rs	[new file with mode: 0644]	patch \| blob
tests/dfa/mod.rs	[new file with mode: 0644]	patch \| blob
tests/dfa/suite.rs	[new file with mode: 0644]	patch \| blob
tests/hybrid/api.rs	[new file with mode: 0644]	patch \| blob
tests/hybrid/mod.rs	[new file with mode: 0644]	patch \| blob
tests/hybrid/suite.rs	[new file with mode: 0644]	patch \| blob
tests/nfa/mod.rs	[new file with mode: 0644]	patch \| blob
tests/nfa/thompson/mod.rs	[new file with mode: 0644]	patch \| blob
tests/nfa/thompson/pikevm/api.rs	[new file with mode: 0644]	patch \| blob
tests/nfa/thompson/pikevm/mod.rs	[new file with mode: 0644]	patch \| blob
tests/nfa/thompson/pikevm/suite.rs	[new file with mode: 0644]	patch \| blob
tests/regression.rs	[new file with mode: 0644]	patch \| blob
tests/tests.rs	[new file with mode: 0644]	patch \| blob
tests/util.rs	[new file with mode: 0644]	patch \| blob