1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Tested by search_test.cc.
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
28 #include "re2/regexp.h"
29 #include "util/sparse_array.h"
30 #include "util/sparse_set.h"
39 // Searches for a matching string.
40 // * If anchored is true, only considers matches starting at offset.
41 // Otherwise finds leftmost match at or after offset.
42 // * If longest is true, returns the longest match starting
43 // at the chosen start point. Otherwise returns the so-called
44 // left-biased match, the one traditional backtracking engines
45 // (like Perl and PCRE) find.
46 // Records submatch boundaries in submatch[1..nsubmatch-1].
47 // Submatch[0] is the entire match. When there is a choice in
48 // which text matches each subexpression, the submatch boundaries
49 // are chosen to match what a backtracking implementation would choose.
50 bool Search(const StringPiece& text, const StringPiece& context,
51 bool anchored, bool longest,
52 StringPiece* submatch, int nsubmatch);
54 static const int Debug = 0;
60 Thread* next; // when on free list
65 // State for explicit stack in AddToThreadq.
67 int id; // Inst to process
69 const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
72 : id(0), j(-1), cap_j(NULL) {}
73 explicit AddState(int id)
74 : id(id), j(-1), cap_j(NULL) {}
75 AddState(int id, const char* cap_j, int j)
76 : id(id), j(j), cap_j(cap_j) {}
79 // Threadq is a list of threads. The list is sorted by the order
80 // in which Perl would explore that particular state -- the earlier
81 // choices appear earlier in the list.
82 typedef SparseArray<Thread*> Threadq;
84 inline Thread* AllocThread();
85 inline void FreeThread(Thread*);
87 // Add id (or its children, following unlabeled arrows)
88 // to the workqueue q with associated capture info.
89 void AddToThreadq(Threadq* q, int id, int flag,
90 const char* p, const char** capture);
92 // Run runq on byte c, appending new states to nextq.
93 // Updates matched_ and match_ as new, better matches are found.
94 // p is the position of byte c in the input string,
95 // used when processing capturing parens.
96 // flag is the bitwise or of Bol, Eol, etc., specifying whether
97 // ^, $ and \b match the current input point (after c).
98 inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
100 // Returns text version of capture information, for debugging.
101 string FormatCapture(const char** capture);
103 inline void CopyCapture(const char** dst, const char** src);
105 // Computes whether all matches must begin with the same first
106 // byte, and if so, returns that byte. If not, returns -1.
107 int ComputeFirstByte();
109 Prog* prog_; // underlying program
110 int start_; // start instruction in program
111 int ncapture_; // number of capture pointers tracked (two per submatch)
112 bool longest_; // whether searching for longest match
113 bool endmatch_; // whether match must end at text.end()
114 const char* btext_; // beginning of context; base for offsets printed by FormatCapture
115 const char* etext_; // end of text being matched (for endmatch_)
116 Threadq q0_, q1_; // pre-allocated for Search.
117 const char** match_; // best match so far
118 bool matched_; // any match so far?
119 AddState* astack_; // pre-allocated for AddToThreadq (nastack_ entries)
121 int first_byte_; // required first byte for match, or -1 if none
123 Thread* free_threads_; // free list
125 DISALLOW_EVIL_CONSTRUCTORS(NFA);
128 NFA::NFA(Prog* prog) {
130 start_ = prog->start();
136 q0_.resize(prog_->size());
137 q1_.resize(prog_->size());
138 nastack_ = 2*prog_->size();
139 astack_ = new AddState[nastack_];
142 free_threads_ = NULL;
143 first_byte_ = ComputeFirstByte();
150 for (Thread* t = free_threads_; t; t = next) {
157 void NFA::FreeThread(Thread *t) {
160 t->next = free_threads_;
164 NFA::Thread* NFA::AllocThread() {
165 Thread* t = free_threads_;
168 t->capture = new const char*[ncapture_];
171 free_threads_ = t->next;
175 void NFA::CopyCapture(const char** dst, const char** src) {
176 for (int i = 0; i < ncapture_; i+=2) {
182 // Follows all empty arrows from id0 and enqueues all the states reached.
183 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
184 // The pointer p is the current input position, and capture is the
185 // current set of match boundaries.
186 void NFA::AddToThreadq(Threadq* q, int id0, int flag,
187 const char* p, const char** capture) {
191 // Astack_ is pre-allocated to avoid resize operations.
192 // It has room for 2*prog_->size() entries, which is enough:
193 // Each inst in prog can be processed at most once,
194 // pushing at most two entries on stk.
197 AddState* stk = astack_;
198 stk[nstk++] = AddState(id0);
201 DCHECK_LE(nstk, nastack_);
202 const AddState& a = stk[--nstk];
204 capture[a.j] = a.cap_j;
209 if (q->has_index(id)) {
211 fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
215 // Create entry in q no matter what. We might fill it in below,
216 // or we might not. Even if not, it is necessary to have it,
217 // so that we don't revisit id later while working through the stack.
218 q->set_new(id, NULL);
220 Thread** tp = &q->find(id)->second;
223 Prog::Inst* ip = prog_->inst(id);
224 switch (ip->opcode()) {
226 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
233 // Save state; will pick up at next byte.
236 CopyCapture(t->capture, capture);
241 // Explore alternatives.
242 stk[nstk++] = AddState(ip->out1());
243 stk[nstk++] = AddState(ip->out());
248 stk[nstk++] = AddState(ip->out());
252 if ((j=ip->cap()) < ncapture_) {
253 // Push a dummy whose only job is to restore capture[j]
254 // once we finish exploring this possibility.
255 stk[nstk++] = AddState(0, capture[j], j);
260 stk[nstk++] = AddState(ip->out());
265 // Save state; will pick up at next byte.
268 CopyCapture(t->capture, capture);
271 fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
274 case kInstEmptyWidth:
275 // Continue on if we have all the right flag bits.
276 if (ip->empty() & ~flag)
278 stk[nstk++] = AddState(ip->out());
284 // Run runq on byte c, appending new states to nextq.
285 // Updates match as new, better matches are found.
286 // p is position of the byte c in the input string,
287 // used when processing capturing parens.
288 // flag is the bitwise or of Bol, Eol, etc., specifying whether
289 // ^, $ and \b match the current input point (after c).
290 // Frees all the threads on runq.
291 // If there is a shortcut to the end, returns that shortcut.
292 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
295 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
296 Thread* t = i->second;
301 // Can skip any threads started after our current best match.
302 if (matched_ && match_[0] < t->capture[0]) {
309 Prog::Inst* ip = prog_->inst(id);
311 switch (ip->opcode()) {
313 // Should only see the values handled below.
314 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
319 AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
323 if (i != runq->begin())
325 // The match is ours if we want it.
326 if (ip->greedy(prog_) || longest_) {
327 CopyCapture((const char**)match_, t->capture);
329 for (++i; i != runq->end(); ++i)
330 FreeThread(i->second);
333 if (ip->greedy(prog_))
340 if (endmatch_ && p != etext_)
343 const char* old = t->capture[1]; // previous end pointer
346 // Leftmost-longest mode: save this match only if
347 // it is either farther to the left or at the same
348 // point but longer than an existing match.
349 if (!matched_ || t->capture[0] < match_[0] ||
350 (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
351 CopyCapture((const char**)match_, t->capture);
353 // Leftmost-biased mode: this match is by definition
354 // better than what we've already found (see next line).
355 CopyCapture((const char**)match_, t->capture);
357 // Cut off the threads that can only find matches
358 // worse than the one we just found: don't run the
359 // rest of the current Threadq.
362 for (++i; i != runq->end(); ++i)
363 FreeThread(i->second);
378 string NFA::FormatCapture(const char** capture) {
381 for (int i = 0; i < ncapture_; i+=2) {
382 if (capture[i] == NULL)
383 StringAppendF(&s, "(?,?)");
384 else if (capture[i+1] == NULL)
385 StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
387 StringAppendF(&s, "(%d,%d)",
388 (int)(capture[i] - btext_),
389 (int)(capture[i+1] - btext_));
394 // Returns whether needle's memory range [begin, end) lies entirely within haystack's (pointer comparison only; contents are not examined).
395 static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
396 return haystack.begin() <= needle.begin() &&
397 haystack.end() >= needle.end();
400 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
401 bool anchored, bool longest,
402 StringPiece* submatch, int nsubmatch) {
406 StringPiece context = const_context;
407 if (context.begin() == NULL)
410 if (!StringPieceContains(context, text)) {
411 LOG(FATAL) << "Bad args: context does not contain text "
412 << reinterpret_cast<const void*>(context.begin())
413 << "+" << context.size() << " "
414 << reinterpret_cast<const void*>(text.begin())
415 << "+" << text.size();
419 if (prog_->anchor_start() && context.begin() != text.begin())
421 if (prog_->anchor_end() && context.end() != text.end())
423 anchored |= prog_->anchor_start();
424 if (prog_->anchor_end()) {
431 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
435 // Save search parameters.
436 ncapture_ = 2*nsubmatch;
439 if (nsubmatch == 0) {
440 // We need to maintain match[0], both to distinguish the
441 // longest match (if longest is true) and also to tell
442 // whether we've seen any matches at all.
446 match_ = new const char*[ncapture_];
448 memset(match_, 0, ncapture_*sizeof match_[0]);
450 // For debugging prints.
451 btext_ = context.begin();
454 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
455 text.as_string().c_str(), context.as_string().c_str(), anchored,
460 Threadq* runq = &q0_;
461 Threadq* nextq = &q1_;
464 memset(&match_[0], 0, ncapture_*sizeof match_[0]);
465 const char* bp = context.begin();
469 if (text.begin() > context.begin()) {
470 c = text.begin()[-1] & 0xFF;
471 wasword = Prog::IsWordChar(c);
474 // Loop over the text, stepping the machine.
475 for (const char* p = text.begin();; p++) {
476 // Check for empty-width specials.
480 if (p == context.begin())
481 flag |= kEmptyBeginText | kEmptyBeginLine;
482 else if (p <= context.end() && p[-1] == '\n')
483 flag |= kEmptyBeginLine;
486 if (p == context.end())
487 flag |= kEmptyEndText | kEmptyEndLine;
488 else if (p < context.end() && p[0] == '\n')
489 flag |= kEmptyEndLine;
493 if (p < context.end())
494 isword = Prog::IsWordChar(p[0] & 0xFF);
496 if (isword != wasword)
497 flag |= kEmptyWordBoundary;
499 flag |= kEmptyNonWordBoundary;
502 fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
503 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
504 Thread* t = i->second;
507 fprintf(stderr, " %d%s", t->id,
508 FormatCapture((const char**)t->capture).c_str());
510 fprintf(stderr, "\n");
513 // Process previous character (waited until now to avoid
514 // repeating the flag computation above).
515 // This is a no-op the first time around the loop, because
517 int id = Step(runq, nextq, c, flag, p-1);
518 DCHECK_EQ(runq->size(), 0);
522 // We're done: full match ahead.
525 Prog::Inst* ip = prog_->inst(id);
526 switch (ip->opcode()) {
528 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
532 match_[ip->cap()] = p;
545 case kInstEmptyWidth:
546 if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
547 LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
561 // Start a new thread if there have not been any matches.
562 // (No point in starting a new thread if there have been
563 // matches, since it would be to the right of the match
564 // we already found.)
565 if (!matched_ && (!anchored || p == text.begin())) {
566 // If there's a required first byte for an unanchored search
567 // and we're not in the middle of any possible matches,
568 // use memchr to search for the byte quickly.
569 if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
570 p < text.end() && (p[0] & 0xFF) != first_byte_) {
571 p = reinterpret_cast<const char*>(memchr(p, first_byte_,
577 isword = Prog::IsWordChar(p[0] & 0xFF);
579 flag = Prog::EmptyFlags(context, p);
582 // Steal match storage (cleared but unused as of yet)
583 // temporarily to hold match boundaries for new thread.
585 AddToThreadq(runq, start_, flag, p, match_);
589 // If all the threads have died, stop early.
590 if (runq->size() == 0) {
592 fprintf(stderr, "dead\n");
602 // Will run step(runq, nextq, c, ...) on next iteration. See above.
605 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
606 FreeThread(i->second);
609 for (int i = 0; i < nsubmatch; i++)
610 submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
612 fprintf(stderr, "match (%d,%d)\n",
613 static_cast<int>(match_[0] - btext_),
614 static_cast<int>(match_[1] - btext_));
617 VLOG(1) << "No matches found";
621 // Computes whether all successful matches have a common first byte,
622 // and if so, returns that byte. If not, returns -1.
623 int NFA::ComputeFirstByte() {
627 int b = -1; // first byte, not yet computed
629 typedef SparseSet Workq;
630 Workq q(prog_->size());
632 for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
634 Prog::Inst* ip = prog_->inst(id);
635 switch (ip->opcode()) {
637 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
641 // The empty string matches: no first byte.
645 // Must match only a single byte
646 if (ip->lo() != ip->hi())
648 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
650 // If we haven't seen any bytes yet, record it;
651 // otherwise must match the one we saw before.
654 else if (b != ip->lo())
660 case kInstEmptyWidth:
662 // Ignore ip->empty() flags for kInstEmptyWidth
663 // in order to be as conservative as possible
664 // (assume all possible empty-width flags are true).
671 // Explore alternatives.
675 q.insert(ip->out1());
686 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
687 Anchor anchor, MatchKind kind,
688 StringPiece* match, int nmatch) {
694 if (kind == kFullMatch) {
701 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
703 if (kind == kFullMatch && match[0].end() != text.end())