src/third_party/re2/re2/nfa.cc

   1 // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Tested by search_test.cc.
   6 //
   7 // Prog::SearchNFA, an NFA search.
   8 // This is an actual NFA like the theorists talk about,
   9 // not the pseudo-NFA found in backtracking regexp implementations.
  10 //
  11 // IMPLEMENTATION
  12 //
  13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
  14 // which is a variant of the one described in Thompson's 1968 CACM paper.
  15 // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
  16 // over the DFA implementation is that it tracks submatch boundaries.
  17 //
  18 // When the choice of submatch boundaries is ambiguous, this particular
  19 // implementation makes the same choices that traditional backtracking
  20 // implementations (in particular, Perl and PCRE) do.
  21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
  22 // time in the length of the input.
  23 //
  24 // Like Thompson's original machine and like the DFA implementation, this
  25 // implementation notices a match only once it is one byte past it.
  26
  27 #include "re2/prog.h"
  28 #include "re2/regexp.h"
  29 #include "util/sparse_array.h"
  30 #include "util/sparse_set.h"
  31
  32 namespace re2 {
  33
  34 class NFA {
  35  public:
  36   NFA(Prog* prog);
  37   ~NFA();
  38
  39   // Searches for a matching string.
  40   //   * If anchored is true, only considers matches starting at offset.
  41   //     Otherwise finds lefmost match at or after offset.
  42   //   * If longest is true, returns the longest match starting
  43   //     at the chosen start point.  Otherwise returns the so-called
  44   //     left-biased match, the one traditional backtracking engines
  45   //     (like Perl and PCRE) find.
  46   // Records submatch boundaries in submatch[1..nsubmatch-1].
  47   // Submatch[0] is the entire match.  When there is a choice in
  48   // which text matches each subexpression, the submatch boundaries
  49   // are chosen to match what a backtracking implementation would choose.
  50   bool Search(const StringPiece& text, const StringPiece& context,
  51               bool anchored, bool longest,
  52               StringPiece* submatch, int nsubmatch);
  53
  54   static const int Debug = 0;
  55
  56  private:
  57   struct Thread {
  58     union {
  59       int id;
  60       Thread* next;  // when on free list
  61     };
  62     const char** capture;
  63   };
  64
  65   // State for explicit stack in AddToThreadq.
  66   struct AddState {
  67     int id;           // Inst to process
  68     int j;
  69     const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
  70
  71     AddState()
  72       : id(0), j(-1), cap_j(NULL) {}
  73     explicit AddState(int id)
  74       : id(id), j(-1), cap_j(NULL) {}
  75     AddState(int id, const char* cap_j, int j)
  76       : id(id), j(j), cap_j(cap_j) {}
  77   };
  78
  79   // Threadq is a list of threads.  The list is sorted by the order
  80   // in which Perl would explore that particular state -- the earlier
  81   // choices appear earlier in the list.
  82   typedef SparseArray<Thread*> Threadq;
  83
  84   inline Thread* AllocThread();
  85   inline void FreeThread(Thread*);
  86
  87   // Add id (or its children, following unlabeled arrows)
  88   // to the workqueue q with associated capture info.
  89   void AddToThreadq(Threadq* q, int id, int flag,
  90                     const char* p, const char** capture);
  91
  92   // Run runq on byte c, appending new states to nextq.
  93   // Updates matched_ and match_ as new, better matches are found.
  94   // p is position of the next byte (the one after c)
  95   // in the input string, used when processing capturing parens.
  96   // flag is the bitwise or of Bol, Eol, etc., specifying whether
  97   // ^, $ and \b match the current input point (after c).
  98   inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
  99
 100   // Returns text version of capture information, for debugging.
 101   string FormatCapture(const char** capture);
 102
 103   inline void CopyCapture(const char** dst, const char** src);
 104
 105   // Computes whether all matches must begin with the same first
 106   // byte, and if so, returns that byte.  If not, returns -1.
 107   int ComputeFirstByte();
 108
 109   Prog* prog_;          // underlying program
 110   int start_;           // start instruction in program
 111   int ncapture_;        // number of submatches to track
 112   bool longest_;        // whether searching for longest match
 113   bool endmatch_;       // whether match must end at text.end()
 114   const char* btext_;   // beginning of text being matched (for FormatSubmatch)
 115   const char* etext_;   // end of text being matched (for endmatch_)
 116   Threadq q0_, q1_;     // pre-allocated for Search.
 117   const char** match_;  // best match so far
 118   bool matched_;        // any match so far?
 119   AddState* astack_;    // pre-allocated for AddToThreadq
 120   int nastack_;
 121   int first_byte_;      // required first byte for match, or -1 if none
 122
 123   Thread* free_threads_;  // free list
 124
 125   DISALLOW_EVIL_CONSTRUCTORS(NFA);
 126 };
 127
 128 NFA::NFA(Prog* prog) {
 129   prog_ = prog;
 130   start_ = prog->start();
 131   ncapture_ = 0;
 132   longest_ = false;
 133   endmatch_ = false;
 134   btext_ = NULL;
 135   etext_ = NULL;
 136   q0_.resize(prog_->size());
 137   q1_.resize(prog_->size());
 138   nastack_ = 2*prog_->size();
 139   astack_ = new AddState[nastack_];
 140   match_ = NULL;
 141   matched_ = false;
 142   free_threads_ = NULL;
 143   first_byte_ = ComputeFirstByte();
 144 }
 145
 146 NFA::~NFA() {
 147   delete[] match_;
 148   delete[] astack_;
 149   Thread* next;
 150   for (Thread* t = free_threads_; t; t = next) {
 151     next = t->next;
 152     delete[] t->capture;
 153     delete t;
 154   }
 155 }
 156
 157 void NFA::FreeThread(Thread *t) {
 158   if (t == NULL)
 159     return;
 160   t->next = free_threads_;
 161   free_threads_ = t;
 162 }
 163
 164 NFA::Thread* NFA::AllocThread() {
 165   Thread* t = free_threads_;
 166   if (t == NULL) {
 167     t = new Thread;
 168     t->capture = new const char*[ncapture_];
 169     return t;
 170   }
 171   free_threads_ = t->next;
 172   return t;
 173 }
 174
 175 void NFA::CopyCapture(const char** dst, const char** src) {
 176   for (int i = 0; i < ncapture_; i+=2) {
 177     dst[i] = src[i];
 178     dst[i+1] = src[i+1];
 179   }
 180 }
 181
 182 // Follows all empty arrows from id0 and enqueues all the states reached.
 183 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
 184 // The pointer p is the current input position, and m is the
 185 // current set of match boundaries.
 186 void NFA::AddToThreadq(Threadq* q, int id0, int flag,
 187                        const char* p, const char** capture) {
 188   if (id0 == 0)
 189     return;
 190
 191   // Astack_ is pre-allocated to avoid resize operations.
 192   // It has room for 2*prog_->size() entries, which is enough:
 193   // Each inst in prog can be processed at most once,
 194   // pushing at most two entries on stk.
 195
 196   int nstk = 0;
 197   AddState* stk = astack_;
 198   stk[nstk++] = AddState(id0);
 199
 200   while (nstk > 0) {
 201     DCHECK_LE(nstk, nastack_);
 202     const AddState& a = stk[--nstk];
 203     if (a.j >= 0)
 204       capture[a.j] = a.cap_j;
 205
 206     int id = a.id;
 207     if (id == 0)
 208       continue;
 209     if (q->has_index(id)) {
 210       if (Debug)
 211         fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
 212       continue;
 213     }
 214
 215     // Create entry in q no matter what.  We might fill it in below,
 216     // or we might not.  Even if not, it is necessary to have it,
 217     // so that we don't revisit id0 during the recursion.
 218     q->set_new(id, NULL);
 219
 220     Thread** tp = &q->find(id)->second;
 221     int j;
 222     Thread* t;
 223     Prog::Inst* ip = prog_->inst(id);
 224     switch (ip->opcode()) {
 225     default:
 226       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
 227       break;
 228
 229     case kInstFail:
 230       break;
 231
 232     case kInstAltMatch:
 233       // Save state; will pick up at next byte.
 234       t = AllocThread();
 235       t->id = id;
 236       CopyCapture(t->capture, capture);
 237       *tp = t;
 238       // fall through
 239
 240     case kInstAlt:
 241       // Explore alternatives.
 242       stk[nstk++] = AddState(ip->out1());
 243       stk[nstk++] = AddState(ip->out());
 244       break;
 245
 246     case kInstNop:
 247       // Continue on.
 248       stk[nstk++] = AddState(ip->out());
 249       break;
 250
 251     case kInstCapture:
 252       if ((j=ip->cap()) < ncapture_) {
 253         // Push a dummy whose only job is to restore capture[j]
 254         // once we finish exploring this possibility.
 255         stk[nstk++] = AddState(0, capture[j], j);
 256
 257         // Record capture.
 258         capture[j] = p;
 259       }
 260       stk[nstk++] = AddState(ip->out());
 261       break;
 262
 263     case kInstMatch:
 264     case kInstByteRange:
 265       // Save state; will pick up at next byte.
 266       t = AllocThread();
 267       t->id = id;
 268       CopyCapture(t->capture, capture);
 269       *tp = t;
 270       if (Debug)
 271         fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
 272       break;
 273
 274     case kInstEmptyWidth:
 275       // Continue on if we have all the right flag bits.
 276       if (ip->empty() & ~flag)
 277         break;
 278       stk[nstk++] = AddState(ip->out());
 279       break;
 280     }
 281   }
 282 }
 283
 284 // Run runq on byte c, appending new states to nextq.
 285 // Updates match as new, better matches are found.
 286 // p is position of the byte c in the input string,
 287 // used when processing capturing parens.
 288 // flag is the bitwise or of Bol, Eol, etc., specifying whether
 289 // ^, $ and \b match the current input point (after c).
 290 // Frees all the threads on runq.
 291 // If there is a shortcut to the end, returns that shortcut.
 292 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
 293   nextq->clear();
 294
 295   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
 296     Thread* t = i->second;
 297     if (t == NULL)
 298       continue;
 299
 300     if (longest_) {
 301       // Can skip any threads started after our current best match.
 302       if (matched_ && match_[0] < t->capture[0]) {
 303         FreeThread(t);
 304         continue;
 305       }
 306     }
 307
 308     int id = t->id;
 309     Prog::Inst* ip = prog_->inst(id);
 310
 311     switch (ip->opcode()) {
 312       default:
 313         // Should only see the values handled below.
 314         LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
 315         break;
 316
 317       case kInstByteRange:
 318         if (ip->Matches(c))
 319           AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
 320         break;
 321
 322       case kInstAltMatch:
 323         if (i != runq->begin())
 324           break;
 325         // The match is ours if we want it.
 326         if (ip->greedy(prog_) || longest_) {
 327           CopyCapture((const char**)match_, t->capture);
 328           FreeThread(t);
 329           for (++i; i != runq->end(); ++i)
 330             FreeThread(i->second);
 331           runq->clear();
 332           matched_ = true;
 333           if (ip->greedy(prog_))
 334             return ip->out1();
 335           return ip->out();
 336         }
 337         break;
 338
 339       case kInstMatch:
 340         if (endmatch_ && p != etext_)
 341           break;
 342
 343         const char* old = t->capture[1];  // previous end pointer
 344         t->capture[1] = p;
 345         if (longest_) {
 346           // Leftmost-longest mode: save this match only if
 347           // it is either farther to the left or at the same
 348           // point but longer than an existing match.
 349           if (!matched_ || t->capture[0] < match_[0] ||
 350               (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
 351             CopyCapture((const char**)match_, t->capture);
 352         } else {
 353           // Leftmost-biased mode: this match is by definition
 354           // better than what we've already found (see next line).
 355           CopyCapture((const char**)match_, t->capture);
 356
 357           // Cut off the threads that can only find matches
 358           // worse than the one we just found: don't run the
 359           // rest of the current Threadq.
 360           t->capture[0] = old;
 361           FreeThread(t);
 362           for (++i; i != runq->end(); ++i)
 363             FreeThread(i->second);
 364           runq->clear();
 365           matched_ = true;
 366           return 0;
 367         }
 368         t->capture[0] = old;
 369         matched_ = true;
 370         break;
 371     }
 372     FreeThread(t);
 373   }
 374   runq->clear();
 375   return 0;
 376 }
 377
 378 string NFA::FormatCapture(const char** capture) {
 379   string s;
 380
 381   for (int i = 0; i < ncapture_; i+=2) {
 382     if (capture[i] == NULL)
 383       StringAppendF(&s, "(?,?)");
 384     else if (capture[i+1] == NULL)
 385       StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
 386     else
 387       StringAppendF(&s, "(%d,%d)",
 388                     (int)(capture[i] - btext_),
 389                     (int)(capture[i+1] - btext_));
 390   }
 391   return s;
 392 }
 393
 394 // Returns whether haystack contains needle's memory.
 395 static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
 396   return haystack.begin() <= needle.begin() &&
 397          haystack.end() >= needle.end();
 398 }
 399
 400 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
 401             bool anchored, bool longest,
 402             StringPiece* submatch, int nsubmatch) {
 403   if (start_ == 0)
 404     return false;
 405
 406   StringPiece context = const_context;
 407   if (context.begin() == NULL)
 408     context = text;
 409
 410   if (!StringPieceContains(context, text)) {
 411     LOG(FATAL) << "Bad args: context does not contain text "
 412                 << reinterpret_cast<const void*>(context.begin())
 413                 << "+" << context.size() << " "
 414                 << reinterpret_cast<const void*>(text.begin())
 415                 << "+" << text.size();
 416     return false;
 417   }
 418
 419   if (prog_->anchor_start() && context.begin() != text.begin())
 420     return false;
 421   if (prog_->anchor_end() && context.end() != text.end())
 422     return false;
 423   anchored |= prog_->anchor_start();
 424   if (prog_->anchor_end()) {
 425     longest = true;
 426     endmatch_ = true;
 427     etext_ = text.end();
 428   }
 429
 430   if (nsubmatch < 0) {
 431     LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
 432     return false;
 433   }
 434
 435   // Save search parameters.
 436   ncapture_ = 2*nsubmatch;
 437   longest_ = longest;
 438
 439   if (nsubmatch == 0) {
 440     // We need to maintain match[0], both to distinguish the
 441     // longest match (if longest is true) and also to tell
 442     // whether we've seen any matches at all.
 443     ncapture_ = 2;
 444   }
 445
 446   match_ = new const char*[ncapture_];
 447   matched_ = false;
 448   memset(match_, 0, ncapture_*sizeof match_[0]);
 449
 450   // For debugging prints.
 451   btext_ = context.begin();
 452
 453   if (Debug) {
 454     fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
 455             text.as_string().c_str(), context.as_string().c_str(), anchored,
 456             longest);
 457   }
 458
 459   // Set up search.
 460   Threadq* runq = &q0_;
 461   Threadq* nextq = &q1_;
 462   runq->clear();
 463   nextq->clear();
 464   memset(&match_[0], 0, ncapture_*sizeof match_[0]);
 465   const char* bp = context.begin();
 466   int c = -1;
 467   int wasword = 0;
 468
 469   if (text.begin() > context.begin()) {
 470     c = text.begin()[-1] & 0xFF;
 471     wasword = Prog::IsWordChar(c);
 472   }
 473
 474   // Loop over the text, stepping the machine.
 475   for (const char* p = text.begin();; p++) {
 476     // Check for empty-width specials.
 477     int flag = 0;
 478
 479     // ^ and \A
 480     if (p == context.begin())
 481       flag |= kEmptyBeginText | kEmptyBeginLine;
 482     else if (p <= context.end() && p[-1] == '\n')
 483       flag |= kEmptyBeginLine;
 484
 485     // $ and \z
 486     if (p == context.end())
 487       flag |= kEmptyEndText | kEmptyEndLine;
 488     else if (p < context.end() && p[0] == '\n')
 489       flag |= kEmptyEndLine;
 490
 491     // \b and \B
 492     int isword = 0;
 493     if (p < context.end())
 494       isword = Prog::IsWordChar(p[0] & 0xFF);
 495
 496     if (isword != wasword)
 497       flag |= kEmptyWordBoundary;
 498     else
 499       flag |= kEmptyNonWordBoundary;
 500
 501     if (Debug) {
 502       fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
 503       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
 504         Thread* t = i->second;
 505         if (t == NULL)
 506           continue;
 507         fprintf(stderr, " %d%s", t->id,
 508                 FormatCapture((const char**)t->capture).c_str());
 509       }
 510       fprintf(stderr, "\n");
 511     }
 512
 513     // Process previous character (waited until now to avoid
 514     // repeating the flag computation above).
 515     // This is a no-op the first time around the loop, because
 516     // runq is empty.
 517     int id = Step(runq, nextq, c, flag, p-1);
 518     DCHECK_EQ(runq->size(), 0);
 519     swap(nextq, runq);
 520     nextq->clear();
 521     if (id != 0) {
 522       // We're done: full match ahead.
 523       p = text.end();
 524       for (;;) {
 525         Prog::Inst* ip = prog_->inst(id);
 526         switch (ip->opcode()) {
 527           default:
 528             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
 529             break;
 530
 531           case kInstCapture:
 532             match_[ip->cap()] = p;
 533             id = ip->out();
 534             continue;
 535
 536           case kInstNop:
 537             id = ip->out();
 538             continue;
 539
 540           case kInstMatch:
 541             match_[1] = p;
 542             matched_ = true;
 543             break;
 544
 545           case kInstEmptyWidth:
 546             if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
 547               LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
 548               break;
 549             }
 550             id = ip->out();
 551             continue;
 552         }
 553         break;
 554       }
 555       break;
 556     }
 557
 558     if (p > text.end())
 559       break;
 560
 561     // Start a new thread if there have not been any matches.
 562     // (No point in starting a new thread if there have been
 563     // matches, since it would be to the right of the match
 564     // we already found.)
 565     if (!matched_ && (!anchored || p == text.begin())) {
 566       // If there's a required first byte for an unanchored search
 567       // and we're not in the middle of any possible matches,
 568       // use memchr to search for the byte quickly.
 569       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
 570           p < text.end() && (p[0] & 0xFF) != first_byte_) {
 571         p = reinterpret_cast<const char*>(memchr(p, first_byte_,
 572                                                  text.end() - p));
 573         if (p == NULL) {
 574           p = text.end();
 575           isword = 0;
 576         } else {
 577           isword = Prog::IsWordChar(p[0] & 0xFF);
 578         }
 579         flag = Prog::EmptyFlags(context, p);
 580       }
 581
 582       // Steal match storage (cleared but unused as of yet)
 583       // temporarily to hold match boundaries for new thread.
 584       match_[0] = p;
 585       AddToThreadq(runq, start_, flag, p, match_);
 586       match_[0] = NULL;
 587     }
 588
 589     // If all the threads have died, stop early.
 590     if (runq->size() == 0) {
 591       if (Debug)
 592         fprintf(stderr, "dead\n");
 593       break;
 594     }
 595
 596     if (p == text.end())
 597       c = 0;
 598     else
 599       c = *p & 0xFF;
 600     wasword = isword;
 601
 602     // Will run step(runq, nextq, c, ...) on next iteration.  See above.
 603   }
 604
 605   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
 606     FreeThread(i->second);
 607
 608   if (matched_) {
 609     for (int i = 0; i < nsubmatch; i++)
 610       submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
 611     if (Debug)
 612       fprintf(stderr, "match (%d,%d)\n",
 613               static_cast<int>(match_[0] - btext_),
 614               static_cast<int>(match_[1] - btext_));
 615     return true;
 616   }
 617   VLOG(1) << "No matches found";
 618   return false;
 619 }
 620
 621 // Computes whether all successful matches have a common first byte,
 622 // and if so, returns that byte.  If not, returns -1.
 623 int NFA::ComputeFirstByte() {
 624   if (start_ == 0)
 625     return -1;
 626
 627   int b = -1;  // first byte, not yet computed
 628
 629   typedef SparseSet Workq;
 630   Workq q(prog_->size());
 631   q.insert(start_);
 632   for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
 633     int id = *it;
 634     Prog::Inst* ip = prog_->inst(id);
 635     switch (ip->opcode()) {
 636       default:
 637         LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
 638         break;
 639
 640       case kInstMatch:
 641         // The empty string matches: no first byte.
 642         return -1;
 643
 644       case kInstByteRange:
 645         // Must match only a single byte
 646         if (ip->lo() != ip->hi())
 647           return -1;
 648         if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
 649           return -1;
 650         // If we haven't seen any bytes yet, record it;
 651         // otherwise must match the one we saw before.
 652         if (b == -1)
 653           b = ip->lo();
 654         else if (b != ip->lo())
 655           return -1;
 656         break;
 657
 658       case kInstNop:
 659       case kInstCapture:
 660       case kInstEmptyWidth:
 661         // Continue on.
 662         // Ignore ip->empty() flags for kInstEmptyWidth
 663         // in order to be as conservative as possible
 664         // (assume all possible empty-width flags are true).
 665         if (ip->out())
 666           q.insert(ip->out());
 667         break;
 668
 669       case kInstAlt:
 670       case kInstAltMatch:
 671         // Explore alternatives.
 672         if (ip->out())
 673           q.insert(ip->out());
 674         if (ip->out1())
 675           q.insert(ip->out1());
 676         break;
 677
 678       case kInstFail:
 679         break;
 680     }
 681   }
 682   return b;
 683 }
 684
 685 bool
 686 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
 687                 Anchor anchor, MatchKind kind,
 688                 StringPiece* match, int nmatch) {
 689   if (NFA::Debug)
 690     Dump();
 691
 692   NFA nfa(this);
 693   StringPiece sp;
 694   if (kind == kFullMatch) {
 695     anchor = kAnchored;
 696     if (nmatch == 0) {
 697       match = &sp;
 698       nmatch = 1;
 699     }
 700   }
 701   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
 702     return false;
 703   if (kind == kFullMatch && match[0].end() != text.end())
 704     return false;
 705   return true;
 706 }
 707
 708 }  // namespace re2
 709