js/src/yarr/yarr/RegexParser.h

   1 /*
   2  * Copyright (C) 2009 Apple Inc. All rights reserved.
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions
   6  * are met:
   7  * 1. Redistributions of source code must retain the above copyright
   8  *    notice, this list of conditions and the following disclaimer.
   9  * 2. Redistributions in binary form must reproduce the above copyright
  10  *    notice, this list of conditions and the following disclaimer in the
  11  *    documentation and/or other materials provided with the distribution.
  12  *
  13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24  */
  25
  26 #ifndef RegexParser_h
  27 #define RegexParser_h
  28
  29 #include <limits.h>
  30 #include <wtf/ASCIICType.h>
  31 #include "yarr/jswtfbridge.h"
  32 #include "yarr/yarr/RegexCommon.h"
  33
  34 namespace JSC { namespace Yarr {
  35
  36 enum BuiltInCharacterClassID {
  37     DigitClassID,
  38     SpaceClassID,
  39     WordClassID,
  40     NewlineClassID
  41 };
  42
  43 // The Parser class should not be used directly - only via the Yarr::parse() method.
  44 template<class Delegate>
  45 class Parser {
  46 private:
  47     template<class FriendDelegate>
  48     friend int parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
  49
  50     /*
  51      * CharacterClassParserDelegate:
  52      *
  53      * The class CharacterClassParserDelegate is used in the parsing of character
  54      * classes.  This class handles detection of character ranges.  This class
  55      * implements enough of the delegate interface such that it can be passed to
  56      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
  57      * to perform the parsing of escape characters in character sets.
  58      */
  59     class CharacterClassParserDelegate {
  60     public:
  61         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
  62             : m_delegate(delegate)
  63             , m_err(err)
  64             , m_state(empty)
  65         {
  66         }
  67
  68         /*
  69          * begin():
  70          *
  71          * Called at beginning of construction.
  72          */
  73         void begin(bool invert)
  74         {
  75             m_delegate.atomCharacterClassBegin(invert);
  76         }
  77
  78         /*
  79          * atomPatternCharacterUnescaped():
  80          *
  81          * This method is called directly from parseCharacterClass(), to report a new
  82          * pattern character token.  This method differs from atomPatternCharacter(),
  83          * which will be called from parseEscape(), since a hypen provided via this
  84          * method may be indicating a character range, but a hyphen parsed by
  85          * parseEscape() cannot be interpreted as doing so.
  86          */
  87         void atomPatternCharacterUnescaped(UChar ch)
  88         {
  89             switch (m_state) {
  90             case empty:
  91                 m_character = ch;
  92                 m_state = cachedCharacter;
  93                 break;
  94
  95             case cachedCharacter:
  96                 if (ch == '-')
  97                     m_state = cachedCharacterHyphen;
  98                 else {
  99                     m_delegate.atomCharacterClassAtom(m_character);
 100                     m_character = ch;
 101                 }
 102                 break;
 103
 104             case cachedCharacterHyphen:
 105                 if (ch >= m_character)
 106                     m_delegate.atomCharacterClassRange(m_character, ch);
 107                 else
 108                     m_err = CharacterClassOutOfOrder;
 109                 m_state = empty;
 110             }
 111         }
 112
 113         /*
 114          * atomPatternCharacter():
 115          *
 116          * Adds a pattern character, called by parseEscape(), as such will not
 117          * interpret a hyphen as indicating a character range.
 118          */
 119         void atomPatternCharacter(UChar ch)
 120         {
 121             // Flush if a character is already pending to prevent the
 122             // hyphen from begin interpreted as indicating a range.
 123             if((ch == '-') && (m_state == cachedCharacter))
 124                 flush();
 125
 126             atomPatternCharacterUnescaped(ch);
 127         }
 128
 129         /*
 130          * atomBuiltInCharacterClass():
 131          *
 132          * Adds a built-in character class, called by parseEscape().
 133          */
 134         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
 135         {
 136             if (m_state == cachedCharacterHyphen) {
 137                 // If the RHS of a range does not contain exacly one character then a SyntaxError
 138                 // must be thrown. SpiderMonkey only errors out in the [c-\s] case as an extension.
 139                 // (This assumes none of the built in character classes contain a single
 140                 // character.)
 141                 m_err = CharacterClassRangeSingleChar;
 142                 m_state = empty;
 143                 return;
 144             }
 145             flush();
 146             m_delegate.atomCharacterClassBuiltIn(classID, invert);
 147         }
 148
 149         /*
 150          * end():
 151          *
 152          * Called at end of construction.
 153          */
 154         void end()
 155         {
 156             flush();
 157             m_delegate.atomCharacterClassEnd();
 158         }
 159
 160         // parseEscape() should never call these delegate methods when
 161         // invoked with inCharacterClass set.
 162         void assertionWordBoundary(bool) { JS_NOT_REACHED("parseEscape() should never call this"); }
 163         void atomBackReference(unsigned) { JS_NOT_REACHED("parseEscape() should never call this"); }
 164
 165     private:
 166         void flush()
 167         {
 168             if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
 169                 m_delegate.atomCharacterClassAtom(m_character);
 170             if (m_state == cachedCharacterHyphen)
 171                 m_delegate.atomCharacterClassAtom('-');
 172             m_state = empty;
 173         }
 174
 175         Delegate& m_delegate;
 176         ErrorCode& m_err;
 177         enum CharacterClassConstructionState {
 178             empty,
 179             cachedCharacter,
 180             cachedCharacterHyphen
 181         } m_state;
 182         UChar m_character;
 183     };
 184
 185     Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
 186         : m_delegate(delegate)
 187         , m_backReferenceLimit(backReferenceLimit)
 188         , m_err(NoError)
 189         , m_data(const_cast<UString &>(pattern).chars())
 190         , m_size(pattern.length())
 191         , m_index(0)
 192         , m_parenthesesNestingDepth(0)
 193     {
 194     }
 195
 196     /*
 197      * parseEscape():
 198      *
 199      * Helper for parseTokens() AND parseCharacterClass().
 200      * Unlike the other parser methods, this function does not report tokens
 201      * directly to the member delegate (m_delegate), instead tokens are
 202      * emitted to the delegate provided as an argument.  In the case of atom
 203      * escapes, parseTokens() will call parseEscape() passing m_delegate as
 204      * an argument, and as such the escape will be reported to the delegate.
 205      *
 206      * However this method may also be used by parseCharacterClass(), in which
 207      * case a CharacterClassParserDelegate will be passed as the delegate that
 208      * tokens should be added to.  A boolean flag is also provided to indicate
 209      * whether that an escape in a CharacterClass is being parsed (some parsing
 210      * rules change in this context).
 211      *
 212      * The boolean value returned by this method indicates whether the token
 213      * parsed was an atom (outside of a characted class \b and \B will be
 214      * interpreted as assertions).
 215      */
 216     template<bool inCharacterClass, class EscapeDelegate>
 217     bool parseEscape(EscapeDelegate& delegate)
 218     {
 219         JS_ASSERT(!m_err);
 220         JS_ASSERT(peek() == '\\');
 221         consume();
 222
 223         if (atEndOfPattern()) {
 224             m_err = EscapeUnterminated;
 225             return false;
 226         }
 227
 228         switch (peek()) {
 229         // Assertions
 230         case 'b':
 231             consume();
 232             if (inCharacterClass)
 233                 delegate.atomPatternCharacter('\b');
 234             else {
 235                 delegate.assertionWordBoundary(false);
 236                 return false;
 237             }
 238             break;
 239         case 'B':
 240             consume();
 241             if (inCharacterClass)
 242                 delegate.atomPatternCharacter('B');
 243             else {
 244                 delegate.assertionWordBoundary(true);
 245                 return false;
 246             }
 247             break;
 248
 249         // CharacterClassEscape
 250         case 'd':
 251             consume();
 252             delegate.atomBuiltInCharacterClass(DigitClassID, false);
 253             break;
 254         case 's':
 255             consume();
 256             delegate.atomBuiltInCharacterClass(SpaceClassID, false);
 257             break;
 258         case 'w':
 259             consume();
 260             delegate.atomBuiltInCharacterClass(WordClassID, false);
 261             break;
 262         case 'D':
 263             consume();
 264             delegate.atomBuiltInCharacterClass(DigitClassID, true);
 265             break;
 266         case 'S':
 267             consume();
 268             delegate.atomBuiltInCharacterClass(SpaceClassID, true);
 269             break;
 270         case 'W':
 271             consume();
 272             delegate.atomBuiltInCharacterClass(WordClassID, true);
 273             break;
 274
 275         // DecimalEscape
 276         case '1':
 277         case '2':
 278         case '3':
 279         case '4':
 280         case '5':
 281         case '6':
 282         case '7':
 283         case '8':
 284         case '9': {
 285             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
 286             // First, try to parse this as backreference.
 287             if (!inCharacterClass) {
 288                 ParseState state = saveState();
 289
 290                 unsigned backReference;
 291                 if (!consumeNumber(backReference))
 292                     return false;
 293                 if (backReference <= m_backReferenceLimit) {
 294                     delegate.atomBackReference(backReference);
 295                     break;
 296                 }
 297
 298                 restoreState(state);
 299             }
 300
 301             // Not a backreference, and not octal.
 302             if (peek() >= '8') {
 303                 delegate.atomPatternCharacter('\\');
 304                 break;
 305             }
 306
 307             // Fall-through to handle this as an octal escape.
 308         }
 309
 310         // Octal escape
 311         case '0':
 312             delegate.atomPatternCharacter(consumeOctal());
 313             break;
 314
 315         // ControlEscape
 316         case 'f':
 317             consume();
 318             delegate.atomPatternCharacter('\f');
 319             break;
 320         case 'n':
 321             consume();
 322             delegate.atomPatternCharacter('\n');
 323             break;
 324         case 'r':
 325             consume();
 326             delegate.atomPatternCharacter('\r');
 327             break;
 328         case 't':
 329             consume();
 330             delegate.atomPatternCharacter('\t');
 331             break;
 332         case 'v':
 333             consume();
 334             delegate.atomPatternCharacter('\v');
 335             break;
 336
 337         // ControlLetter
 338         case 'c': {
 339             ParseState state = saveState();
 340             consume();
 341             if (!atEndOfPattern()) {
 342                 int control = consume();
 343
 344                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
 345                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
 346                     delegate.atomPatternCharacter(control & 0x1f);
 347                     break;
 348                 }
 349             }
 350             restoreState(state);
 351             delegate.atomPatternCharacter('\\');
 352             break;
 353         }
 354
 355         // HexEscape
 356         case 'x': {
 357             consume();
 358             int x = tryConsumeHex(2);
 359             if (x == -1)
 360                 delegate.atomPatternCharacter('x');
 361             else
 362                 delegate.atomPatternCharacter(x);
 363             break;
 364         }
 365
 366         // UnicodeEscape
 367         case 'u': {
 368             consume();
 369             int u = tryConsumeHex(4);
 370             if (u == -1)
 371                 delegate.atomPatternCharacter('u');
 372             else
 373                 delegate.atomPatternCharacter(u);
 374             break;
 375         }
 376
 377         // IdentityEscape
 378         default:
 379             delegate.atomPatternCharacter(consume());
 380         }
 381
 382         return true;
 383     }
 384
 385     /*
 386      * parseAtomEscape(), parseCharacterClassEscape():
 387      *
 388      * These methods alias to parseEscape().
 389      */
 390     bool parseAtomEscape()
 391     {
 392         return parseEscape<false>(m_delegate);
 393     }
 394     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
 395     {
 396         parseEscape<true>(delegate);
 397     }
 398
 399     /*
 400      * parseCharacterClass():
 401      *
 402      * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
 403      * to an instance of CharacterClassParserDelegate, to describe the character class to the
 404      * delegate.
 405      */
 406     void parseCharacterClass()
 407     {
 408         JS_ASSERT(!m_err);
 409         JS_ASSERT(peek() == '[');
 410         consume();
 411
 412         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
 413
 414         characterClassConstructor.begin(tryConsume('^'));
 415
 416         while (!atEndOfPattern()) {
 417             switch (peek()) {
 418             case ']':
 419                 consume();
 420                 characterClassConstructor.end();
 421                 return;
 422
 423             case '\\':
 424                 parseCharacterClassEscape(characterClassConstructor);
 425                 break;
 426
 427             default:
 428                 characterClassConstructor.atomPatternCharacterUnescaped(consume());
 429             }
 430
 431             if (m_err)
 432                 return;
 433         }
 434
 435         m_err = CharacterClassUnmatched;
 436     }
 437
 438     /*
 439      * parseParenthesesBegin():
 440      *
 441      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
 442      */
 443     void parseParenthesesBegin()
 444     {
 445         JS_ASSERT(!m_err);
 446         JS_ASSERT(peek() == '(');
 447         consume();
 448
 449         if (tryConsume('?')) {
 450             if (atEndOfPattern()) {
 451                 m_err = ParenthesesTypeInvalid;
 452                 return;
 453             }
 454
 455             switch (consume()) {
 456             case ':':
 457                 m_delegate.atomParenthesesSubpatternBegin(false);
 458                 break;
 459
 460             case '=':
 461                 m_delegate.atomParentheticalAssertionBegin();
 462                 break;
 463
 464             case '!':
 465                 m_delegate.atomParentheticalAssertionBegin(true);
 466                 break;
 467
 468             default:
 469                 m_err = ParenthesesTypeInvalid;
 470             }
 471         } else
 472             m_delegate.atomParenthesesSubpatternBegin();
 473
 474         ++m_parenthesesNestingDepth;
 475     }
 476
 477     /*
 478      * parseParenthesesEnd():
 479      *
 480      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
 481      */
 482     void parseParenthesesEnd()
 483     {
 484         JS_ASSERT(!m_err);
 485         JS_ASSERT(peek() == ')');
 486         consume();
 487
 488         if (m_parenthesesNestingDepth > 0)
 489             m_delegate.atomParenthesesEnd();
 490         else
 491             m_err = ParenthesesUnmatched;
 492
 493         --m_parenthesesNestingDepth;
 494     }
 495
 496     /*
 497      * parseQuantifier():
 498      *
 499      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
 500      */
 501     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
 502     {
 503         JS_ASSERT(!m_err);
 504         JS_ASSERT(min <= max);
 505
 506         if (lastTokenWasAnAtom)
 507             m_delegate.quantifyAtom(min, max, !tryConsume('?'));
 508         else
 509             m_err = QuantifierWithoutAtom;
 510     }
 511
 512     /*
 513      * parseTokens():
 514      *
 515      * This method loops over the input pattern reporting tokens to the delegate.
 516      * The method returns when a parse error is detected, or the end of the pattern
 517      * is reached.  One piece of state is tracked around the loop, which is whether
 518      * the last token passed to the delegate was an atom (this is necessary to detect
 519      * a parse error when a quantifier provided without an atom to quantify).
 520      */
 521     void parseTokens()
 522     {
 523         bool lastTokenWasAnAtom = false;
 524
 525         while (!atEndOfPattern()) {
 526             switch (peek()) {
 527             case '|':
 528                 consume();
 529                 m_delegate.disjunction();
 530                 lastTokenWasAnAtom = false;
 531                 break;
 532
 533             case '(':
 534                 parseParenthesesBegin();
 535                 lastTokenWasAnAtom = false;
 536                 break;
 537
 538             case ')':
 539                 parseParenthesesEnd();
 540                 lastTokenWasAnAtom = true;
 541                 break;
 542
 543             case '^':
 544                 consume();
 545                 m_delegate.assertionBOL();
 546                 lastTokenWasAnAtom = false;
 547                 break;
 548
 549             case '$':
 550                 consume();
 551                 m_delegate.assertionEOL();
 552                 lastTokenWasAnAtom = false;
 553                 break;
 554
 555             case '.':
 556                 consume();
 557                 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
 558                 lastTokenWasAnAtom = true;
 559                 break;
 560
 561             case '[':
 562                 parseCharacterClass();
 563                 lastTokenWasAnAtom = true;
 564                 break;
 565
 566             case '\\':
 567                 lastTokenWasAnAtom = parseAtomEscape();
 568                 break;
 569
 570             case '*':
 571                 consume();
 572                 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
 573                 lastTokenWasAnAtom = false;
 574                 break;
 575
 576             case '+':
 577                 consume();
 578                 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
 579                 lastTokenWasAnAtom = false;
 580                 break;
 581
 582             case '?':
 583                 consume();
 584                 parseQuantifier(lastTokenWasAnAtom, 0, 1);
 585                 lastTokenWasAnAtom = false;
 586                 break;
 587
 588             case '{': {
 589                 ParseState state = saveState();
 590
 591                 consume();
 592                 if (peekIsDigit()) {
 593                     unsigned min;
 594                     if (!consumeNumber(min))
 595                         break;
 596                     unsigned max = min;
 597
 598                     if (tryConsume(',')) {
 599                         if (peekIsDigit()) {
 600                             if (!consumeNumber(max))
 601                                 break;
 602                         } else {
 603                             max = UINT_MAX;
 604                         }
 605                     }
 606
 607                     if (tryConsume('}')) {
 608                         if (min <= max)
 609                             parseQuantifier(lastTokenWasAnAtom, min, max);
 610                         else
 611                             m_err = QuantifierOutOfOrder;
 612                         lastTokenWasAnAtom = false;
 613                         break;
 614                     }
 615                 }
 616
 617                 restoreState(state);
 618             } // if we did not find a complete quantifer, fall through to the default case.
 619
 620             default:
 621                 m_delegate.atomPatternCharacter(consume());
 622                 lastTokenWasAnAtom = true;
 623             }
 624
 625             if (m_err)
 626                 return;
 627         }
 628
 629         if (m_parenthesesNestingDepth > 0)
 630             m_err = MissingParentheses;
 631     }
 632
 633     /*
 634      * parse():
 635      *
 636      * This method calls regexBegin(), calls parseTokens() to parse over the input
 637      * patterns, calls regexEnd() or regexError() as appropriate, and converts any
 638      * error code to a const char* for a result.
 639      */
 640     int parse()
 641     {
 642         m_delegate.regexBegin();
 643
 644         if (m_size > MAX_PATTERN_SIZE)
 645             m_err = PatternTooLarge;
 646         else
 647             parseTokens();
 648         JS_ASSERT(atEndOfPattern() || m_err);
 649
 650         if (m_err)
 651             m_delegate.regexError();
 652         else
 653             m_delegate.regexEnd();
 654
 655         return static_cast<int>(m_err);
 656     }
 657
 658
 659     // Misc helper functions:
 660
 661     typedef unsigned ParseState;
 662
 663     ParseState saveState()
 664     {
 665         return m_index;
 666     }
 667
 668     void restoreState(ParseState state)
 669     {
 670         m_index = state;
 671     }
 672
 673     bool atEndOfPattern()
 674     {
 675         JS_ASSERT(m_index <= m_size);
 676         return m_index == m_size;
 677     }
 678
 679     int peek()
 680     {
 681         JS_ASSERT(m_index < m_size);
 682         return m_data[m_index];
 683     }
 684
 685     bool peekIsDigit()
 686     {
 687         return !atEndOfPattern() && WTF::isASCIIDigit(peek());
 688     }
 689
 690     unsigned peekDigit()
 691     {
 692         JS_ASSERT(peekIsDigit());
 693         return peek() - '0';
 694     }
 695
 696     int consume()
 697     {
 698         JS_ASSERT(m_index < m_size);
 699         return m_data[m_index++];
 700     }
 701
 702     unsigned consumeDigit()
 703     {
 704         JS_ASSERT(peekIsDigit());
 705         return consume() - '0';
 706     }
 707
 708     bool consumeNumber(unsigned &accum)
 709     {
 710         accum = consumeDigit();
 711         while (peekIsDigit()) {
 712             unsigned newValue = accum * 10 + peekDigit();
 713             if (newValue < accum) { /* Overflow check. */
 714                 m_err = QuantifierTooLarge;
 715                 return false;
 716             }
 717             accum = newValue;
 718             consume();
 719         }
 720         return true;
 721     }
 722
 723     unsigned consumeOctal()
 724     {
 725         JS_ASSERT(WTF::isASCIIOctalDigit(peek()));
 726
 727         unsigned n = consumeDigit();
 728         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
 729             n = n * 8 + consumeDigit();
 730         return n;
 731     }
 732
 733     bool tryConsume(UChar ch)
 734     {
 735         if (atEndOfPattern() || (m_data[m_index] != ch))
 736             return false;
 737         ++m_index;
 738         return true;
 739     }
 740
 741     int tryConsumeHex(int count)
 742     {
 743         ParseState state = saveState();
 744
 745         int n = 0;
 746         while (count--) {
 747             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
 748                 restoreState(state);
 749                 return -1;
 750             }
 751             n = (n << 4) | WTF::toASCIIHexValue(consume());
 752         }
 753         return n;
 754     }
 755
 756     Delegate& m_delegate;
 757     unsigned m_backReferenceLimit;
 758     ErrorCode m_err;
 759     const UChar* m_data;
 760     unsigned m_size;
 761     unsigned m_index;
 762     unsigned m_parenthesesNestingDepth;
 763
 764     // Derived by empirical testing of compile time in PCRE and WREC.
 765     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
 766 };
 767
 768 /*
 769  * Yarr::parse():
 770  *
 771  * The parse method is passed a pattern to be parsed and a delegate upon which
 772  * callbacks will be made to record the parsed tokens forming the regex.
 773  * Yarr::parse() returns 0 (NoError) on success, or the ErrorCode describing the
 774  * parse error, converted to int, where a parse error occurs.
 775  *
 776  * The Delegate must implement the following interface:
 777  *
 778  *    void assertionBOL();
 779  *    void assertionEOL();
 780  *    void assertionWordBoundary(bool invert);
 781  *
 782  *    void atomPatternCharacter(UChar ch);
 783  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 784  *    void atomCharacterClassBegin(bool invert)
 785  *    void atomCharacterClassAtom(UChar ch)
 786  *    void atomCharacterClassRange(UChar begin, UChar end)
 787  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 788  *    void atomCharacterClassEnd()
 789  *    void atomParenthesesSubpatternBegin(bool capture = true);
 790  *    void atomParentheticalAssertionBegin(bool invert = false);
 791  *    void atomParenthesesEnd();
 792  *    void atomBackReference(unsigned subpatternId);
 793  *
 794  *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 795  *
 796  *    void disjunction();
 797  *
 798  *    void regexBegin();
 799  *    void regexEnd();
 800  *    void regexError();
 801  *
 802  * Before any calls recording tokens are made, regexBegin() will be called on the
 803  * delegate once.  Once parsing is complete either regexEnd() or regexError() will
 804  * be called, as appropriate.
 805  *
 806  * The regular expression is described by a sequence of assertion*() and atom*()
 807  * callbacks to the delegate, describing the terms in the regular expression.
 808  * Following an atom a quantifyAtom() call may occur to indicate that the previous
 809  * atom should be quantified.  In the case of atoms described across multiple
 810  * calls (parentheses and character classes) the call to quantifyAtom() will come
 811  * after the call to the atom*End() method, never after atom*Begin().
 812  *
 813  * Character classes may either be described by a single call to
 814  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 815  * In the latter case, ...Begin() will be called, followed by a sequence of
 816  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 817  *
 818  * Sequences of atoms and assertions are broken into alternatives via calls to
 819  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 820  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 821  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 822  * capturing subpattern, this will be the subpatternId associated with these
 823  * parentheses, and will also by definition be the lowest subpatternId of these
 824  * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
 825  * is passed the subpatternId of the last capturing subexpression nested within
 826  * these parentheses.  In the case of a capturing subpattern with no nested
 827  * capturing subpatterns, the same subpatternId will be passed to the begin and
 828  * end functions.  In the case of non-capturing subpatterns the subpatternId
 829  * passed to the begin method is also the first possible subpatternId that might
 830  * be nested within these parentheses.  If a set of non-capturing parentheses does
 831  * not contain any capturing subpatterns, then the subpatternId passed to begin
 832  * will be greater than the subpatternId passed to end.
 833  */
 834
 835 template<class Delegate>
 836 int parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
 837 {
 838     return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
 839 }
 840
 841 } } // namespace JSC::Yarr
 842
 843 #endif // RegexParser_h