2 * Copyright (C) 2009 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 #include <wtf/ASCIICType.h>
31 #include "yarr/jswtfbridge.h"
32 #include "yarr/yarr/RegexCommon.h"
34 namespace JSC { namespace Yarr {
36 enum BuiltInCharacterClassID {
43 // The Parser class should not be used directly - only via the Yarr::parse() method.
44 template<class Delegate>
47 template<class FriendDelegate>
48 friend int parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
51 * CharacterClassParserDelegate:
53 * The class CharacterClassParserDelegate is used in the parsing of character
54 * classes. This class handles detection of character ranges. This class
55 * implements enough of the delegate interface such that it can be passed to
56 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
57 * to perform the parsing of escape characters in character sets.
59 class CharacterClassParserDelegate {
61 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
62 : m_delegate(delegate)
71 * Called at beginning of construction.
73 void begin(bool invert)
75 m_delegate.atomCharacterClassBegin(invert);
79 * atomPatternCharacterUnescaped():
81 * This method is called directly from parseCharacterClass(), to report a new
82 * pattern character token. This method differs from atomPatternCharacter(),
83 * which will be called from parseEscape(), since a hypen provided via this
84 * method may be indicating a character range, but a hyphen parsed by
85 * parseEscape() cannot be interpreted as doing so.
87 void atomPatternCharacterUnescaped(UChar ch)
92 m_state = cachedCharacter;
97 m_state = cachedCharacterHyphen;
99 m_delegate.atomCharacterClassAtom(m_character);
104 case cachedCharacterHyphen:
105 if (ch >= m_character)
106 m_delegate.atomCharacterClassRange(m_character, ch);
108 m_err = CharacterClassOutOfOrder;
114 * atomPatternCharacter():
116 * Adds a pattern character, called by parseEscape(), as such will not
117 * interpret a hyphen as indicating a character range.
119 void atomPatternCharacter(UChar ch)
121 // Flush if a character is already pending to prevent the
122 // hyphen from begin interpreted as indicating a range.
123 if((ch == '-') && (m_state == cachedCharacter))
126 atomPatternCharacterUnescaped(ch);
130 * atomBuiltInCharacterClass():
132 * Adds a built-in character class, called by parseEscape().
134 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
136 if (m_state == cachedCharacterHyphen) {
137 // If the RHS of a range does not contain exacly one character then a SyntaxError
138 // must be thrown. SpiderMonkey only errors out in the [c-\s] case as an extension.
139 // (This assumes none of the built in character classes contain a single
141 m_err = CharacterClassRangeSingleChar;
146 m_delegate.atomCharacterClassBuiltIn(classID, invert);
152 * Called at end of construction.
157 m_delegate.atomCharacterClassEnd();
160 // parseEscape() should never call these delegate methods when
161 // invoked with inCharacterClass set.
162 void assertionWordBoundary(bool) { JS_NOT_REACHED("parseEscape() should never call this"); }
163 void atomBackReference(unsigned) { JS_NOT_REACHED("parseEscape() should never call this"); }
168 if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen
169 m_delegate.atomCharacterClassAtom(m_character);
170 if (m_state == cachedCharacterHyphen)
171 m_delegate.atomCharacterClassAtom('-');
175 Delegate& m_delegate;
177 enum CharacterClassConstructionState {
180 cachedCharacterHyphen
185 Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
186 : m_delegate(delegate)
187 , m_backReferenceLimit(backReferenceLimit)
189 , m_data(const_cast<UString &>(pattern).chars())
190 , m_size(pattern.length())
192 , m_parenthesesNestingDepth(0)
199 * Helper for parseTokens() AND parseCharacterClass().
200 * Unlike the other parser methods, this function does not report tokens
201 * directly to the member delegate (m_delegate), instead tokens are
202 * emitted to the delegate provided as an argument. In the case of atom
203 * escapes, parseTokens() will call parseEscape() passing m_delegate as
204 * an argument, and as such the escape will be reported to the delegate.
206 * However this method may also be used by parseCharacterClass(), in which
207 * case a CharacterClassParserDelegate will be passed as the delegate that
208 * tokens should be added to. A boolean flag is also provided to indicate
209 * whether that an escape in a CharacterClass is being parsed (some parsing
210 * rules change in this context).
212 * The boolean value returned by this method indicates whether the token
213 * parsed was an atom (outside of a characted class \b and \B will be
214 * interpreted as assertions).
216 template<bool inCharacterClass, class EscapeDelegate>
217 bool parseEscape(EscapeDelegate& delegate)
220 JS_ASSERT(peek() == '\\');
223 if (atEndOfPattern()) {
224 m_err = EscapeUnterminated;
232 if (inCharacterClass)
233 delegate.atomPatternCharacter('\b');
235 delegate.assertionWordBoundary(false);
241 if (inCharacterClass)
242 delegate.atomPatternCharacter('B');
244 delegate.assertionWordBoundary(true);
249 // CharacterClassEscape
252 delegate.atomBuiltInCharacterClass(DigitClassID, false);
256 delegate.atomBuiltInCharacterClass(SpaceClassID, false);
260 delegate.atomBuiltInCharacterClass(WordClassID, false);
264 delegate.atomBuiltInCharacterClass(DigitClassID, true);
268 delegate.atomBuiltInCharacterClass(SpaceClassID, true);
272 delegate.atomBuiltInCharacterClass(WordClassID, true);
285 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
286 // First, try to parse this as backreference.
287 if (!inCharacterClass) {
288 ParseState state = saveState();
290 unsigned backReference;
291 if (!consumeNumber(backReference))
293 if (backReference <= m_backReferenceLimit) {
294 delegate.atomBackReference(backReference);
301 // Not a backreference, and not octal.
303 delegate.atomPatternCharacter('\\');
307 // Fall-through to handle this as an octal escape.
312 delegate.atomPatternCharacter(consumeOctal());
318 delegate.atomPatternCharacter('\f');
322 delegate.atomPatternCharacter('\n');
326 delegate.atomPatternCharacter('\r');
330 delegate.atomPatternCharacter('\t');
334 delegate.atomPatternCharacter('\v');
339 ParseState state = saveState();
341 if (!atEndOfPattern()) {
342 int control = consume();
344 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
345 if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
346 delegate.atomPatternCharacter(control & 0x1f);
351 delegate.atomPatternCharacter('\\');
358 int x = tryConsumeHex(2);
360 delegate.atomPatternCharacter('x');
362 delegate.atomPatternCharacter(x);
369 int u = tryConsumeHex(4);
371 delegate.atomPatternCharacter('u');
373 delegate.atomPatternCharacter(u);
379 delegate.atomPatternCharacter(consume());
386 * parseAtomEscape(), parseCharacterClassEscape():
388 * These methods alias to parseEscape().
390 bool parseAtomEscape()
392 return parseEscape<false>(m_delegate);
394 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
396 parseEscape<true>(delegate);
400 * parseCharacterClass():
402 * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
403 * to an instance of CharacterClassParserDelegate, to describe the character class to the
406 void parseCharacterClass()
409 JS_ASSERT(peek() == '[');
412 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
414 characterClassConstructor.begin(tryConsume('^'));
416 while (!atEndOfPattern()) {
420 characterClassConstructor.end();
424 parseCharacterClassEscape(characterClassConstructor);
428 characterClassConstructor.atomPatternCharacterUnescaped(consume());
435 m_err = CharacterClassUnmatched;
439 * parseParenthesesBegin():
441 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
443 void parseParenthesesBegin()
446 JS_ASSERT(peek() == '(');
449 if (tryConsume('?')) {
450 if (atEndOfPattern()) {
451 m_err = ParenthesesTypeInvalid;
457 m_delegate.atomParenthesesSubpatternBegin(false);
461 m_delegate.atomParentheticalAssertionBegin();
465 m_delegate.atomParentheticalAssertionBegin(true);
469 m_err = ParenthesesTypeInvalid;
472 m_delegate.atomParenthesesSubpatternBegin();
474 ++m_parenthesesNestingDepth;
478 * parseParenthesesEnd():
480 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
482 void parseParenthesesEnd()
485 JS_ASSERT(peek() == ')');
488 if (m_parenthesesNestingDepth > 0)
489 m_delegate.atomParenthesesEnd();
491 m_err = ParenthesesUnmatched;
493 --m_parenthesesNestingDepth;
499 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
501 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
504 JS_ASSERT(min <= max);
506 if (lastTokenWasAnAtom)
507 m_delegate.quantifyAtom(min, max, !tryConsume('?'));
509 m_err = QuantifierWithoutAtom;
515 * This method loops over the input pattern reporting tokens to the delegate.
516 * The method returns when a parse error is detected, or the end of the pattern
517 * is reached. One piece of state is tracked around the loop, which is whether
518 * the last token passed to the delegate was an atom (this is necessary to detect
519 * a parse error when a quantifier provided without an atom to quantify).
523 bool lastTokenWasAnAtom = false;
525 while (!atEndOfPattern()) {
529 m_delegate.disjunction();
530 lastTokenWasAnAtom = false;
534 parseParenthesesBegin();
535 lastTokenWasAnAtom = false;
539 parseParenthesesEnd();
540 lastTokenWasAnAtom = true;
545 m_delegate.assertionBOL();
546 lastTokenWasAnAtom = false;
551 m_delegate.assertionEOL();
552 lastTokenWasAnAtom = false;
557 m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
558 lastTokenWasAnAtom = true;
562 parseCharacterClass();
563 lastTokenWasAnAtom = true;
567 lastTokenWasAnAtom = parseAtomEscape();
572 parseQuantifier(lastTokenWasAnAtom, 0, UINT_MAX);
573 lastTokenWasAnAtom = false;
578 parseQuantifier(lastTokenWasAnAtom, 1, UINT_MAX);
579 lastTokenWasAnAtom = false;
584 parseQuantifier(lastTokenWasAnAtom, 0, 1);
585 lastTokenWasAnAtom = false;
589 ParseState state = saveState();
594 if (!consumeNumber(min))
598 if (tryConsume(',')) {
600 if (!consumeNumber(max))
607 if (tryConsume('}')) {
609 parseQuantifier(lastTokenWasAnAtom, min, max);
611 m_err = QuantifierOutOfOrder;
612 lastTokenWasAnAtom = false;
618 } // if we did not find a complete quantifer, fall through to the default case.
621 m_delegate.atomPatternCharacter(consume());
622 lastTokenWasAnAtom = true;
629 if (m_parenthesesNestingDepth > 0)
630 m_err = MissingParentheses;
636 * This method calls regexBegin(), calls parseTokens() to parse over the input
637 * patterns, calls regexEnd() or regexError() as appropriate, and converts any
638 * error code to a const char* for a result.
642 m_delegate.regexBegin();
644 if (m_size > MAX_PATTERN_SIZE)
645 m_err = PatternTooLarge;
648 JS_ASSERT(atEndOfPattern() || m_err);
651 m_delegate.regexError();
653 m_delegate.regexEnd();
655 return static_cast<int>(m_err);
659 // Misc helper functions:
661 typedef unsigned ParseState;
663 ParseState saveState()
668 void restoreState(ParseState state)
673 bool atEndOfPattern()
675 JS_ASSERT(m_index <= m_size);
676 return m_index == m_size;
681 JS_ASSERT(m_index < m_size);
682 return m_data[m_index];
687 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
692 JS_ASSERT(peekIsDigit());
698 JS_ASSERT(m_index < m_size);
699 return m_data[m_index++];
702 unsigned consumeDigit()
704 JS_ASSERT(peekIsDigit());
705 return consume() - '0';
708 bool consumeNumber(unsigned &accum)
710 accum = consumeDigit();
711 while (peekIsDigit()) {
712 unsigned newValue = accum * 10 + peekDigit();
713 if (newValue < accum) { /* Overflow check. */
714 m_err = QuantifierTooLarge;
723 unsigned consumeOctal()
725 JS_ASSERT(WTF::isASCIIOctalDigit(peek()));
727 unsigned n = consumeDigit();
728 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
729 n = n * 8 + consumeDigit();
733 bool tryConsume(UChar ch)
735 if (atEndOfPattern() || (m_data[m_index] != ch))
741 int tryConsumeHex(int count)
743 ParseState state = saveState();
747 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
751 n = (n << 4) | WTF::toASCIIHexValue(consume());
756 Delegate& m_delegate;
757 unsigned m_backReferenceLimit;
762 unsigned m_parenthesesNestingDepth;
764 // Derived by empirical testing of compile time in PCRE and WREC.
765 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
771 * The parse method is passed a pattern to be parsed and a delegate upon which
772 * callbacks will be made to record the parsed tokens forming the regex.
773 * Yarr::parse() returns null on success, or a const C string providing an error
774 * message where a parse error occurs.
776 * The Delegate must implement the following interface:
778 * void assertionBOL();
779 * void assertionEOL();
780 * void assertionWordBoundary(bool invert);
782 * void atomPatternCharacter(UChar ch);
783 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
784 * void atomCharacterClassBegin(bool invert)
785 * void atomCharacterClassAtom(UChar ch)
786 * void atomCharacterClassRange(UChar begin, UChar end)
787 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
788 * void atomCharacterClassEnd()
789 * void atomParenthesesSubpatternBegin(bool capture = true);
790 * void atomParentheticalAssertionBegin(bool invert = false);
791 * void atomParenthesesEnd();
792 * void atomBackReference(unsigned subpatternId);
794 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
796 * void disjunction();
802 * Before any call recording tokens are made, regexBegin() will be called on the
803 * delegate once. Once parsing is complete either regexEnd() or regexError() will
804 * be called, as appropriate.
806 * The regular expression is described by a sequence of assertion*() and atom*()
807 * callbacks to the delegate, describing the terms in the regular expression.
808 * Following an atom a quantifyAtom() call may occur to indicate that the previous
809 * atom should be quantified. In the case of atoms described across multiple
810 * calls (parentheses and character classes) the call to quantifyAtom() will come
811 * after the call to the atom*End() method, never after atom*Begin().
813 * Character classes may either be described by a single call to
814 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
815 * In the latter case, ...Begin() will be called, followed by a sequence of
816 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
818 * Sequences of atoms and assertions are broken into alternatives via calls to
819 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
820 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
821 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
822 * capturing subpattern, this will be the subpatternId associated with these
823 * parentheses, and will also by definition be the lowest subpatternId of these
824 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
825 * is passed the subpatternId of the last capturing subexpression nested within
826 * these paretheses. In the case of a capturing subpattern with no nested
827 * capturing subpatterns, the same subpatternId will be passed to the begin and
828 * end functions. In the case of non-capturing subpatterns the subpatternId
829 * passed to the begin method is also the first possible subpatternId that might
830 * be nested within these paretheses. If a set of non-capturing parentheses does
831 * not contain any capturing subpatterns, then the subpatternId passed to begin
832 * will be greater than the subpatternId passed to end.
835 template<class Delegate>
836 int parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX)
838 return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
841 } } // namespace JSC::Yarr
843 #endif // RegexParser_h