2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "YarrInterpreter.h"
32 #include <wtf/BumpPointerAllocator.h>
33 #include <wtf/text/CString.h>
41 namespace JSC { namespace Yarr {
45 struct ParenthesesDisjunctionContext;
47 struct BackTrackInfoPatternCharacter {
48 uintptr_t matchAmount;
50 struct BackTrackInfoCharacterClass {
51 uintptr_t matchAmount;
53 struct BackTrackInfoBackReference {
54 uintptr_t begin; // Not really needed for greedy quantifiers.
55 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
57 struct BackTrackInfoAlternative {
60 struct BackTrackInfoParentheticalAssertion {
63 struct BackTrackInfoParenthesesOnce {
66 struct BackTrackInfoParenthesesTerminal {
69 struct BackTrackInfoParentheses {
70 uintptr_t matchAmount;
71 ParenthesesDisjunctionContext* lastContext;
74 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
76 context->next = backTrack->lastContext;
77 backTrack->lastContext = context;
78 ++backTrack->matchAmount;
81 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
83 ASSERT(backTrack->matchAmount);
84 ASSERT(backTrack->lastContext);
85 backTrack->lastContext = backTrack->lastContext->next;
86 --backTrack->matchAmount;
89 struct DisjunctionContext
96 void* operator new(size_t, void* where)
107 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
109 size_t size = sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
110 allocatorPool = allocatorPool->ensureCapacity(size);
113 return new(allocatorPool->alloc(size)) DisjunctionContext();
116 void freeDisjunctionContext(DisjunctionContext* context)
118 allocatorPool = allocatorPool->dealloc(context);
121 struct ParenthesesDisjunctionContext
123 ParenthesesDisjunctionContext(int* output, ByteTerm& term)
126 unsigned firstSubpatternId = term.atom.subpatternId;
127 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
129 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
130 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
131 output[(firstSubpatternId << 1) + i] = -1;
134 new(getDisjunctionContext(term)) DisjunctionContext();
137 void* operator new(size_t, void* where)
142 void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
144 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
145 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
148 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
150 return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1]));
153 ParenthesesDisjunctionContext* next;
154 int subpatternBackup[1];
157 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
159 size_t size = sizeof(ParenthesesDisjunctionContext) - sizeof(int) + (term.atom.parenthesesDisjunction->m_numSubpatterns << 1) * sizeof(int) + sizeof(DisjunctionContext) - sizeof(uintptr_t) + disjunction->m_frameSize * sizeof(uintptr_t);
160 allocatorPool = allocatorPool->ensureCapacity(size);
163 return new(allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
166 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
168 allocatorPool = allocatorPool->dealloc(context);
171 // This class is a placeholder for future character iterator, current
172 // proposed name StringConstCharacterIterator.
175 CharAccess(const UString& s)
179 m_ptr.ptr8 = s.characters8();
182 m_ptr.ptr16 = s.characters16();
186 CharAccess(const LChar* ptr)
192 CharAccess(const UChar* ptr)
202 inline UChar operator[](unsigned index)
204 if (m_charSize == Char8)
205 return m_ptr.ptr8[index];
206 return m_ptr.ptr16[index];
214 YarrCharSize m_charSize;
219 InputStream(const UString& input, unsigned start, unsigned length)
231 void rewind(unsigned amount)
233 ASSERT(pos >= amount);
239 ASSERT(pos < length);
247 ASSERT(pos + 1 < length);
248 return input[pos] | input[pos + 1] << 16;
251 int readChecked(int position)
253 ASSERT(position < 0);
254 ASSERT(static_cast<unsigned>(-position) <= pos);
255 unsigned p = pos + position;
260 int reread(unsigned from)
262 ASSERT(from < length);
268 ASSERT(!(pos > length));
270 return input[pos - 1];
279 void setPos(unsigned p)
291 return pos == length;
299 bool checkInput(int count)
301 if ((pos + count) <= length) {
308 void uncheckInput(int count)
313 bool atStart(int position)
315 return (pos + position) == 0;
318 bool atEnd(int position)
320 return (pos + position) == length;
323 bool isNotAvailableInput(int position)
325 return (pos + position) > length;
334 bool testCharacterClass(CharacterClass* characterClass, int ch)
337 for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
338 if (ch == characterClass->m_matchesUnicode[i])
340 for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i)
341 if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end))
344 for (unsigned i = 0; i < characterClass->m_matches.size(); ++i)
345 if (ch == characterClass->m_matches[i])
347 for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i)
348 if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end))
355 bool checkCharacter(int testChar, int inputPosition)
357 return testChar == input.readChecked(inputPosition);
360 bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
362 int ch = input.readChecked(inputPosition);
363 return (loChar == ch) || (hiChar == ch);
366 bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
368 bool match = testCharacterClass(characterClass, input.readChecked(inputPosition));
369 return invert ? !match : match;
372 bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
374 int matchSize = matchEnd - matchBegin;
376 if (!input.checkInput(matchSize))
379 if (pattern->m_ignoreCase) {
380 for (int i = 0; i < matchSize; ++i) {
381 int ch = input.reread(matchBegin + i);
383 int lo = Unicode::toLower(ch);
384 int hi = Unicode::toUpper(ch);
386 if ((lo != hi) ? (!checkCasedCharacter(lo, hi, inputOffset - matchSize + i)) : (!checkCharacter(ch, inputOffset - matchSize + i))) {
387 input.uncheckInput(matchSize);
392 for (int i = 0; i < matchSize; ++i) {
393 if (!checkCharacter(input.reread(matchBegin + i), inputOffset - matchSize + i)) {
394 input.uncheckInput(matchSize);
403 bool matchAssertionBOL(ByteTerm& term)
405 return (input.atStart(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition - 1)));
408 bool matchAssertionEOL(ByteTerm& term)
410 if (term.inputPosition)
411 return (input.atEnd(term.inputPosition)) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
413 return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(pattern->newlineCharacterClass, input.read()));
416 bool matchAssertionWordBoundary(ByteTerm& term)
418 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition - 1));
420 if (term.inputPosition)
421 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
423 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
425 bool wordBoundary = prevIsWordchar != readIsWordchar;
426 return term.invert() ? !wordBoundary : wordBoundary;
429 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
431 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
433 switch (term.atom.quantityType) {
434 case QuantifierFixedCount:
437 case QuantifierGreedy:
438 if (backTrack->matchAmount) {
439 --backTrack->matchAmount;
440 input.uncheckInput(1);
445 case QuantifierNonGreedy:
446 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
447 ++backTrack->matchAmount;
448 if (checkCharacter(term.atom.patternCharacter, term.inputPosition - 1))
451 input.uncheckInput(backTrack->matchAmount);
458 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
460 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
462 switch (term.atom.quantityType) {
463 case QuantifierFixedCount:
466 case QuantifierGreedy:
467 if (backTrack->matchAmount) {
468 --backTrack->matchAmount;
469 input.uncheckInput(1);
474 case QuantifierNonGreedy:
475 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
476 ++backTrack->matchAmount;
477 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition - 1))
480 input.uncheckInput(backTrack->matchAmount);
487 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
489 ASSERT(term.type == ByteTerm::TypeCharacterClass);
490 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
492 switch (term.atom.quantityType) {
493 case QuantifierFixedCount: {
494 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
495 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + matchAmount))
501 case QuantifierGreedy: {
502 unsigned matchAmount = 0;
503 while ((matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
504 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1)) {
505 input.uncheckInput(1);
510 backTrack->matchAmount = matchAmount;
515 case QuantifierNonGreedy:
516 backTrack->matchAmount = 0;
520 ASSERT_NOT_REACHED();
524 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
526 ASSERT(term.type == ByteTerm::TypeCharacterClass);
527 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
529 switch (term.atom.quantityType) {
530 case QuantifierFixedCount:
533 case QuantifierGreedy:
534 if (backTrack->matchAmount) {
535 --backTrack->matchAmount;
536 input.uncheckInput(1);
541 case QuantifierNonGreedy:
542 if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(1)) {
543 ++backTrack->matchAmount;
544 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - 1))
547 input.uncheckInput(backTrack->matchAmount);
554 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
556 ASSERT(term.type == ByteTerm::TypeBackReference);
557 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
559 int matchBegin = output[(term.atom.subpatternId << 1)];
560 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
562 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
563 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
568 ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
570 if (matchBegin == matchEnd)
573 switch (term.atom.quantityType) {
574 case QuantifierFixedCount: {
575 backTrack->begin = input.getPos();
576 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) {
577 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
578 input.setPos(backTrack->begin);
585 case QuantifierGreedy: {
586 unsigned matchAmount = 0;
587 while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
589 backTrack->matchAmount = matchAmount;
593 case QuantifierNonGreedy:
594 backTrack->begin = input.getPos();
595 backTrack->matchAmount = 0;
599 ASSERT_NOT_REACHED();
603 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
605 ASSERT(term.type == ByteTerm::TypeBackReference);
606 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
608 int matchBegin = output[(term.atom.subpatternId << 1)];
609 int matchEnd = output[(term.atom.subpatternId << 1) + 1];
610 ASSERT((matchBegin == -1) || (matchBegin <= matchEnd));
612 if (matchBegin == matchEnd)
615 switch (term.atom.quantityType) {
616 case QuantifierFixedCount:
617 // for quantityCount == 1, could rewind.
618 input.setPos(backTrack->begin);
621 case QuantifierGreedy:
622 if (backTrack->matchAmount) {
623 --backTrack->matchAmount;
624 input.rewind(matchEnd - matchBegin);
629 case QuantifierNonGreedy:
630 if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
631 ++backTrack->matchAmount;
634 input.setPos(backTrack->begin);
641 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
643 if (term.capture()) {
644 unsigned subpatternId = term.atom.subpatternId;
645 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
646 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
649 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
651 unsigned firstSubpatternId = term.atom.subpatternId;
652 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
653 context->restoreOutput(output, firstSubpatternId, count);
655 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
657 while (backTrack->matchAmount) {
658 ParenthesesDisjunctionContext* context = backTrack->lastContext;
660 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
661 if (result == JSRegExpMatch)
662 return JSRegExpMatch;
664 resetMatches(term, context);
665 popParenthesesDisjunctionContext(backTrack);
666 freeParenthesesDisjunctionContext(context);
668 if (result != JSRegExpNoMatch)
672 return JSRegExpNoMatch;
675 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
677 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
678 ASSERT(term.atom.quantityCount == 1);
680 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
682 switch (term.atom.quantityType) {
683 case QuantifierGreedy: {
684 // set this speculatively; if we get to the parens end this will be true.
685 backTrack->begin = input.getPos();
688 case QuantifierNonGreedy: {
689 backTrack->begin = notFound;
690 context->term += term.atom.parenthesesWidth;
693 case QuantifierFixedCount:
697 if (term.capture()) {
698 unsigned subpatternId = term.atom.subpatternId;
699 output[(subpatternId << 1)] = input.getPos() + term.inputPosition;
705 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
707 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
708 ASSERT(term.atom.quantityCount == 1);
710 if (term.capture()) {
711 unsigned subpatternId = term.atom.subpatternId;
712 output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition;
715 if (term.atom.quantityType == QuantifierFixedCount)
718 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
719 return backTrack->begin != input.getPos();
722 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
724 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
725 ASSERT(term.atom.quantityCount == 1);
727 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
729 if (term.capture()) {
730 unsigned subpatternId = term.atom.subpatternId;
731 output[(subpatternId << 1)] = -1;
732 output[(subpatternId << 1) + 1] = -1;
735 switch (term.atom.quantityType) {
736 case QuantifierGreedy:
737 // if we backtrack to this point, there is another chance - try matching nothing.
738 ASSERT(backTrack->begin != notFound);
739 backTrack->begin = notFound;
740 context->term += term.atom.parenthesesWidth;
742 case QuantifierNonGreedy:
743 ASSERT(backTrack->begin != notFound);
744 case QuantifierFixedCount:
751 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
753 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
754 ASSERT(term.atom.quantityCount == 1);
756 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
758 switch (term.atom.quantityType) {
759 case QuantifierGreedy:
760 if (backTrack->begin == notFound) {
761 context->term -= term.atom.parenthesesWidth;
764 case QuantifierNonGreedy:
765 if (backTrack->begin == notFound) {
766 backTrack->begin = input.getPos();
767 if (term.capture()) {
768 // Technically this access to inputPosition should be accessing the begin term's
769 // inputPosition, but for repeats other than fixed these values should be
770 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
771 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
772 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
773 unsigned subpatternId = term.atom.subpatternId;
774 output[subpatternId << 1] = input.getPos() + term.inputPosition;
776 context->term -= term.atom.parenthesesWidth;
779 case QuantifierFixedCount:
786 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
788 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
789 ASSERT(term.atom.quantityType == QuantifierGreedy);
790 ASSERT(term.atom.quantityCount == quantifyInfinite);
791 ASSERT(!term.capture());
793 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
794 backTrack->begin = input.getPos();
798 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
800 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
802 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
803 // Empty match is a failed match.
804 if (backTrack->begin == input.getPos())
807 // Successful match! Okay, what's next? - loop around and try to match moar!
808 context->term -= (term.atom.parenthesesWidth + 1);
812 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
814 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
815 ASSERT(term.atom.quantityType == QuantifierGreedy);
816 ASSERT(term.atom.quantityCount == quantifyInfinite);
817 ASSERT(!term.capture());
819 // If we backtrack to this point, we have failed to match this iteration of the parens.
820 // Since this is greedy / zero minimum a failed is also accepted as a match!
821 context->term += term.atom.parenthesesWidth;
825 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
827 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
828 // should always be returned as a successful match - we should never backtrack to here.
829 ASSERT_NOT_REACHED();
833 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
835 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
836 ASSERT(term.atom.quantityCount == 1);
838 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
840 backTrack->begin = input.getPos();
844 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
846 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
847 ASSERT(term.atom.quantityCount == 1);
849 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
851 input.setPos(backTrack->begin);
853 // We've reached the end of the parens; if they are inverted, this is failure.
855 context->term -= term.atom.parenthesesWidth;
862 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
864 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
865 ASSERT(term.atom.quantityCount == 1);
867 // We've failed to match parens; if they are inverted, this is win!
869 context->term += term.atom.parenthesesWidth;
876 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
878 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
879 ASSERT(term.atom.quantityCount == 1);
881 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
883 input.setPos(backTrack->begin);
885 context->term -= term.atom.parenthesesWidth;
889 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
891 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
893 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
894 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
896 backTrack->matchAmount = 0;
897 backTrack->lastContext = 0;
899 switch (term.atom.quantityType) {
900 case QuantifierFixedCount: {
901 // While we haven't yet reached our fixed limit,
902 while (backTrack->matchAmount < term.atom.quantityCount) {
903 // Try to do a match, and it it succeeds, add it to the list.
904 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
905 JSRegExpResult result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
906 if (result == JSRegExpMatch)
907 appendParenthesesDisjunctionContext(backTrack, context);
909 // The match failed; try to find an alternate point to carry on from.
910 resetMatches(term, context);
911 freeParenthesesDisjunctionContext(context);
913 if (result != JSRegExpNoMatch)
915 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
916 if (backtrackResult != JSRegExpMatch)
917 return backtrackResult;
921 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
922 ParenthesesDisjunctionContext* context = backTrack->lastContext;
923 recordParenthesesMatch(term, context);
924 return JSRegExpMatch;
927 case QuantifierGreedy: {
928 while (backTrack->matchAmount < term.atom.quantityCount) {
929 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
930 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
931 if (result == JSRegExpMatch)
932 appendParenthesesDisjunctionContext(backTrack, context);
934 resetMatches(term, context);
935 freeParenthesesDisjunctionContext(context);
937 if (result != JSRegExpNoMatch)
944 if (backTrack->matchAmount) {
945 ParenthesesDisjunctionContext* context = backTrack->lastContext;
946 recordParenthesesMatch(term, context);
948 return JSRegExpMatch;
951 case QuantifierNonGreedy:
952 return JSRegExpMatch;
955 ASSERT_NOT_REACHED();
956 return JSRegExpErrorNoMatch;
959 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
961 // Greedy matches never should try just adding more - you should already have done
962 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
963 // you backtrack an item off the list needs checking, since we'll never have matched
964 // the one less case. Tracking forwards, still add as much as possible.
966 // Non-greedy, we've already done the one less case, so don't match on popping.
967 // We haven't done the one more case, so always try to add that.
969 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
971 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
973 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
974 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
976 switch (term.atom.quantityType) {
977 case QuantifierFixedCount: {
978 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
980 ParenthesesDisjunctionContext* context = 0;
981 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
983 if (result != JSRegExpMatch)
986 // While we haven't yet reached our fixed limit,
987 while (backTrack->matchAmount < term.atom.quantityCount) {
988 // Try to do a match, and it it succeeds, add it to the list.
989 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
990 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
992 if (result == JSRegExpMatch)
993 appendParenthesesDisjunctionContext(backTrack, context);
995 // The match failed; try to find an alternate point to carry on from.
996 resetMatches(term, context);
997 freeParenthesesDisjunctionContext(context);
999 if (result != JSRegExpNoMatch)
1001 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1002 if (backtrackResult != JSRegExpMatch)
1003 return backtrackResult;
1007 ASSERT(backTrack->matchAmount == term.atom.quantityCount);
1008 context = backTrack->lastContext;
1009 recordParenthesesMatch(term, context);
1010 return JSRegExpMatch;
1013 case QuantifierGreedy: {
1014 if (!backTrack->matchAmount)
1015 return JSRegExpNoMatch;
1017 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1018 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1019 if (result == JSRegExpMatch) {
1020 while (backTrack->matchAmount < term.atom.quantityCount) {
1021 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1022 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1023 if (parenthesesResult == JSRegExpMatch)
1024 appendParenthesesDisjunctionContext(backTrack, context);
1026 resetMatches(term, context);
1027 freeParenthesesDisjunctionContext(context);
1029 if (parenthesesResult != JSRegExpNoMatch)
1030 return parenthesesResult;
1036 resetMatches(term, context);
1037 popParenthesesDisjunctionContext(backTrack);
1038 freeParenthesesDisjunctionContext(context);
1040 if (result != JSRegExpNoMatch)
1044 if (backTrack->matchAmount) {
1045 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1046 recordParenthesesMatch(term, context);
1048 return JSRegExpMatch;
1051 case QuantifierNonGreedy: {
1052 // If we've not reached the limit, try to add one more match.
1053 if (backTrack->matchAmount < term.atom.quantityCount) {
1054 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1055 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1056 if (result == JSRegExpMatch) {
1057 appendParenthesesDisjunctionContext(backTrack, context);
1058 recordParenthesesMatch(term, context);
1059 return JSRegExpMatch;
1062 resetMatches(term, context);
1063 freeParenthesesDisjunctionContext(context);
1065 if (result != JSRegExpNoMatch)
1069 // Nope - okay backtrack looking for an alternative.
1070 while (backTrack->matchAmount) {
1071 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1072 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1073 if (result == JSRegExpMatch) {
1074 // successful backtrack! we're back in the game!
1075 if (backTrack->matchAmount) {
1076 context = backTrack->lastContext;
1077 recordParenthesesMatch(term, context);
1079 return JSRegExpMatch;
1082 // pop a match off the stack
1083 resetMatches(term, context);
1084 popParenthesesDisjunctionContext(backTrack);
1085 freeParenthesesDisjunctionContext(context);
1087 if (result != JSRegExpNoMatch)
1091 return JSRegExpNoMatch;
1095 ASSERT_NOT_REACHED();
1096 return JSRegExpErrorNoMatch;
1099 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1102 unsigned matchBegin = context->matchBegin;
1105 for (matchBegin--; true; matchBegin--) {
1106 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1116 unsigned matchEnd = input.getPos();
1118 for (; (matchEnd != input.end())
1119 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1121 if (((matchBegin && term.anchors.m_bol)
1122 || ((matchEnd != input.end()) && term.anchors.m_eol))
1123 && !pattern->m_multiline)
1126 context->matchBegin = matchBegin;
1127 context->matchEnd = matchEnd;
1131 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
1132 #define BACKTRACK() { --context->term; goto backtrack; }
1133 #define currentTerm() (disjunction->terms[context->term])
1134 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1136 if (!--remainingMatchCount)
1137 return JSRegExpErrorHitLimit;
1142 context->matchBegin = input.getPos();
1146 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1148 switch (currentTerm().type) {
1149 case ByteTerm::TypeSubpatternBegin:
1151 case ByteTerm::TypeSubpatternEnd:
1152 context->matchEnd = input.getPos();
1153 return JSRegExpMatch;
1155 case ByteTerm::TypeBodyAlternativeBegin:
1157 case ByteTerm::TypeBodyAlternativeDisjunction:
1158 case ByteTerm::TypeBodyAlternativeEnd:
1159 context->matchEnd = input.getPos();
1160 return JSRegExpMatch;
1162 case ByteTerm::TypeAlternativeBegin:
1164 case ByteTerm::TypeAlternativeDisjunction:
1165 case ByteTerm::TypeAlternativeEnd: {
1166 int offset = currentTerm().alternative.end;
1167 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1168 backTrack->offset = offset;
1169 context->term += offset;
1173 case ByteTerm::TypeAssertionBOL:
1174 if (matchAssertionBOL(currentTerm()))
1177 case ByteTerm::TypeAssertionEOL:
1178 if (matchAssertionEOL(currentTerm()))
1181 case ByteTerm::TypeAssertionWordBoundary:
1182 if (matchAssertionWordBoundary(currentTerm()))
1186 case ByteTerm::TypePatternCharacterOnce:
1187 case ByteTerm::TypePatternCharacterFixed: {
1188 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1189 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1194 case ByteTerm::TypePatternCharacterGreedy: {
1195 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1196 unsigned matchAmount = 0;
1197 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1198 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) {
1199 input.uncheckInput(1);
1204 backTrack->matchAmount = matchAmount;
1208 case ByteTerm::TypePatternCharacterNonGreedy: {
1209 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1210 backTrack->matchAmount = 0;
1214 case ByteTerm::TypePatternCasedCharacterOnce:
1215 case ByteTerm::TypePatternCasedCharacterFixed: {
1216 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1217 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1222 case ByteTerm::TypePatternCasedCharacterGreedy: {
1223 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1224 unsigned matchAmount = 0;
1225 while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(1)) {
1226 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) {
1227 input.uncheckInput(1);
1232 backTrack->matchAmount = matchAmount;
1236 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1237 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1238 backTrack->matchAmount = 0;
1242 case ByteTerm::TypeCharacterClass:
1243 if (matchCharacterClass(currentTerm(), context))
1246 case ByteTerm::TypeBackReference:
1247 if (matchBackReference(currentTerm(), context))
1250 case ByteTerm::TypeParenthesesSubpattern: {
1251 JSRegExpResult result = matchParentheses(currentTerm(), context);
1253 if (result == JSRegExpMatch) {
1255 } else if (result != JSRegExpNoMatch)
1260 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1261 if (matchParenthesesOnceBegin(currentTerm(), context))
1264 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1265 if (matchParenthesesOnceEnd(currentTerm(), context))
1268 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1269 if (matchParenthesesTerminalBegin(currentTerm(), context))
1272 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1273 if (matchParenthesesTerminalEnd(currentTerm(), context))
1276 case ByteTerm::TypeParentheticalAssertionBegin:
1277 if (matchParentheticalAssertionBegin(currentTerm(), context))
1280 case ByteTerm::TypeParentheticalAssertionEnd:
1281 if (matchParentheticalAssertionEnd(currentTerm(), context))
1285 case ByteTerm::TypeCheckInput:
1286 if (input.checkInput(currentTerm().checkInputCount))
1290 case ByteTerm::TypeUncheckInput:
1291 input.uncheckInput(currentTerm().checkInputCount);
1294 case ByteTerm::TypeDotStarEnclosure:
1295 if (matchDotStarEnclosure(currentTerm(), context))
1296 return JSRegExpMatch;
1300 // We should never fall-through to here.
1301 ASSERT_NOT_REACHED();
1304 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1306 switch (currentTerm().type) {
1307 case ByteTerm::TypeSubpatternBegin:
1308 return JSRegExpNoMatch;
1309 case ByteTerm::TypeSubpatternEnd:
1310 ASSERT_NOT_REACHED();
1312 case ByteTerm::TypeBodyAlternativeBegin:
1313 case ByteTerm::TypeBodyAlternativeDisjunction: {
1314 int offset = currentTerm().alternative.next;
1315 context->term += offset;
1320 return JSRegExpNoMatch;
1324 context->matchBegin = input.getPos();
1326 if (currentTerm().alternative.onceThrough)
1327 context->term += currentTerm().alternative.next;
1331 case ByteTerm::TypeBodyAlternativeEnd:
1332 ASSERT_NOT_REACHED();
1334 case ByteTerm::TypeAlternativeBegin:
1335 case ByteTerm::TypeAlternativeDisjunction: {
1336 int offset = currentTerm().alternative.next;
1337 context->term += offset;
1342 case ByteTerm::TypeAlternativeEnd: {
1343 // We should never backtrack back into an alternative of the main body of the regex.
1344 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1345 unsigned offset = backTrack->offset;
1346 context->term -= offset;
1350 case ByteTerm::TypeAssertionBOL:
1351 case ByteTerm::TypeAssertionEOL:
1352 case ByteTerm::TypeAssertionWordBoundary:
1355 case ByteTerm::TypePatternCharacterOnce:
1356 case ByteTerm::TypePatternCharacterFixed:
1357 case ByteTerm::TypePatternCharacterGreedy:
1358 case ByteTerm::TypePatternCharacterNonGreedy:
1359 if (backtrackPatternCharacter(currentTerm(), context))
1362 case ByteTerm::TypePatternCasedCharacterOnce:
1363 case ByteTerm::TypePatternCasedCharacterFixed:
1364 case ByteTerm::TypePatternCasedCharacterGreedy:
1365 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1366 if (backtrackPatternCasedCharacter(currentTerm(), context))
1369 case ByteTerm::TypeCharacterClass:
1370 if (backtrackCharacterClass(currentTerm(), context))
1373 case ByteTerm::TypeBackReference:
1374 if (backtrackBackReference(currentTerm(), context))
1377 case ByteTerm::TypeParenthesesSubpattern: {
1378 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1380 if (result == JSRegExpMatch) {
1382 } else if (result != JSRegExpNoMatch)
1387 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1388 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1391 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1392 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1395 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1396 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1399 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1400 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1403 case ByteTerm::TypeParentheticalAssertionBegin:
1404 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1407 case ByteTerm::TypeParentheticalAssertionEnd:
1408 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1412 case ByteTerm::TypeCheckInput:
1413 input.uncheckInput(currentTerm().checkInputCount);
1416 case ByteTerm::TypeUncheckInput:
1417 input.checkInput(currentTerm().checkInputCount);
1420 case ByteTerm::TypeDotStarEnclosure:
1421 ASSERT_NOT_REACHED();
1424 ASSERT_NOT_REACHED();
1425 return JSRegExpErrorNoMatch;
1428 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1430 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1432 if (result == JSRegExpMatch) {
1433 while (context->matchBegin == context->matchEnd) {
1434 result = matchDisjunction(disjunction, context, true);
1435 if (result != JSRegExpMatch)
1438 return JSRegExpMatch;
1446 allocatorPool = pattern->m_allocator->startAllocator();
1450 for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i)
1453 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1455 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1456 if (result == JSRegExpMatch) {
1457 output[0] = context->matchBegin;
1458 output[1] = context->matchEnd;
1461 freeDisjunctionContext(context);
1463 pattern->m_allocator->stopAllocator();
1465 // RegExp.cpp currently expects all error to be converted to -1.
1466 ASSERT((result == JSRegExpMatch) == (output[0] != -1));
1470 Interpreter(BytecodePattern* pattern, int* output, const UString input, unsigned start, unsigned length)
1473 , input(input, start, length)
1475 , remainingMatchCount(matchLimit)
1480 BytecodePattern* pattern;
1483 BumpPointerPool* allocatorPool;
1484 unsigned remainingMatchCount;
1489 class ByteCompiler {
1490 struct ParenthesesStackEntry {
1492 unsigned savedAlternativeIndex;
1493 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1494 : beginTerm(beginTerm)
1495 , savedAlternativeIndex(savedAlternativeIndex)
1501 ByteCompiler(YarrPattern& pattern)
1502 : m_pattern(pattern)
1504 m_currentAlternativeIndex = 0;
1507 PassOwnPtr<BytecodePattern> compile(BumpPointerAllocator* allocator)
1509 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1510 emitDisjunction(m_pattern.m_body);
1513 return adoptPtr(new BytecodePattern(m_bodyDisjunction.release(), m_allParenthesesInfo, m_pattern, allocator));
1516 void checkInput(unsigned count)
1518 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1521 void uncheckInput(unsigned count)
1523 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1526 void assertionBOL(int inputPosition)
1528 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1531 void assertionEOL(int inputPosition)
1533 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1536 void assertionWordBoundary(bool invert, int inputPosition)
1538 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1541 void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1543 if (m_pattern.m_ignoreCase) {
1544 UChar lo = Unicode::toLower(ch);
1545 UChar hi = Unicode::toUpper(ch);
1548 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1553 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType));
1556 void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1558 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1560 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1561 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1562 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1565 void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1567 ASSERT(subpatternId);
1569 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1571 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount.unsafeGet();
1572 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1573 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1576 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1578 int beginTerm = m_bodyDisjunction->terms.size();
1580 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1581 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1582 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1583 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1585 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1586 m_currentAlternativeIndex = beginTerm + 1;
1589 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1591 int beginTerm = m_bodyDisjunction->terms.size();
1593 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1594 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1595 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1596 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1598 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1599 m_currentAlternativeIndex = beginTerm + 1;
1602 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1604 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1605 // then fix this up at the end! - simplifying this should make it much clearer.
1606 // https://bugs.webkit.org/show_bug.cgi?id=50136
1608 int beginTerm = m_bodyDisjunction->terms.size();
1610 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1611 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1612 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1613 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1615 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1616 m_currentAlternativeIndex = beginTerm + 1;
1619 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1621 int beginTerm = m_bodyDisjunction->terms.size();
1623 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1624 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1625 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1626 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1628 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1629 m_currentAlternativeIndex = beginTerm + 1;
1632 void atomParentheticalAssertionEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1634 unsigned beginTerm = popParenthesesStack();
1635 closeAlternative(beginTerm + 1);
1636 unsigned endTerm = m_bodyDisjunction->terms.size();
1638 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1640 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1641 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1643 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1644 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1645 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1646 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1648 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1649 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1650 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1651 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1654 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1656 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1659 unsigned popParenthesesStack()
1661 ASSERT(m_parenthesesStack.size());
1662 int stackEnd = m_parenthesesStack.size() - 1;
1663 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1664 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1665 m_parenthesesStack.shrink(stackEnd);
1667 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1668 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1674 void dumpDisjunction(ByteDisjunction* disjunction)
1676 printf("ByteDisjunction(%p):\n\t", disjunction);
1677 for (unsigned i = 0; i < disjunction->terms.size(); ++i)
1678 printf("{ %d } ", disjunction->terms[i].type);
1683 void closeAlternative(int beginTerm)
1685 int origBeginTerm = beginTerm;
1686 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1687 int endIndex = m_bodyDisjunction->terms.size();
1689 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1691 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1692 m_bodyDisjunction->terms.remove(beginTerm);
1694 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1695 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1696 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1697 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1698 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1701 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1703 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1704 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1708 void closeBodyAlternative()
1711 int origBeginTerm = 0;
1712 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1713 int endIndex = m_bodyDisjunction->terms.size();
1715 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1717 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1718 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1719 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1720 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1721 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1724 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1726 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1727 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1730 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1732 unsigned beginTerm = popParenthesesStack();
1733 closeAlternative(beginTerm + 1);
1734 unsigned endTerm = m_bodyDisjunction->terms.size();
1736 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1738 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1740 bool capture = parenthesesBegin.capture();
1741 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1743 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1744 ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize);
1746 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1747 for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses)
1748 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1749 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1751 m_bodyDisjunction->terms.shrink(beginTerm);
1753 m_allParenthesesInfo.append(parenthesesDisjunction);
1754 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, inputPosition));
1756 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1757 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1758 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1761 void atomParenthesesOnceEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1763 unsigned beginTerm = popParenthesesStack();
1764 closeAlternative(beginTerm + 1);
1765 unsigned endTerm = m_bodyDisjunction->terms.size();
1767 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1769 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1770 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1772 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1773 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1774 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1775 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1777 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1778 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1779 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1780 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1783 void atomParenthesesTerminalEnd(int inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
1785 unsigned beginTerm = popParenthesesStack();
1786 closeAlternative(beginTerm + 1);
1787 unsigned endTerm = m_bodyDisjunction->terms.size();
1789 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1791 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1792 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1794 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1795 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1796 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1797 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1799 m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount.unsafeGet();
1800 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1801 m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount.unsafeGet();
1802 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1805 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1807 m_bodyDisjunction = adoptPtr(new ByteDisjunction(numSubpatterns, callFrameSize));
1808 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
1809 m_bodyDisjunction->terms[0].frameLocation = 0;
1810 m_currentAlternativeIndex = 0;
1815 closeBodyAlternative();
1818 void alternativeBodyDisjunction(bool onceThrough)
1820 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1821 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1822 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
1824 m_currentAlternativeIndex = newAlternativeIndex;
1827 void alternativeDisjunction()
1829 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1830 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1831 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
1833 m_currentAlternativeIndex = newAlternativeIndex;
1836 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
1838 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
1839 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1841 PatternAlternative* alternative = disjunction->m_alternatives[alt];
1844 if (disjunction == m_pattern.m_body)
1845 alternativeBodyDisjunction(alternative->onceThrough());
1847 alternativeDisjunction();
1850 unsigned minimumSize = alternative->m_minimumSize;
1851 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1852 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1855 checkInput(countToCheck);
1856 currentCountAlreadyChecked += countToCheck;
1859 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
1860 PatternTerm& term = alternative->m_terms[i];
1862 switch (term.type) {
1863 case PatternTerm::TypeAssertionBOL:
1864 assertionBOL(term.inputPosition - currentCountAlreadyChecked);
1867 case PatternTerm::TypeAssertionEOL:
1868 assertionEOL(term.inputPosition - currentCountAlreadyChecked);
1871 case PatternTerm::TypeAssertionWordBoundary:
1872 assertionWordBoundary(term.invert(), term.inputPosition - currentCountAlreadyChecked);
1875 case PatternTerm::TypePatternCharacter:
1876 atomPatternCharacter(term.patternCharacter, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1879 case PatternTerm::TypeCharacterClass:
1880 atomCharacterClass(term.characterClass, term.invert(), term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1883 case PatternTerm::TypeBackReference:
1884 atomBackReference(term.backReferenceSubpatternId, term.inputPosition - currentCountAlreadyChecked, term.frameLocation, term.quantityCount, term.quantityType);
1887 case PatternTerm::TypeForwardReference:
1890 case PatternTerm::TypeParenthesesSubpattern: {
1891 unsigned disjunctionAlreadyCheckedCount = 0;
1892 if (term.quantityCount == 1 && !term.parentheses.isCopy) {
1893 unsigned alternativeFrameLocation = term.frameLocation;
1894 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
1895 if (term.quantityType == QuantifierFixedCount)
1896 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1898 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1899 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1900 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, alternativeFrameLocation);
1901 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1902 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1903 } else if (term.parentheses.isTerminal) {
1904 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1905 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesOnce);
1906 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount);
1907 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType);
1909 unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1910 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), delegateEndInputOffset - disjunctionAlreadyCheckedCount, term.frameLocation, 0);
1911 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0);
1912 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
1917 case PatternTerm::TypeParentheticalAssertion: {
1918 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
1920 ASSERT(currentCountAlreadyChecked >= static_cast<unsigned>(term.inputPosition));
1921 unsigned positiveInputOffset = currentCountAlreadyChecked - static_cast<unsigned>(term.inputPosition);
1922 unsigned uncheckAmount = 0;
1923 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
1924 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
1925 uncheckInput(uncheckAmount);
1926 currentCountAlreadyChecked -= uncheckAmount;
1929 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
1930 emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount);
1931 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityCount, term.quantityType);
1932 if (uncheckAmount) {
1933 checkInput(uncheckAmount);
1934 currentCountAlreadyChecked += uncheckAmount;
1939 case PatternTerm::TypeDotStarEnclosure:
1940 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
1948 YarrPattern& m_pattern;
1949 OwnPtr<ByteDisjunction> m_bodyDisjunction;
1950 unsigned m_currentAlternativeIndex;
1951 Vector<ParenthesesStackEntry> m_parenthesesStack;
1952 Vector<ByteDisjunction*> m_allParenthesesInfo;
1955 PassOwnPtr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator)
1957 return ByteCompiler(pattern).compile(allocator);
1960 int interpret(BytecodePattern* bytecode, const UString& input, unsigned start, unsigned length, int* output)
1962 return Interpreter(bytecode, output, input, start, length).interpret();
1965 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
1966 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
1967 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
1968 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
1969 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
1970 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
1971 COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);