1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **************************************************************************
5 * Copyright (C) 2002-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 **************************************************************************
12 // Contains the implementation of class RegexMatcher,
13 // which is one of the main API classes for the ICU regular expression package.
16 #include "unicode/utypes.h"
17 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
19 #include "unicode/regex.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/ustring.h"
23 #include "unicode/rbbi.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
37 // #include <malloc.h> // Needed for heapcheck testing
42 // Default limit for the size of the back track stack, to avoid system
43 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
44 // This value puts ICU's limits higher than most other regexp implementations,
45 // which use recursion rather than the heap, and take more storage per
48 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
50 // Time limit counter constant.
51 // Time limits for expression evaluation are in terms of quanta of work by
52 // the engine, each of which is 10,000 state saves.
53 // This constant determines that state saves per tick number.
54 static const int32_t TIMER_INITIAL_VALUE = 10000;
57 // Test for any of the Unicode line terminating characters.
58 static inline UBool isLineTerminator(UChar32 c) {
59 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
62 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
65 //-----------------------------------------------------------------------------
67 // Constructor and Destructor
69 //-----------------------------------------------------------------------------
70 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
71 fDeferredStatus = U_ZERO_ERROR;
72 init(fDeferredStatus);
73 if (U_FAILURE(fDeferredStatus)) {
77 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
81 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
86 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
87 uint32_t flags, UErrorCode &status) {
89 if (U_FAILURE(status)) {
93 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
94 fPattern = fPatternOwned;
96 UText inputText = UTEXT_INITIALIZER;
97 utext_openConstUnicodeString(&inputText, &input, &status);
98 init2(&inputText, status);
99 utext_close(&inputText);
101 fInputUniStrMaybeMutable = TRUE;
105 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106 uint32_t flags, UErrorCode &status) {
108 if (U_FAILURE(status)) {
112 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
113 if (U_FAILURE(status)) {
117 fPattern = fPatternOwned;
118 init2(input, status);
122 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
123 uint32_t flags, UErrorCode &status) {
125 if (U_FAILURE(status)) {
129 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
130 if (U_FAILURE(status)) {
133 fPattern = fPatternOwned;
134 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
137 RegexMatcher::RegexMatcher(UText *regexp,
138 uint32_t flags, UErrorCode &status) {
140 if (U_FAILURE(status)) {
144 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
145 if (U_FAILURE(status)) {
149 fPattern = fPatternOwned;
150 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
156 RegexMatcher::~RegexMatcher() {
158 if (fData != fSmallData) {
163 delete fPatternOwned;
164 fPatternOwned = NULL;
172 utext_close(fInputText);
175 utext_close(fAltInputText);
178 #if UCONFIG_NO_BREAK_ITERATION==0
179 delete fWordBreakItr;
184 // init() common initialization for use by all constructors.
185 // Initialize all fields, get the object into a consistent state.
186 // This must be done even when the initial status shows an error,
187 // so that the object is initialized sufficiently well for the destructor
190 void RegexMatcher::init(UErrorCode &status) {
192 fPatternOwned = NULL;
202 fTransparentBounds = FALSE;
203 fAnchoringBounds = TRUE;
216 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
218 fCallbackContext = NULL;
219 fFindProgressCallbackFn = NULL;
220 fFindProgressCallbackContext = NULL;
222 fDeferredStatus = status;
224 fWordBreakItr = NULL;
228 fAltInputText = NULL;
231 fInputUniStrMaybeMutable = FALSE;
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
244 if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
267 static const UChar BACKSLASH = 0x5c;
268 static const UChar DOLLARSIGN = 0x24;
269 static const UChar LEFTBRACKET = 0x7b;
270 static const UChar RIGHTBRACKET = 0x7d;
272 //--------------------------------------------------------------------------------
276 //--------------------------------------------------------------------------------
277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
291 utext_close(&replacementText);
298 // appendReplacement, UText mode
300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
339 fAppendPosition = fMatchEnd;
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
450 if (u_isdigit(nextChar) == FALSE) {
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
481 //--------------------------------------------------------------------------------
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
487 // Note: Match ranges do not affect appendTail or appendReplacement
489 //--------------------------------------------------------------------------------
490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
504 // appendTail, UText mode
506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
545 //--------------------------------------------------------------------------------
549 //--------------------------------------------------------------------------------
550 int32_t RegexMatcher::end(UErrorCode &err) const {
554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
589 //--------------------------------------------------------------------------------
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
598 //--------------------------------------------------------------------------------
599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
607 //--------------------------------------------------------------------------------
611 //--------------------------------------------------------------------------------
612 UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
621 //--------------------------------------------------------------------------------
625 //--------------------------------------------------------------------------------
626 UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
630 if (U_FAILURE(status)) {
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
642 int64_t startPos = fMatchEnd;
644 startPos = fActiveStart;
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
693 U_ASSERT(startPos >= 0);
695 switch (fPattern->fStartType) {
697 // No optimization was found.
698 // Try a match at each input position.
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
707 if (startPos >= testStartLimit) {
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
759 if (startPos > testStartLimit) {
764 if (findProgressInterrupt(startPos, status))
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
789 UTEXT_SETNATIVEINDEX(fInputText, startPos);
791 if (startPos > testStartLimit) {
796 if (findProgressInterrupt(startPos, status))
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 c = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 c = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
834 if (startPos >= testStartLimit) {
839 c = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
849 if (isLineTerminator(c)) {
850 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
863 if (startPos >= testStartLimit) {
868 c = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890 if (U_FAILURE(status)) {
893 if (U_FAILURE(fDeferredStatus)) {
894 status = fDeferredStatus;
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
900 status = U_INDEX_OUTOFBOUNDS_ERROR;
904 int64_t nativeStart = start;
905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
909 fMatchEnd = nativeStart;
914 //--------------------------------------------------------------------------------
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
919 //--------------------------------------------------------------------------------
920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.
925 int32_t startPos = (int32_t)fMatchEnd;
927 startPos = (int32_t)fActiveStart;
930 const UChar *inputBuf = fInputText->chunkContents;
933 // Save the position of any previous successful match.
934 fLastMatchEnd = fMatchEnd;
936 if (fMatchStart == fMatchEnd) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos >= fActiveLimit) {
944 U16_FWD_1(inputBuf, startPos, fInputLength);
947 if (fLastMatchEnd >= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963 if (startPos > testLen) {
970 U_ASSERT(startPos >= 0);
972 switch (fPattern->fStartType) {
974 // No optimization was found.
975 // Try a match at each input position.
977 MatchChunkAt(startPos, FALSE, status);
978 if (U_FAILURE(status)) {
984 if (startPos >= testLen) {
988 U16_FWD_1(inputBuf, startPos, fActiveLimit);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos, status))
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos > fActiveStart) {
1004 MatchChunkAt(startPos, FALSE, status);
1005 if (U_FAILURE(status)) {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern->fMinMatchLen > 0);
1016 int32_t pos = startPos;
1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019 (c>=256 && fPattern->fInitialChars->contains(c))) {
1020 MatchChunkAt(pos, FALSE, status);
1021 if (U_FAILURE(status)) {
1028 if (startPos > testLen) {
1033 if (findProgressInterrupt(startPos, status))
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern->fMinMatchLen > 0);
1044 UChar32 theChar = fPattern->fInitialChar;
1046 int32_t pos = startPos;
1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1049 MatchChunkAt(pos, FALSE, status);
1050 if (U_FAILURE(status)) {
1057 if (startPos > testLen) {
1062 if (findProgressInterrupt(startPos, status))
1071 if (startPos == fAnchorStart) {
1072 MatchChunkAt(startPos, FALSE, status);
1073 if (U_FAILURE(status)) {
1079 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1082 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1084 c = inputBuf[startPos-1];
1086 MatchChunkAt(startPos, FALSE, status);
1087 if (U_FAILURE(status)) {
1094 if (startPos >= testLen) {
1099 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1100 // Note that it's perfectly OK for a pattern to have a zero-length
1101 // match at the end of a string, so we must make sure that the loop
1102 // runs with startPos == testLen the last time through.
1103 if (findProgressInterrupt(startPos, status))
1108 c = inputBuf[startPos-1];
1109 if (isLineTerminator(c)) {
1110 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1113 MatchChunkAt(startPos, FALSE, status);
1114 if (U_FAILURE(status)) {
1121 if (startPos >= testLen) {
1126 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1127 // Note that it's perfectly OK for a pattern to have a zero-length
1128 // match at the end of a string, so we must make sure that the loop
1129 // runs with startPos == testLen the last time through.
1130 if (findProgressInterrupt(startPos, status))
1146 //--------------------------------------------------------------------------------
1150 //--------------------------------------------------------------------------------
1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152 return group(0, status);
1155 // Return immutable shallow clone
1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157 return group(0, dest, group_len, status);
1160 // Return immutable shallow clone
1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1163 if (U_FAILURE(status)) {
1166 if (U_FAILURE(fDeferredStatus)) {
1167 status = fDeferredStatus;
1168 } else if (fMatch == FALSE) {
1169 status = U_REGEX_INVALID_STATE;
1170 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171 status = U_INDEX_OUTOFBOUNDS_ERROR;
1174 if (U_FAILURE(status)) {
1179 if (groupNum == 0) {
1183 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184 U_ASSERT(groupOffset < fPattern->fFrameSize);
1185 U_ASSERT(groupOffset >= 0);
1186 s = fFrame->fExtra[groupOffset];
1187 e = fFrame->fExtra[groupOffset+1];
1191 // A capture group wasn't part of the match
1192 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1197 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1199 UTEXT_SETNATIVEINDEX(dest, s);
1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204 UnicodeString result;
1205 int64_t groupStart = start64(groupNum, status);
1206 int64_t groupEnd = end64(groupNum, status);
1207 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1211 // Get the group length using a utext_extract preflight.
1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214 if (status != U_BUFFER_OVERFLOW_ERROR) {
1218 status = U_ZERO_ERROR;
1219 UChar *buf = result.getBuffer(length);
1221 status = U_MEMORY_ALLOCATION_ERROR;
1223 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224 result.releaseBuffer(extractLength);
1225 U_ASSERT(length == extractLength);
1231 //--------------------------------------------------------------------------------
1233 // appendGroup() -- currently internal only, appends a group to a UText rather
1234 // than replacing its contents
1236 //--------------------------------------------------------------------------------
1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239 if (U_FAILURE(status)) {
1242 if (U_FAILURE(fDeferredStatus)) {
1243 status = fDeferredStatus;
1246 int64_t destLen = utext_nativeLength(dest);
1248 if (fMatch == FALSE) {
1249 status = U_REGEX_INVALID_STATE;
1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1252 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253 status = U_INDEX_OUTOFBOUNDS_ERROR;
1254 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1258 if (groupNum == 0) {
1262 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263 U_ASSERT(groupOffset < fPattern->fFrameSize);
1264 U_ASSERT(groupOffset >= 0);
1265 s = fFrame->fExtra[groupOffset];
1266 e = fFrame->fExtra[groupOffset+1];
1270 // A capture group wasn't part of the match
1271 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277 U_ASSERT(e <= fInputLength);
1278 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1281 if (UTEXT_USES_U16(fInputText)) {
1282 len16 = (int32_t)(e-s);
1284 UErrorCode lengthStatus = U_ZERO_ERROR;
1285 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1287 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288 if (groupChars == NULL) {
1289 status = U_MEMORY_ALLOCATION_ERROR;
1292 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1294 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295 uprv_free(groupChars);
1302 //--------------------------------------------------------------------------------
1306 //--------------------------------------------------------------------------------
1307 int32_t RegexMatcher::groupCount() const {
1308 return fPattern->fGroupMap->size();
1311 //--------------------------------------------------------------------------------
1313 // hasAnchoringBounds()
1315 //--------------------------------------------------------------------------------
1316 UBool RegexMatcher::hasAnchoringBounds() const {
1317 return fAnchoringBounds;
1321 //--------------------------------------------------------------------------------
1323 // hasTransparentBounds()
1325 //--------------------------------------------------------------------------------
1326 UBool RegexMatcher::hasTransparentBounds() const {
1327 return fTransparentBounds;
1332 //--------------------------------------------------------------------------------
1336 //--------------------------------------------------------------------------------
1337 UBool RegexMatcher::hitEnd() const {
1342 //--------------------------------------------------------------------------------
1346 //--------------------------------------------------------------------------------
1347 const UnicodeString &RegexMatcher::input() const {
1349 UErrorCode status = U_ZERO_ERROR;
1351 if (UTEXT_USES_U16(fInputText)) {
1352 len16 = (int32_t)fInputLength;
1354 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355 status = U_ZERO_ERROR; // overflow, length status
1357 UnicodeString *result = new UnicodeString(len16, 0, 0);
1359 UChar *inputChars = result->getBuffer(len16);
1360 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361 result->releaseBuffer(len16);
1363 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1369 //--------------------------------------------------------------------------------
1373 //--------------------------------------------------------------------------------
1374 UText *RegexMatcher::inputText() const {
1379 //--------------------------------------------------------------------------------
1381 // getInput() -- like inputText(), but makes a clone or copies into another UText
1383 //--------------------------------------------------------------------------------
1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385 if (U_FAILURE(status)) {
1388 if (U_FAILURE(fDeferredStatus)) {
1389 status = fDeferredStatus;
1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1398 if (UTEXT_USES_U16(fInputText)) {
1399 input16Len = (int32_t)fInputLength;
1401 UErrorCode lengthStatus = U_ZERO_ERROR;
1402 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1404 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405 if (inputChars == NULL) {
1409 status = U_ZERO_ERROR;
1410 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411 status = U_ZERO_ERROR;
1412 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1414 uprv_free(inputChars);
1418 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1423 static UBool compat_SyncMutableUTextContents(UText *ut);
1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
1425 UBool retVal = FALSE;
1427 // In the following test, we're really only interested in whether the UText should switch
1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1429 // will still point to the correct data.
1430 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431 UnicodeString *us=(UnicodeString *)ut->context;
1433 // Update to the latest length.
1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435 int32_t newLength = us->length();
1437 // Update the chunk description.
1438 // The buffer may have switched between stack- and heap-based.
1439 ut->chunkContents = us->getBuffer();
1440 ut->chunkLength = newLength;
1441 ut->chunkNativeLimit = newLength;
1442 ut->nativeIndexingLimit = newLength;
1449 //--------------------------------------------------------------------------------
1453 //--------------------------------------------------------------------------------
1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455 if (U_FAILURE(status)) {
1458 if (U_FAILURE(fDeferredStatus)) {
1459 status = fDeferredStatus;
1463 if (fInputUniStrMaybeMutable) {
1464 if (compat_SyncMutableUTextContents(fInputText)) {
1465 fInputLength = utext_nativeLength(fInputText);
1470 resetPreserveRegion();
1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1475 MatchAt(fActiveStart, FALSE, status);
1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1492 status = U_INDEX_OUTOFBOUNDS_ERROR;
1496 if (fInputUniStrMaybeMutable) {
1497 if (compat_SyncMutableUTextContents(fInputText)) {
1498 fInputLength = utext_nativeLength(fInputText);
1503 int64_t nativeStart;
1504 nativeStart = start;
1505 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506 status = U_INDEX_OUTOFBOUNDS_ERROR;
1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1513 MatchAt(nativeStart, FALSE, status);
1520 //--------------------------------------------------------------------------------
1524 //--------------------------------------------------------------------------------
1525 UBool RegexMatcher::matches(UErrorCode &status) {
1526 if (U_FAILURE(status)) {
1529 if (U_FAILURE(fDeferredStatus)) {
1530 status = fDeferredStatus;
1534 if (fInputUniStrMaybeMutable) {
1535 if (compat_SyncMutableUTextContents(fInputText)) {
1536 fInputLength = utext_nativeLength(fInputText);
1541 resetPreserveRegion();
1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1547 MatchAt(fActiveStart, TRUE, status);
1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554 if (U_FAILURE(status)) {
1557 if (U_FAILURE(fDeferredStatus)) {
1558 status = fDeferredStatus;
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1568 if (fInputUniStrMaybeMutable) {
1569 if (compat_SyncMutableUTextContents(fInputText)) {
1570 fInputLength = utext_nativeLength(fInputText);
1575 int64_t nativeStart;
1576 nativeStart = start;
1577 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578 status = U_INDEX_OUTOFBOUNDS_ERROR;
1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1585 MatchAt(nativeStart, TRUE, status);
1592 //--------------------------------------------------------------------------------
1596 //--------------------------------------------------------------------------------
1597 const RegexPattern &RegexMatcher::pattern() const {
1603 //--------------------------------------------------------------------------------
1607 //--------------------------------------------------------------------------------
1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609 if (U_FAILURE(status)) {
1613 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614 status = U_ILLEGAL_ARGUMENT_ERROR;
1617 int64_t nativeStart = regionStart;
1618 int64_t nativeLimit = regionLimit;
1619 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620 status = U_ILLEGAL_ARGUMENT_ERROR;
1623 if (startIndex == -1)
1626 resetPreserveRegion();
1628 fRegionStart = nativeStart;
1629 fRegionLimit = nativeLimit;
1630 fActiveStart = nativeStart;
1631 fActiveLimit = nativeLimit;
1633 if (startIndex != -1) {
1634 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635 status = U_INDEX_OUTOFBOUNDS_ERROR;
1637 fMatchEnd = startIndex;
1640 if (!fTransparentBounds) {
1641 fLookStart = nativeStart;
1642 fLookLimit = nativeLimit;
1644 if (fAnchoringBounds) {
1645 fAnchorStart = nativeStart;
1646 fAnchorLimit = nativeLimit;
1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652 return region(start, limit, -1, status);
1655 //--------------------------------------------------------------------------------
1659 //--------------------------------------------------------------------------------
1660 int32_t RegexMatcher::regionEnd() const {
1661 return (int32_t)fRegionLimit;
1664 int64_t RegexMatcher::regionEnd64() const {
1665 return fRegionLimit;
1668 //--------------------------------------------------------------------------------
1672 //--------------------------------------------------------------------------------
1673 int32_t RegexMatcher::regionStart() const {
1674 return (int32_t)fRegionStart;
1677 int64_t RegexMatcher::regionStart64() const {
1678 return fRegionStart;
1682 //--------------------------------------------------------------------------------
1686 //--------------------------------------------------------------------------------
1687 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688 UText replacementText = UTEXT_INITIALIZER;
1689 UText resultText = UTEXT_INITIALIZER;
1690 UnicodeString resultString;
1691 if (U_FAILURE(status)) {
1692 return resultString;
1695 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696 utext_openUnicodeString(&resultText, &resultString, &status);
1698 replaceAll(&replacementText, &resultText, status);
1700 utext_close(&resultText);
1701 utext_close(&replacementText);
1703 return resultString;
1708 // replaceAll, UText mode
1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711 if (U_FAILURE(status)) {
1714 if (U_FAILURE(fDeferredStatus)) {
1715 status = fDeferredStatus;
1720 UnicodeString emptyString;
1721 UText empty = UTEXT_INITIALIZER;
1723 utext_openUnicodeString(&empty, &emptyString, &status);
1724 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725 utext_close(&empty);
1728 if (U_SUCCESS(status)) {
1731 appendReplacement(dest, replacement, status);
1732 if (U_FAILURE(status)) {
1736 appendTail(dest, status);
1743 //--------------------------------------------------------------------------------
1747 //--------------------------------------------------------------------------------
1748 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749 UText replacementText = UTEXT_INITIALIZER;
1750 UText resultText = UTEXT_INITIALIZER;
1751 UnicodeString resultString;
1753 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754 utext_openUnicodeString(&resultText, &resultString, &status);
1756 replaceFirst(&replacementText, &resultText, status);
1758 utext_close(&resultText);
1759 utext_close(&replacementText);
1761 return resultString;
1765 // replaceFirst, UText mode
1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768 if (U_FAILURE(status)) {
1771 if (U_FAILURE(fDeferredStatus)) {
1772 status = fDeferredStatus;
1778 return getInput(dest, status);
1782 UnicodeString emptyString;
1783 UText empty = UTEXT_INITIALIZER;
1785 utext_openUnicodeString(&empty, &emptyString, &status);
1786 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787 utext_close(&empty);
1790 appendReplacement(dest, replacement, status);
1791 appendTail(dest, status);
1797 //--------------------------------------------------------------------------------
1801 //--------------------------------------------------------------------------------
1802 UBool RegexMatcher::requireEnd() const {
1807 //--------------------------------------------------------------------------------
1811 //--------------------------------------------------------------------------------
1812 RegexMatcher &RegexMatcher::reset() {
1814 fRegionLimit = fInputLength;
1816 fActiveLimit = fInputLength;
1818 fAnchorLimit = fInputLength;
1820 fLookLimit = fInputLength;
1821 resetPreserveRegion();
1827 void RegexMatcher::resetPreserveRegion() {
1831 fAppendPosition = 0;
1834 fRequireEnd = FALSE;
1836 fTickCounter = TIMER_INITIAL_VALUE;
1837 //resetStack(); // more expensive than it looks...
1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843 if (fPattern->fNeedsAltInput) {
1844 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1846 if (U_FAILURE(fDeferredStatus)) {
1849 fInputLength = utext_nativeLength(fInputText);
1855 // Do the following for any UnicodeString.
1856 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1857 fInputUniStrMaybeMutable = TRUE;
1859 if (fWordBreakItr != NULL) {
1860 #if UCONFIG_NO_BREAK_ITERATION==0
1861 UErrorCode status = U_ZERO_ERROR;
1862 fWordBreakItr->setText(fInputText, status);
1869 RegexMatcher &RegexMatcher::reset(UText *input) {
1870 if (fInputText != input) {
1871 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1872 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1873 if (U_FAILURE(fDeferredStatus)) {
1876 fInputLength = utext_nativeLength(fInputText);
1881 if (fWordBreakItr != NULL) {
1882 #if UCONFIG_NO_BREAK_ITERATION==0
1883 UErrorCode status = U_ZERO_ERROR;
1884 fWordBreakItr->setText(input, status);
1889 fInputUniStrMaybeMutable = FALSE;
1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1900 if (U_FAILURE(status)) {
1903 reset(); // Reset also resets the region to be the entire string.
1905 if (position < 0 || position > fActiveLimit) {
1906 status = U_INDEX_OUTOFBOUNDS_ERROR;
1909 fMatchEnd = position;
1914 //--------------------------------------------------------------------------------
1918 //--------------------------------------------------------------------------------
1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1920 if (U_FAILURE(status)) {
1923 if (input == NULL) {
1924 status = U_ILLEGAL_ARGUMENT_ERROR;
1927 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1928 status = U_ILLEGAL_ARGUMENT_ERROR;
1931 int64_t pos = utext_getNativeIndex(fInputText);
1932 // Shallow read-only clone of the new UText into the existing input UText
1933 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1934 if (U_FAILURE(status)) {
1937 utext_setNativeIndex(fInputText, pos);
1939 if (fAltInputText != NULL) {
1940 pos = utext_getNativeIndex(fAltInputText);
1941 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1942 if (U_FAILURE(status)) {
1945 utext_setNativeIndex(fAltInputText, pos);
1952 //--------------------------------------------------------------------------------
1956 //--------------------------------------------------------------------------------
1957 void RegexMatcher::setTrace(UBool state) {
1958 fTraceDebug = state;
1964 * UText, replace entire contents of the destination UText with a substring of the source UText.
1966 * @param src The source UText
1967 * @param dest The destination UText. Must be writable.
1968 * May be NULL, in which case a new UText will be allocated.
1969 * @param start Start index of source substring.
1970 * @param limit Limit index of source substring.
1971 * @param status An error code.
1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1974 if (U_FAILURE(*status)) {
1977 if (start == limit) {
1979 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1982 return utext_openUChars(NULL, NULL, 0, status);
1985 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1986 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1989 *status = U_ZERO_ERROR;
1990 MaybeStackArray<UChar, 40> buffer;
1991 if (length >= buffer.getCapacity()) {
1992 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1993 if (newBuf == NULL) {
1994 *status = U_MEMORY_ALLOCATION_ERROR;
1997 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1999 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2003 // Caller did not provide a prexisting UText.
2004 // Open a new one, and have it adopt the text buffer storage.
2005 if (U_FAILURE(*status)) {
2008 int32_t ownedLength = 0;
2009 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2010 if (ownedBuf == NULL) {
2011 *status = U_MEMORY_ALLOCATION_ERROR;
2014 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2015 if (U_FAILURE(*status)) {
2016 uprv_free(ownedBuf);
2019 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2024 //---------------------------------------------------------------------
2028 //---------------------------------------------------------------------
2029 int32_t RegexMatcher::split(const UnicodeString &input,
2030 UnicodeString dest[],
2031 int32_t destCapacity,
2034 UText inputText = UTEXT_INITIALIZER;
2035 utext_openConstUnicodeString(&inputText, &input, &status);
2036 if (U_FAILURE(status)) {
2040 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2041 if (destText == NULL) {
2042 status = U_MEMORY_ALLOCATION_ERROR;
2046 for (i = 0; i < destCapacity; i++) {
2047 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2050 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2052 for (i = 0; i < destCapacity; i++) {
2053 utext_close(destText[i]);
2056 uprv_free(destText);
2057 utext_close(&inputText);
2062 // split, UText mode
2064 int32_t RegexMatcher::split(UText *input,
2066 int32_t destCapacity,
2070 // Check arguements for validity
2072 if (U_FAILURE(status)) {
2076 if (destCapacity < 1) {
2077 status = U_ILLEGAL_ARGUMENT_ERROR;
2082 // Reset for the input text
2085 int64_t nextOutputStringStart = 0;
2086 if (fActiveLimit == 0) {
2091 // Loop through the input text, searching for the delimiter pattern
2094 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2096 if (i>=destCapacity-1) {
2097 // There is one or zero output string left.
2098 // Fill the last output string with whatever is left from the input, then exit the loop.
2099 // ( i will be == destCapacity if we filled the output array while processing
2100 // capture groups of the delimiter expression, in which case we will discard the
2101 // last capture group saved in favor of the unprocessed remainder of the
2104 if (fActiveLimit > nextOutputStringStart) {
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108 input->chunkContents+nextOutputStringStart,
2109 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2111 UText remainingText = UTEXT_INITIALIZER;
2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113 fActiveLimit-nextOutputStringStart, &status);
2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115 utext_close(&remainingText);
2118 UErrorCode lengthStatus = U_ZERO_ERROR;
2119 int32_t remaining16Length =
2120 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2121 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2122 if (remainingChars == NULL) {
2123 status = U_MEMORY_ALLOCATION_ERROR;
2127 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2129 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2131 UText remainingText = UTEXT_INITIALIZER;
2132 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2133 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2134 utext_close(&remainingText);
2137 uprv_free(remainingChars);
2143 // We found another delimiter. Move everything from where we started looking
2144 // up until the start of the delimiter into the next output string.
2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2147 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2148 input->chunkContents+nextOutputStringStart,
2149 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2151 UText remainingText = UTEXT_INITIALIZER;
2152 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2153 fMatchStart-nextOutputStringStart, &status);
2154 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2155 utext_close(&remainingText);
2158 UErrorCode lengthStatus = U_ZERO_ERROR;
2159 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2160 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2161 if (remainingChars == NULL) {
2162 status = U_MEMORY_ALLOCATION_ERROR;
2165 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2167 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2169 UText remainingText = UTEXT_INITIALIZER;
2170 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2171 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2172 utext_close(&remainingText);
2175 uprv_free(remainingChars);
2177 nextOutputStringStart = fMatchEnd;
2179 // If the delimiter pattern has capturing parentheses, the captured
2180 // text goes out into the next n destination strings.
2182 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2183 if (i >= destCapacity-2) {
2184 // Never fill the last available output string with capture group text.
2185 // It will filled with the last field, the remainder of the
2186 // unsplit input text.
2190 dest[i] = utext_extract_replace(fInputText, dest[i],
2191 start64(groupNum, status), end64(groupNum, status), &status);
2194 if (nextOutputStringStart == fActiveLimit) {
2195 // The delimiter was at the end of the string. We're done, but first
2196 // we output one last empty string, for the empty field following
2197 // the delimiter at the end of input.
2198 if (i+1 < destCapacity) {
2200 if (dest[i] == NULL) {
2201 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2203 static UChar emptyString[] = {(UChar)0};
2204 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2213 // We ran off the end of the input while looking for the next delimiter.
2214 // All the remaining text goes into the current output string.
2215 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2217 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2218 input->chunkContents+nextOutputStringStart,
2219 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2221 UText remainingText = UTEXT_INITIALIZER;
2222 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2223 fActiveLimit-nextOutputStringStart, &status);
2224 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2225 utext_close(&remainingText);
2228 UErrorCode lengthStatus = U_ZERO_ERROR;
2229 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2230 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2231 if (remainingChars == NULL) {
2232 status = U_MEMORY_ALLOCATION_ERROR;
2236 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2238 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2240 UText remainingText = UTEXT_INITIALIZER;
2241 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2242 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2243 utext_close(&remainingText);
2246 uprv_free(remainingChars);
2250 if (U_FAILURE(status)) {
2253 } // end of for loop
2258 //--------------------------------------------------------------------------------
2262 //--------------------------------------------------------------------------------
2263 int32_t RegexMatcher::start(UErrorCode &status) const {
2264 return start(0, status);
2267 int64_t RegexMatcher::start64(UErrorCode &status) const {
2268 return start64(0, status);
2271 //--------------------------------------------------------------------------------
2273 // start(int32_t group, UErrorCode &status)
2275 //--------------------------------------------------------------------------------
2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2278 if (U_FAILURE(status)) {
2281 if (U_FAILURE(fDeferredStatus)) {
2282 status = fDeferredStatus;
2285 if (fMatch == FALSE) {
2286 status = U_REGEX_INVALID_STATE;
2289 if (group < 0 || group > fPattern->fGroupMap->size()) {
2290 status = U_INDEX_OUTOFBOUNDS_ERROR;
2297 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2298 U_ASSERT(groupOffset < fPattern->fFrameSize);
2299 U_ASSERT(groupOffset >= 0);
2300 s = fFrame->fExtra[groupOffset];
2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2308 return (int32_t)start64(group, status);
2311 //--------------------------------------------------------------------------------
2313 // useAnchoringBounds
2315 //--------------------------------------------------------------------------------
2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2317 fAnchoringBounds = b;
2318 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2319 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2324 //--------------------------------------------------------------------------------
2326 // useTransparentBounds
2328 //--------------------------------------------------------------------------------
2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2330 fTransparentBounds = b;
2331 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2332 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2336 //--------------------------------------------------------------------------------
2340 //--------------------------------------------------------------------------------
2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2342 if (U_FAILURE(status)) {
2345 if (U_FAILURE(fDeferredStatus)) {
2346 status = fDeferredStatus;
2350 status = U_ILLEGAL_ARGUMENT_ERROR;
2357 //--------------------------------------------------------------------------------
2361 //--------------------------------------------------------------------------------
2362 int32_t RegexMatcher::getTimeLimit() const {
2367 //--------------------------------------------------------------------------------
2371 //--------------------------------------------------------------------------------
2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2373 if (U_FAILURE(status)) {
2376 if (U_FAILURE(fDeferredStatus)) {
2377 status = fDeferredStatus;
2381 status = U_ILLEGAL_ARGUMENT_ERROR;
2385 // Reset the matcher. This is needed here in case there is a current match
2386 // whose final stack frame (containing the match results, pointed to by fFrame)
2387 // would be lost by resizing to a smaller stack size.
2391 // Unlimited stack expansion
2392 fStack->setMaxCapacity(0);
2394 // Change the units of the limit from bytes to ints, and bump the size up
2395 // to be big enough to hold at least one stack frame for the pattern,
2396 // if it isn't there already.
2397 int32_t adjustedLimit = limit / sizeof(int32_t);
2398 if (adjustedLimit < fPattern->fFrameSize) {
2399 adjustedLimit = fPattern->fFrameSize;
2401 fStack->setMaxCapacity(adjustedLimit);
2403 fStackLimit = limit;
2407 //--------------------------------------------------------------------------------
2411 //--------------------------------------------------------------------------------
2412 int32_t RegexMatcher::getStackLimit() const {
2417 //--------------------------------------------------------------------------------
2421 //--------------------------------------------------------------------------------
2422 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2423 const void *context,
2424 UErrorCode &status) {
2425 if (U_FAILURE(status)) {
2428 fCallbackFn = callback;
2429 fCallbackContext = context;
2433 //--------------------------------------------------------------------------------
2437 //--------------------------------------------------------------------------------
2438 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2439 const void *&context,
2440 UErrorCode &status) {
2441 if (U_FAILURE(status)) {
2444 callback = fCallbackFn;
2445 context = fCallbackContext;
2449 //--------------------------------------------------------------------------------
2453 //--------------------------------------------------------------------------------
2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2455 const void *context,
2456 UErrorCode &status) {
2457 if (U_FAILURE(status)) {
2460 fFindProgressCallbackFn = callback;
2461 fFindProgressCallbackContext = context;
2465 //--------------------------------------------------------------------------------
2469 //--------------------------------------------------------------------------------
2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2471 const void *&context,
2472 UErrorCode &status) {
2473 if (U_FAILURE(status)) {
2476 callback = fFindProgressCallbackFn;
2477 context = fFindProgressCallbackContext;
2481 //================================================================================
2483 // Code following this point in this file is the internal
2484 // Match Engine Implementation.
2486 //================================================================================
2489 //--------------------------------------------------------------------------------
2492 // Discard any previous contents of the state save stack, and initialize a
2493 // new stack frame to all -1. The -1s are needed for capture group limits,
2494 // where they indicate that a group has not yet matched anything.
2495 //--------------------------------------------------------------------------------
2496 REStackFrame *RegexMatcher::resetStack() {
2497 // Discard any previous contents of the state save stack, and initialize a
2498 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2499 // where they indicate that a group has not yet matched anything.
2500 fStack->removeAllElements();
2502 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2503 if(U_FAILURE(fDeferredStatus)) {
2508 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2509 iFrame->fExtra[i] = -1;
2516 //--------------------------------------------------------------------------------
2519 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2521 // If the current char is a combining mark,
2523 // Else Scan backwards to the first non-combining char.
2524 // We are at a boundary if the this char and the original chars are
2525 // opposite in membership in \w set
2527 // parameters: pos - the current position in the input buffer
2529 // TODO: double-check edge cases at region boundaries.
2531 //--------------------------------------------------------------------------------
2532 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2533 UBool isBoundary = FALSE;
2534 UBool cIsWord = FALSE;
2536 if (pos >= fLookLimit) {
2539 // Determine whether char c at current position is a member of the word set of chars.
2540 // If we're off the end of the string, behave as though we're not at a word char.
2541 UTEXT_SETNATIVEINDEX(fInputText, pos);
2542 UChar32 c = UTEXT_CURRENT32(fInputText);
2543 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2544 // Current char is a combining one. Not a boundary.
2547 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2550 // Back up until we come to a non-combining char, determine whether
2551 // that char is a word char.
2552 UBool prevCIsWord = FALSE;
2554 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2557 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2558 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2559 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2560 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2564 isBoundary = cIsWord ^ prevCIsWord;
2568 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2569 UBool isBoundary = FALSE;
2570 UBool cIsWord = FALSE;
2572 const UChar *inputBuf = fInputText->chunkContents;
2574 if (pos >= fLookLimit) {
2577 // Determine whether char c at current position is a member of the word set of chars.
2578 // If we're off the end of the string, behave as though we're not at a word char.
2580 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2581 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2582 // Current char is a combining one. Not a boundary.
2585 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2588 // Back up until we come to a non-combining char, determine whether
2589 // that char is a word char.
2590 UBool prevCIsWord = FALSE;
2592 if (pos <= fLookStart) {
2596 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2597 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2598 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2599 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2603 isBoundary = cIsWord ^ prevCIsWord;
2607 //--------------------------------------------------------------------------------
2611 // Test for a word boundary using RBBI word break.
2613 // parameters: pos - the current position in the input buffer
2615 //--------------------------------------------------------------------------------
2616 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2617 UBool returnVal = FALSE;
2618 #if UCONFIG_NO_BREAK_ITERATION==0
2620 // If we haven't yet created a break iterator for this matcher, do it now.
2621 if (fWordBreakItr == NULL) {
2623 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2624 if (U_FAILURE(fDeferredStatus)) {
2627 fWordBreakItr->setText(fInputText, fDeferredStatus);
2630 if (pos >= fLookLimit) {
2632 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2633 // words are not boundaries. All non-word chars stand by themselves,
2634 // with word boundaries on both sides.
2636 if (!UTEXT_USES_U16(fInputText)) {
2637 // !!!: Would like a better way to do this!
2638 UErrorCode status = U_ZERO_ERROR;
2639 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2641 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2647 //--------------------------------------------------------------------------------
2649 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2650 // saves. Increment the "time" counter, and call the
2651 // user callback function if there is one installed.
2653 // If the match operation needs to be aborted, either for a time-out
2654 // or because the user callback asked for it, just set an error status.
2655 // The engine will pick that up and stop in its outer loop.
2657 //--------------------------------------------------------------------------------
2658 void RegexMatcher::IncrementTime(UErrorCode &status) {
2659 fTickCounter = TIMER_INITIAL_VALUE;
2661 if (fCallbackFn != NULL) {
2662 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2663 status = U_REGEX_STOPPED_BY_CALLER;
2667 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2668 status = U_REGEX_TIME_OUT;
2672 //--------------------------------------------------------------------------------
2675 // Make a new stack frame, initialized as a copy of the current stack frame.
2676 // Set the pattern index in the original stack frame from the operand value
2677 // in the opcode. Execution of the engine continues with the state in
2678 // the newly created stack frame
2680 // Note that reserveBlock() may grow the stack, resulting in the
2681 // whole thing being relocated in memory.
2684 // fp The top frame pointer when called. At return, a new
2685 // fame will be present
2686 // savePatIdx An index into the compiled pattern. Goes into the original
2687 // (not new) frame. If execution ever back-tracks out of the
2688 // new frame, this will be where we continue from in the pattern.
2690 // The new frame pointer.
2692 //--------------------------------------------------------------------------------
2693 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2694 if (U_FAILURE(status)) {
2697 // push storage for a new frame.
2698 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2699 if (U_FAILURE(status)) {
2700 // Failure on attempted stack expansion.
2701 // Stack function set some other error code, change it to a more
2702 // specific one for regular expressions.
2703 status = U_REGEX_STACK_OVERFLOW;
2704 // We need to return a writable stack frame, so just return the
2705 // previous frame. The match operation will stop quickly
2706 // because of the error status, after which the frame will never
2707 // be looked at again.
2710 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2712 // New stack frame = copy of old top frame.
2713 int64_t *source = (int64_t *)fp;
2714 int64_t *dest = newFP;
2716 *dest++ = *source++;
2717 if (source == newFP) {
2723 if (fTickCounter <= 0) {
2724 IncrementTime(status); // Re-initializes fTickCounter
2726 fp->fPatIdx = savePatIdx;
2727 return (REStackFrame *)newFP;
2730 #if defined(REGEX_DEBUG)
2732 UnicodeString StringFromUText(UText *ut) {
2733 UnicodeString result;
2734 for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2740 #endif // REGEX_DEBUG
2743 //--------------------------------------------------------------------------------
2745 // MatchAt This is the actual matching engine.
2747 // startIdx: begin matching a this index.
2748 // toEnd: if true, match must extend to end of the input region
2750 //--------------------------------------------------------------------------------
2751 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752 UBool isMatch = FALSE; // True if the we have a match.
2754 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2756 int32_t op; // Operation from the compiled pattern, split into
2757 int32_t opType; // the opcode
2758 int32_t opValue; // and the operand value.
2760 #ifdef REGEX_RUN_DEBUG
2762 printf("MatchAt(startIdx=%ld)\n", startIdx);
2763 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2764 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2768 if (U_FAILURE(status)) {
2772 // Cache frequently referenced items from the compiled pattern
2774 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2776 const UChar *litText = fPattern->fLiteralText.getBuffer();
2777 UVector *sets = fPattern->fSets;
2779 fFrameSize = fPattern->fFrameSize;
2780 REStackFrame *fp = resetStack();
2781 if (U_FAILURE(fDeferredStatus)) {
2782 status = fDeferredStatus;
2787 fp->fInputIdx = startIdx;
2789 // Zero out the pattern's static data
2791 for (i = 0; i<fPattern->fDataSize; i++) {
2796 // Main loop for interpreting the compiled pattern.
2797 // One iteration of the loop per pattern operation performed.
2800 op = (int32_t)pat[fp->fPatIdx];
2801 opType = URX_TYPE(op);
2802 opValue = URX_VAL(op);
2803 #ifdef REGEX_RUN_DEBUG
2805 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2806 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2807 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2808 fPattern->dumpOp(fp->fPatIdx);
2821 // Force a backtrack. In some circumstances, the pattern compiler
2822 // will notice that the pattern can't possibly match anything, and will
2823 // emit one of these at that point.
2824 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2829 if (fp->fInputIdx < fActiveLimit) {
2830 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2831 UChar32 c = UTEXT_NEXT32(fInputText);
2833 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2839 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2845 // Test input against a literal string.
2846 // Strings require two slots in the compiled pattern, one for the
2847 // offset to the string text, and one for the length.
2849 int32_t stringStartIdx = opValue;
2850 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2852 opType = URX_TYPE(op);
2853 int32_t stringLen = URX_VAL(op);
2854 U_ASSERT(opType == URX_STRING_LEN);
2855 U_ASSERT(stringLen >= 2);
2857 const UChar *patternString = litText+stringStartIdx;
2858 int32_t patternStringIndex = 0;
2859 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2861 UChar32 patternChar;
2862 UBool success = TRUE;
2863 while (patternStringIndex < stringLen) {
2864 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2869 inputChar = UTEXT_NEXT32(fInputText);
2870 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2871 if (patternChar != inputChar) {
2878 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2880 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2886 case URX_STATE_SAVE:
2887 fp = StateSave(fp, opValue, status);
2892 // The match loop will exit via this path on a successful match,
2893 // when we reach the end of the pattern.
2894 if (toEnd && fp->fInputIdx != fActiveLimit) {
2895 // The pattern matched, but not to the end of input. Try some more.
2896 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2902 // Start and End Capture stack frame variables are laid out out like this:
2903 // fp->fExtra[opValue] - The start of a completed capture group
2904 // opValue+1 - The end of a completed capture group
2905 // opValue+2 - the start of a capture group whose end
2906 // has not yet been reached (and might not ever be).
2907 case URX_START_CAPTURE:
2908 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2909 fp->fExtra[opValue+2] = fp->fInputIdx;
2913 case URX_END_CAPTURE:
2914 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2916 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2917 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2918 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2922 case URX_DOLLAR: // $, test for End of line
2923 // or for position before new line at end of input
2925 if (fp->fInputIdx >= fAnchorLimit) {
2926 // We really are at the end of input. Success.
2932 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2934 // If we are positioned just before a new-line that is located at the
2935 // end of input, succeed.
2936 UChar32 c = UTEXT_NEXT32(fInputText);
2937 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2938 if (isLineTerminator(c)) {
2939 // If not in the middle of a CR/LF sequence
2940 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2941 // At new-line at end of input. Success
2949 UChar32 nextC = UTEXT_NEXT32(fInputText);
2950 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2953 break; // At CR/LF at end of input. Success
2957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2962 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2963 if (fp->fInputIdx >= fAnchorLimit) {
2964 // Off the end of input. Success.
2969 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2970 UChar32 c = UTEXT_NEXT32(fInputText);
2971 // Either at the last character of input, or off the end.
2972 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2979 // Not at end of input. Back-track out.
2980 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2984 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2986 if (fp->fInputIdx >= fAnchorLimit) {
2987 // We really are at the end of input. Success.
2992 // If we are positioned just before a new-line, succeed.
2993 // It makes no difference where the new-line is within the input.
2994 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2995 UChar32 c = UTEXT_CURRENT32(fInputText);
2996 if (isLineTerminator(c)) {
2997 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2998 // In multi-line mode, hitting a new-line just before the end of input does not
2999 // set the hitEnd or requireEnd flags
3000 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3004 // not at a new line. Fail.
3005 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3010 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3012 if (fp->fInputIdx >= fAnchorLimit) {
3013 // We really are at the end of input. Success.
3015 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3016 break; // adding a new-line would not lose the match.
3018 // If we are not positioned just before a new-line, the test fails; backtrack out.
3019 // It makes no difference where the new-line is within the input.
3020 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3021 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3022 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3028 case URX_CARET: // ^, test for start of line
3029 if (fp->fInputIdx != fAnchorStart) {
3030 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3035 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3037 if (fp->fInputIdx == fAnchorStart) {
3038 // We are at the start input. Success.
3041 // Check whether character just before the current pos is a new-line
3042 // unless we are at the end of input
3043 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3044 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3045 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3046 // It's a new-line. ^ is true. Success.
3047 // TODO: what should be done with positions between a CR and LF?
3050 // Not at the start of a line. Fail.
3051 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3056 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3058 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3059 if (fp->fInputIdx <= fAnchorStart) {
3060 // We are at the start input. Success.
3063 // Check whether character just before the current pos is a new-line
3064 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3065 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3068 // Not at the start of a line. Back-track out.
3069 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3074 case URX_BACKSLASH_B: // Test for word boundaries
3076 UBool success = isWordBoundary(fp->fInputIdx);
3077 success ^= (UBool)(opValue != 0); // flip sense for \B
3079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3085 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3087 UBool success = isUWordBoundary(fp->fInputIdx);
3088 success ^= (UBool)(opValue != 0); // flip sense for \B
3090 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3096 case URX_BACKSLASH_D: // Test for decimal digit
3098 if (fp->fInputIdx >= fActiveLimit) {
3100 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3104 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3106 UChar32 c = UTEXT_NEXT32(fInputText);
3107 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3108 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3109 success ^= (UBool)(opValue != 0); // flip sense for \D
3111 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3113 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3119 case URX_BACKSLASH_G: // Test for position at end of previous match
3120 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3121 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3128 if (fp->fInputIdx >= fActiveLimit) {
3130 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3133 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3134 UChar32 c = UTEXT_NEXT32(fInputText);
3135 int8_t ctype = u_charType(c);
3136 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3137 success ^= (UBool)(opValue != 0); // flip sense for \H
3139 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3141 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3149 if (fp->fInputIdx >= fActiveLimit) {
3151 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3154 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3155 UChar32 c = UTEXT_NEXT32(fInputText);
3156 if (isLineTerminator(c)) {
3157 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3158 utext_next32(fInputText);
3160 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3162 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3168 case URX_BACKSLASH_V: // \v, any single line ending character.
3170 if (fp->fInputIdx >= fActiveLimit) {
3172 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3175 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3176 UChar32 c = UTEXT_NEXT32(fInputText);
3177 UBool success = isLineTerminator(c);
3178 success ^= (UBool)(opValue != 0); // flip sense for \V
3180 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3182 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3188 case URX_BACKSLASH_X:
3189 // Match a Grapheme, as defined by Unicode TR 29.
3190 // Differs slightly from Perl, which consumes combining marks independently
3194 // Fail if at end of input
3195 if (fp->fInputIdx >= fActiveLimit) {
3197 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3203 // Examine (and consume) the current char.
3204 // Dispatch into a little state machine, based on the char.
3206 c = UTEXT_NEXT32(fInputText);
3207 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208 UnicodeSet **sets = fPattern->fStaticSets;
3209 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3210 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3211 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3212 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3213 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3214 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3215 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3221 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3222 c = UTEXT_NEXT32(fInputText);
3223 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3225 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3226 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3227 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3228 (void)UTEXT_PREVIOUS32(fInputText);
3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3234 c = UTEXT_NEXT32(fInputText);
3235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3237 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3238 (void)UTEXT_PREVIOUS32(fInputText);
3239 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3243 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3244 c = UTEXT_NEXT32(fInputText);
3245 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3247 (void)UTEXT_PREVIOUS32(fInputText);
3248 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3252 // Combining characters are consumed here
3254 if (fp->fInputIdx >= fActiveLimit) {
3257 c = UTEXT_CURRENT32(fInputText);
3258 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3261 (void)UTEXT_NEXT32(fInputText);
3262 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3267 // Most control chars stand alone (don't combine with combining chars),
3268 // except for that CR/LF sequence is a single grapheme cluster.
3269 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3270 c = UTEXT_NEXT32(fInputText);
3271 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3275 if (fp->fInputIdx >= fActiveLimit) {
3284 case URX_BACKSLASH_Z: // Test for end of Input
3285 if (fp->fInputIdx < fAnchorLimit) {
3286 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3295 case URX_STATIC_SETREF:
3297 // Test input character against one of the predefined sets
3298 // (Word Characters, for example)
3299 // The high bit of the op value is a flag for the match polarity.
3300 // 0: success if input char is in set.
3301 // 1: success if input char is not in set.
3302 if (fp->fInputIdx >= fActiveLimit) {
3304 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3308 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3309 opValue &= ~URX_NEG_SET;
3310 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3312 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3313 UChar32 c = UTEXT_NEXT32(fInputText);
3315 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3316 if (s8->contains(c)) {
3320 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321 if (s->contains(c)) {
3326 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3328 // the character wasn't in the set.
3329 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3335 case URX_STAT_SETREF_N:
3337 // Test input character for NOT being a member of one of
3338 // the predefined sets (Word Characters, for example)
3339 if (fp->fInputIdx >= fActiveLimit) {
3341 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3345 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3347 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3349 UChar32 c = UTEXT_NEXT32(fInputText);
3351 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3352 if (s8->contains(c) == FALSE) {
3353 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3357 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3358 if (s->contains(c) == FALSE) {
3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3363 // the character wasn't in the set.
3364 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3370 if (fp->fInputIdx >= fActiveLimit) {
3372 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3375 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3377 // There is input left. Pick up one char and test it for set membership.
3378 UChar32 c = UTEXT_NEXT32(fInputText);
3379 U_ASSERT(opValue > 0 && opValue < sets->size());
3381 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3382 if (s8->contains(c)) {
3383 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3387 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3388 if (s->contains(c)) {
3389 // The character is in the set. A Match.
3390 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3395 // the character wasn't in the set.
3396 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3403 // . matches anything, but stops at end-of-line.
3404 if (fp->fInputIdx >= fActiveLimit) {
3405 // At end of input. Match failed. Backtrack out.
3407 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3411 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3413 // There is input left. Advance over one char, unless we've hit end-of-line
3414 UChar32 c = UTEXT_NEXT32(fInputText);
3415 if (isLineTerminator(c)) {
3416 // End of line in normal mode. . does not match.
3417 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3420 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3425 case URX_DOTANY_ALL:
3427 // ., in dot-matches-all (including new lines) mode
3428 if (fp->fInputIdx >= fActiveLimit) {
3429 // At end of input. Match failed. Backtrack out.
3431 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3435 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3437 // There is input left. Advance over one char, except if we are
3438 // at a cr/lf, advance over both of them.
3440 c = UTEXT_NEXT32(fInputText);
3441 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3443 // In the case of a CR/LF, we need to advance over both.
3444 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3445 if (nextc == 0x0a) {
3446 (void)UTEXT_NEXT32(fInputText);
3447 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3454 case URX_DOTANY_UNIX:
3456 // '.' operator, matches all, but stops at end-of-line.
3457 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3458 if (fp->fInputIdx >= fActiveLimit) {
3459 // At end of input. Match failed. Backtrack out.
3461 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3465 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3467 // There is input left. Advance over one char, unless we've hit end-of-line
3468 UChar32 c = UTEXT_NEXT32(fInputText);
3470 // End of line in normal mode. '.' does not match the \n
3471 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3473 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3480 fp->fPatIdx = opValue;
3488 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3489 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3490 fp->fPatIdx = opValue; // Then JMP.
3494 // This opcode is used with (x)+, when x can match a zero length string.
3495 // Same as JMP_SAV, except conditional on the match having made forward progress.
3496 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3497 // data address of the input position at the start of the loop.
3499 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3500 int32_t stoOp = (int32_t)pat[opValue-1];
3501 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3502 int32_t frameLoc = URX_VAL(stoOp);
3503 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3504 int64_t prevInputIdx = fp->fExtra[frameLoc];
3505 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3506 if (prevInputIdx < fp->fInputIdx) {
3507 // The match did make progress. Repeat the loop.
3508 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3509 fp->fPatIdx = opValue;
3510 fp->fExtra[frameLoc] = fp->fInputIdx;
3512 // If the input position did not advance, we do nothing here,
3513 // execution will fall out of the loop.
3519 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3520 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3522 // Pick up the three extra operands that CTR_INIT has, and
3523 // skip the pattern location counter past
3524 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3526 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3527 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3528 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3529 U_ASSERT(minCount>=0);
3530 U_ASSERT(maxCount>=minCount || maxCount==-1);
3531 U_ASSERT(loopLoc>=fp->fPatIdx);
3533 if (minCount == 0) {
3534 fp = StateSave(fp, loopLoc+1, status);
3536 if (maxCount == -1) {
3537 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3538 } else if (maxCount == 0) {
3539 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3546 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3547 int32_t initOp = (int32_t)pat[opValue];
3548 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3549 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3550 int32_t minCount = (int32_t)pat[opValue+2];
3551 int32_t maxCount = (int32_t)pat[opValue+3];
3553 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3554 U_ASSERT(*pCounter == maxCount);
3557 if (*pCounter >= minCount) {
3558 if (maxCount == -1) {
3559 // Loop has no hard upper bound.
3560 // Check that it is progressing through the input, break if it is not.
3561 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3562 if (fp->fInputIdx == *pLastInputIdx) {
3565 *pLastInputIdx = fp->fInputIdx;
3568 fp = StateSave(fp, fp->fPatIdx, status);
3570 fp->fPatIdx = opValue + 4; // Loop back.
3574 case URX_CTR_INIT_NG:
3576 // Initialize a non-greedy loop
3577 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3578 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3580 // Pick up the three extra operands that CTR_INIT_NG has, and
3581 // skip the pattern location counter past
3582 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3584 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3585 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3586 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3587 U_ASSERT(minCount>=0);
3588 U_ASSERT(maxCount>=minCount || maxCount==-1);
3589 U_ASSERT(loopLoc>fp->fPatIdx);
3590 if (maxCount == -1) {
3591 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3594 if (minCount == 0) {
3595 if (maxCount != 0) {
3596 fp = StateSave(fp, fp->fPatIdx, status);
3598 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3603 case URX_CTR_LOOP_NG:
3605 // Non-greedy {min, max} loops
3606 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3607 int32_t initOp = (int32_t)pat[opValue];
3608 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3609 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3610 int32_t minCount = (int32_t)pat[opValue+2];
3611 int32_t maxCount = (int32_t)pat[opValue+3];
3614 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3615 // The loop has matched the maximum permitted number of times.
3616 // Break out of here with no action. Matching will
3617 // continue with the following pattern.
3618 U_ASSERT(*pCounter == maxCount);
3622 if (*pCounter < minCount) {
3623 // We haven't met the minimum number of matches yet.
3624 // Loop back for another one.
3625 fp->fPatIdx = opValue + 4; // Loop back.
3627 // We do have the minimum number of matches.
3629 // If there is no upper bound on the loop iterations, check that the input index
3630 // is progressing, and stop the loop if it is not.
3631 if (maxCount == -1) {
3632 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3633 if (fp->fInputIdx == *pLastInputIdx) {
3636 *pLastInputIdx = fp->fInputIdx;
3639 // Loop Continuation: we will fall into the pattern following the loop
3640 // (non-greedy, don't execute loop body first), but first do
3641 // a state save to the top of the loop, so that a match failure
3642 // in the following pattern will try another iteration of the loop.
3643 fp = StateSave(fp, opValue + 4, status);
3649 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3650 fData[opValue] = fStack->size();
3655 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3656 int32_t newStackSize = (int32_t)fData[opValue];
3657 U_ASSERT(newStackSize <= fStack->size());
3658 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3659 if (newFP == (int64_t *)fp) {
3663 for (i=0; i<fFrameSize; i++) {
3664 newFP[i] = ((int64_t *)fp)[i];
3666 fp = (REStackFrame *)newFP;
3667 fStack->setSize(newStackSize);
3673 U_ASSERT(opValue < fFrameSize);
3674 int64_t groupStartIdx = fp->fExtra[opValue];
3675 int64_t groupEndIdx = fp->fExtra[opValue+1];
3676 U_ASSERT(groupStartIdx <= groupEndIdx);
3677 if (groupStartIdx < 0) {
3678 // This capture group has not participated in the match thus far,
3679 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3682 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3683 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3685 // Note: if the capture group match was of an empty string the backref
3686 // match succeeds. Verified by testing: Perl matches succeed
3687 // in this case, so we do too.
3689 UBool success = TRUE;
3691 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3695 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3700 UChar32 captureGroupChar = utext_next32(fAltInputText);
3701 UChar32 inputChar = utext_next32(fInputText);
3702 if (inputChar != captureGroupChar) {
3709 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3711 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3720 U_ASSERT(opValue < fFrameSize);
3721 int64_t groupStartIdx = fp->fExtra[opValue];
3722 int64_t groupEndIdx = fp->fExtra[opValue+1];
3723 U_ASSERT(groupStartIdx <= groupEndIdx);
3724 if (groupStartIdx < 0) {
3725 // This capture group has not participated in the match thus far,
3726 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3729 utext_setNativeIndex(fAltInputText, groupStartIdx);
3730 utext_setNativeIndex(fInputText, fp->fInputIdx);
3731 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3732 CaseFoldingUTextIterator inputItr(*fInputText);
3734 // Note: if the capture group match was of an empty string the backref
3735 // match succeeds. Verified by testing: Perl matches succeed
3736 // in this case, so we do too.
3738 UBool success = TRUE;
3740 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3744 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3749 UChar32 captureGroupChar = captureGroupItr.next();
3750 UChar32 inputChar = inputItr.next();
3751 if (inputChar != captureGroupChar) {
3757 if (success && inputItr.inExpansion()) {
3758 // We otained a match by consuming part of a string obtained from
3759 // case-folding a single code point of the input text.
3760 // This does not count as an overall match.
3765 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3767 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3773 case URX_STO_INP_LOC:
3775 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3776 fp->fExtra[opValue] = fp->fInputIdx;
3782 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3784 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3785 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3786 int64_t savedInputIdx = fp->fExtra[dataLoc];
3787 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3788 if (savedInputIdx < fp->fInputIdx) {
3789 fp->fPatIdx = opValue; // JMP
3791 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3798 // Entering a lookahead block.
3799 // Save Stack Ptr, Input Pos.
3800 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3801 fData[opValue] = fStack->size();
3802 fData[opValue+1] = fp->fInputIdx;
3803 fActiveStart = fLookStart; // Set the match region change for
3804 fActiveLimit = fLookLimit; // transparent bounds.
3810 // Leaving a look-ahead block.
3811 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3812 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3813 int32_t stackSize = fStack->size();
3814 int32_t newStackSize =(int32_t)fData[opValue];
3815 U_ASSERT(stackSize >= newStackSize);
3816 if (stackSize > newStackSize) {
3817 // Copy the current top frame back to the new (cut back) top frame.
3818 // This makes the capture groups from within the look-ahead
3819 // expression available.
3820 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3822 for (i=0; i<fFrameSize; i++) {
3823 newFP[i] = ((int64_t *)fp)[i];
3825 fp = (REStackFrame *)newFP;
3826 fStack->setSize(newStackSize);
3828 fp->fInputIdx = fData[opValue+1];
3830 // Restore the active region bounds in the input string; they may have
3831 // been changed because of transparent bounds on a Region.
3832 fActiveStart = fRegionStart;
3833 fActiveLimit = fRegionLimit;
3838 // Case insensitive one char. The char from the pattern is already case folded.
3839 // Input text is not, but case folding the input can not reduce two or more code
3841 if (fp->fInputIdx < fActiveLimit) {
3842 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3844 UChar32 c = UTEXT_NEXT32(fInputText);
3845 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3846 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3858 // Case-insensitive test input against a literal string.
3859 // Strings require two slots in the compiled pattern, one for the
3860 // offset to the string text, and one for the length.
3861 // The compiled string has already been case folded.
3863 const UChar *patternString = litText + opValue;
3864 int32_t patternStringIdx = 0;
3866 op = (int32_t)pat[fp->fPatIdx];
3868 opType = URX_TYPE(op);
3869 opValue = URX_VAL(op);
3870 U_ASSERT(opType == URX_STRING_LEN);
3871 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3876 UBool success = TRUE;
3878 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3879 CaseFoldingUTextIterator inputIterator(*fInputText);
3880 while (patternStringIdx < patternStringLen) {
3881 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3886 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3887 cText = inputIterator.next();
3888 if (cText != cPattern) {
3893 if (inputIterator.inExpansion()) {
3898 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3908 // Entering a look-behind block.
3909 // Save Stack Ptr, Input Pos.
3910 // TODO: implement transparent bounds. Ticket #6067
3911 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3912 fData[opValue] = fStack->size();
3913 fData[opValue+1] = fp->fInputIdx;
3914 // Init the variable containing the start index for attempted matches.
3915 fData[opValue+2] = -1;
3916 // Save input string length, then reset to pin any matches to end at
3917 // the current position.
3918 fData[opValue+3] = fActiveLimit;
3919 fActiveLimit = fp->fInputIdx;
3926 // Positive Look-Behind, at top of loop checking for matches of LB expression
3927 // at all possible input starting positions.
3929 // Fetch the min and max possible match lengths. They are the operands
3930 // of this op in the pattern.
3931 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3932 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3933 if (!UTEXT_USES_U16(fInputText)) {
3934 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3935 // The max length need not be exact; it just needs to be >= actual maximum.
3938 U_ASSERT(minML <= maxML);
3939 U_ASSERT(minML >= 0);
3941 // Fetch (from data) the last input index where a match was attempted.
3942 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3943 int64_t &lbStartIdx = fData[opValue+2];
3944 if (lbStartIdx < 0) {
3945 // First time through loop.
3946 lbStartIdx = fp->fInputIdx - minML;
3947 if (lbStartIdx > 0) {
3948 // move index to a code point boudary, if it's not on one already.
3949 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3950 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3953 // 2nd through nth time through the loop.
3954 // Back up start position for match by one.
3955 if (lbStartIdx == 0) {
3958 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3959 (void)UTEXT_PREVIOUS32(fInputText);
3960 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3964 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3965 // We have tried all potential match starting points without
3966 // getting a match. Backtrack out, and out of the
3967 // Look Behind altogether.
3968 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3969 int64_t restoreInputLen = fData[opValue+3];
3970 U_ASSERT(restoreInputLen >= fActiveLimit);
3971 U_ASSERT(restoreInputLen <= fInputLength);
3972 fActiveLimit = restoreInputLen;
3976 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3977 // (successful match will fall off the end of the loop.)
3978 fp = StateSave(fp, fp->fPatIdx-3, status);
3979 fp->fInputIdx = lbStartIdx;
3984 // End of a look-behind block, after a successful match.
3986 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3987 if (fp->fInputIdx != fActiveLimit) {
3988 // The look-behind expression matched, but the match did not
3989 // extend all the way to the point that we are looking behind from.
3990 // FAIL out of here, which will take us back to the LB_CONT, which
3991 // will retry the match starting at another position or fail
3992 // the look-behind altogether, whichever is appropriate.
3993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3997 // Look-behind match is good. Restore the orignal input string length,
3998 // which had been truncated to pin the end of the lookbehind match to the
3999 // position being looked-behind.
4000 int64_t originalInputLen = fData[opValue+3];
4001 U_ASSERT(originalInputLen >= fActiveLimit);
4002 U_ASSERT(originalInputLen <= fInputLength);
4003 fActiveLimit = originalInputLen;
4010 // Negative Look-Behind, at top of loop checking for matches of LB expression
4011 // at all possible input starting positions.
4013 // Fetch the extra parameters of this op.
4014 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4015 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4016 if (!UTEXT_USES_U16(fInputText)) {
4017 // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4018 // The max length need not be exact; it just needs to be >= actual maximum.
4021 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4022 continueLoc = URX_VAL(continueLoc);
4023 U_ASSERT(minML <= maxML);
4024 U_ASSERT(minML >= 0);
4025 U_ASSERT(continueLoc > fp->fPatIdx);
4027 // Fetch (from data) the last input index where a match was attempted.
4028 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4029 int64_t &lbStartIdx = fData[opValue+2];
4030 if (lbStartIdx < 0) {
4031 // First time through loop.
4032 lbStartIdx = fp->fInputIdx - minML;
4033 if (lbStartIdx > 0) {
4034 // move index to a code point boudary, if it's not on one already.
4035 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4036 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4039 // 2nd through nth time through the loop.
4040 // Back up start position for match by one.
4041 if (lbStartIdx == 0) {
4044 UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4045 (void)UTEXT_PREVIOUS32(fInputText);
4046 lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4050 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4051 // We have tried all potential match starting points without
4052 // getting a match, which means that the negative lookbehind as
4053 // a whole has succeeded. Jump forward to the continue location
4054 int64_t restoreInputLen = fData[opValue+3];
4055 U_ASSERT(restoreInputLen >= fActiveLimit);
4056 U_ASSERT(restoreInputLen <= fInputLength);
4057 fActiveLimit = restoreInputLen;
4058 fp->fPatIdx = continueLoc;
4062 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4063 // (successful match will cause a FAIL out of the loop altogether.)
4064 fp = StateSave(fp, fp->fPatIdx-4, status);
4065 fp->fInputIdx = lbStartIdx;
4070 // End of a negative look-behind block, after a successful match.
4072 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4073 if (fp->fInputIdx != fActiveLimit) {
4074 // The look-behind expression matched, but the match did not
4075 // extend all the way to the point that we are looking behind from.
4076 // FAIL out of here, which will take us back to the LB_CONT, which
4077 // will retry the match starting at another position or succeed
4078 // the look-behind altogether, whichever is appropriate.
4079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4083 // Look-behind expression matched, which means look-behind test as
4086 // Restore the orignal input string length, which had been truncated
4087 // inorder to pin the end of the lookbehind match
4088 // to the position being looked-behind.
4089 int64_t originalInputLen = fData[opValue+3];
4090 U_ASSERT(originalInputLen >= fActiveLimit);
4091 U_ASSERT(originalInputLen <= fInputLength);
4092 fActiveLimit = originalInputLen;
4094 // Restore original stack position, discarding any state saved
4095 // by the successful pattern match.
4096 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4097 int32_t newStackSize = (int32_t)fData[opValue];
4098 U_ASSERT(fStack->size() > newStackSize);
4099 fStack->setSize(newStackSize);
4101 // FAIL, which will take control back to someplace
4102 // prior to entering the look-behind test.
4103 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4109 // Loop Initialization for the optimized implementation of
4110 // [some character set]*
4111 // This op scans through all matching input.
4112 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4114 U_ASSERT(opValue > 0 && opValue < sets->size());
4115 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4116 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4118 // Loop through input, until either the input is exhausted or
4119 // we reach a character that is not a member of the set.
4120 int64_t ix = fp->fInputIdx;
4121 UTEXT_SETNATIVEINDEX(fInputText, ix);
4123 if (ix >= fActiveLimit) {
4127 UChar32 c = UTEXT_NEXT32(fInputText);
4129 if (s8->contains(c) == FALSE) {
4133 if (s->contains(c) == FALSE) {
4137 ix = UTEXT_GETNATIVEINDEX(fInputText);
4140 // If there were no matching characters, skip over the loop altogether.
4141 // The loop doesn't run at all, a * op always succeeds.
4142 if (ix == fp->fInputIdx) {
4143 fp->fPatIdx++; // skip the URX_LOOP_C op.
4147 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4148 // must follow. It's operand is the stack location
4149 // that holds the starting input index for the match of this [set]*
4150 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4151 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4152 int32_t stackLoc = URX_VAL(loopcOp);
4153 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4154 fp->fExtra[stackLoc] = fp->fInputIdx;
4157 // Save State to the URX_LOOP_C op that follows this one,
4158 // so that match failures in the following code will return to there.
4159 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4160 fp = StateSave(fp, fp->fPatIdx, status);
4166 case URX_LOOP_DOT_I:
4167 // Loop Initialization for the optimized implementation of .*
4168 // This op scans through all remaining input.
4169 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4171 // Loop through input until the input is exhausted (we reach an end-of-line)
4172 // In DOTALL mode, we can just go straight to the end of the input.
4174 if ((opValue & 1) == 1) {
4175 // Dot-matches-All mode. Jump straight to the end of the string.
4179 // NOT DOT ALL mode. Line endings do not match '.'
4180 // Scan forward until a line ending or end of input.
4182 UTEXT_SETNATIVEINDEX(fInputText, ix);
4184 if (ix >= fActiveLimit) {
4188 UChar32 c = UTEXT_NEXT32(fInputText);
4189 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4190 if ((c == 0x0a) || // 0x0a is newline in both modes.
4191 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4192 isLineTerminator(c))) {
4193 // char is a line ending. Exit the scanning loop.
4197 ix = UTEXT_GETNATIVEINDEX(fInputText);
4201 // If there were no matching characters, skip over the loop altogether.
4202 // The loop doesn't run at all, a * op always succeeds.
4203 if (ix == fp->fInputIdx) {
4204 fp->fPatIdx++; // skip the URX_LOOP_C op.
4208 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4209 // must follow. It's operand is the stack location
4210 // that holds the starting input index for the match of this .*
4211 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4212 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4213 int32_t stackLoc = URX_VAL(loopcOp);
4214 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4215 fp->fExtra[stackLoc] = fp->fInputIdx;
4218 // Save State to the URX_LOOP_C op that follows this one,
4219 // so that match failures in the following code will return to there.
4220 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4221 fp = StateSave(fp, fp->fPatIdx, status);
4229 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4230 backSearchIndex = fp->fExtra[opValue];
4231 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4232 if (backSearchIndex == fp->fInputIdx) {
4233 // We've backed up the input idx to the point that the loop started.
4234 // The loop is done. Leave here without saving state.
4235 // Subsequent failures won't come back here.
4238 // Set up for the next iteration of the loop, with input index
4239 // backed up by one from the last time through,
4240 // and a state save to this instruction in case the following code fails again.
4241 // (We're going backwards because this loop emulates stack unwinding, not
4242 // the initial scan forward.)
4243 U_ASSERT(fp->fInputIdx > 0);
4244 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4245 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4246 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4248 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4249 if (prevC == 0x0a &&
4250 fp->fInputIdx > backSearchIndex &&
4252 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4253 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4254 // .*, stepping back over CRLF pair.
4255 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4260 fp = StateSave(fp, fp->fPatIdx-1, status);
4267 // Trouble. The compiled pattern contains an entry with an
4268 // unrecognized type tag.
4272 if (U_FAILURE(status)) {
4281 fLastMatchEnd = fMatchEnd;
4282 fMatchStart = startIdx;
4283 fMatchEnd = fp->fInputIdx;
4286 #ifdef REGEX_RUN_DEBUG
4289 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4291 printf("No match\n\n");
4296 fFrame = fp; // The active stack frame when the engine stopped.
4297 // Contains the capture group results that we need to
4303 //--------------------------------------------------------------------------------
4305 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4306 // assumption that the entire string is available in the UText's
4307 // chunk buffer. For now, that means we can use int32_t indexes,
4308 // except for anything that needs to be saved (like group starts
4311 // startIdx: begin matching a this index.
4312 // toEnd: if true, match must extend to end of the input region
4314 //--------------------------------------------------------------------------------
4315 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4316 UBool isMatch = FALSE; // True if the we have a match.
4318 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4320 int32_t op; // Operation from the compiled pattern, split into
4321 int32_t opType; // the opcode
4322 int32_t opValue; // and the operand value.
4324 #ifdef REGEX_RUN_DEBUG
4326 printf("MatchAt(startIdx=%d)\n", startIdx);
4327 printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4328 printf("Input String: \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4332 if (U_FAILURE(status)) {
4336 // Cache frequently referenced items from the compiled pattern
4338 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4340 const UChar *litText = fPattern->fLiteralText.getBuffer();
4341 UVector *sets = fPattern->fSets;
4343 const UChar *inputBuf = fInputText->chunkContents;
4345 fFrameSize = fPattern->fFrameSize;
4346 REStackFrame *fp = resetStack();
4347 if (U_FAILURE(fDeferredStatus)) {
4348 status = fDeferredStatus;
4353 fp->fInputIdx = startIdx;
4355 // Zero out the pattern's static data
4357 for (i = 0; i<fPattern->fDataSize; i++) {
4362 // Main loop for interpreting the compiled pattern.
4363 // One iteration of the loop per pattern operation performed.
4366 op = (int32_t)pat[fp->fPatIdx];
4367 opType = URX_TYPE(op);
4368 opValue = URX_VAL(op);
4369 #ifdef REGEX_RUN_DEBUG
4371 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4372 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4373 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4374 fPattern->dumpOp(fp->fPatIdx);
4387 // Force a backtrack. In some circumstances, the pattern compiler
4388 // will notice that the pattern can't possibly match anything, and will
4389 // emit one of these at that point.
4390 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4395 if (fp->fInputIdx < fActiveLimit) {
4397 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4404 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4410 // Test input against a literal string.
4411 // Strings require two slots in the compiled pattern, one for the
4412 // offset to the string text, and one for the length.
4413 int32_t stringStartIdx = opValue;
4416 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4418 opType = URX_TYPE(op);
4419 stringLen = URX_VAL(op);
4420 U_ASSERT(opType == URX_STRING_LEN);
4421 U_ASSERT(stringLen >= 2);
4423 const UChar * pInp = inputBuf + fp->fInputIdx;
4424 const UChar * pInpLimit = inputBuf + fActiveLimit;
4425 const UChar * pPat = litText+stringStartIdx;
4426 const UChar * pEnd = pInp + stringLen;
4427 UBool success = TRUE;
4428 while (pInp < pEnd) {
4429 if (pInp >= pInpLimit) {
4434 if (*pInp++ != *pPat++) {
4441 fp->fInputIdx += stringLen;
4443 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4449 case URX_STATE_SAVE:
4450 fp = StateSave(fp, opValue, status);
4455 // The match loop will exit via this path on a successful match,
4456 // when we reach the end of the pattern.
4457 if (toEnd && fp->fInputIdx != fActiveLimit) {
4458 // The pattern matched, but not to the end of input. Try some more.
4459 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4465 // Start and End Capture stack frame variables are laid out out like this:
4466 // fp->fExtra[opValue] - The start of a completed capture group
4467 // opValue+1 - The end of a completed capture group
4468 // opValue+2 - the start of a capture group whose end
4469 // has not yet been reached (and might not ever be).
4470 case URX_START_CAPTURE:
4471 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4472 fp->fExtra[opValue+2] = fp->fInputIdx;
4476 case URX_END_CAPTURE:
4477 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4478 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4479 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4480 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4481 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4485 case URX_DOLLAR: // $, test for End of line
4486 // or for position before new line at end of input
4487 if (fp->fInputIdx < fAnchorLimit-2) {
4488 // We are no where near the end of input. Fail.
4489 // This is the common case. Keep it first.
4490 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4493 if (fp->fInputIdx >= fAnchorLimit) {
4494 // We really are at the end of input. Success.
4500 // If we are positioned just before a new-line that is located at the
4501 // end of input, succeed.
4502 if (fp->fInputIdx == fAnchorLimit-1) {
4504 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4506 if (isLineTerminator(c)) {
4507 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4508 // At new-line at end of input. Success
4514 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4515 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4518 break; // At CR/LF at end of input. Success
4521 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4526 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4527 if (fp->fInputIdx >= fAnchorLimit-1) {
4528 // Either at the last character of input, or off the end.
4529 if (fp->fInputIdx == fAnchorLimit-1) {
4530 // At last char of input. Success if it's a new line.
4531 if (inputBuf[fp->fInputIdx] == 0x0a) {
4537 // Off the end of input. Success.
4544 // Not at end of input. Back-track out.
4545 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4549 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4551 if (fp->fInputIdx >= fAnchorLimit) {
4552 // We really are at the end of input. Success.
4557 // If we are positioned just before a new-line, succeed.
4558 // It makes no difference where the new-line is within the input.
4559 UChar32 c = inputBuf[fp->fInputIdx];
4560 if (isLineTerminator(c)) {
4561 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4562 // In multi-line mode, hitting a new-line just before the end of input does not
4563 // set the hitEnd or requireEnd flags
4564 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4568 // not at a new line. Fail.
4569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4574 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4576 if (fp->fInputIdx >= fAnchorLimit) {
4577 // We really are at the end of input. Success.
4579 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4580 break; // adding a new-line would not lose the match.
4582 // If we are not positioned just before a new-line, the test fails; backtrack out.
4583 // It makes no difference where the new-line is within the input.
4584 if (inputBuf[fp->fInputIdx] != 0x0a) {
4585 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4591 case URX_CARET: // ^, test for start of line
4592 if (fp->fInputIdx != fAnchorStart) {
4593 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4598 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4600 if (fp->fInputIdx == fAnchorStart) {
4601 // We are at the start input. Success.
4604 // Check whether character just before the current pos is a new-line
4605 // unless we are at the end of input
4606 UChar c = inputBuf[fp->fInputIdx - 1];
4607 if ((fp->fInputIdx < fAnchorLimit) &&
4608 isLineTerminator(c)) {
4609 // It's a new-line. ^ is true. Success.
4610 // TODO: what should be done with positions between a CR and LF?
4613 // Not at the start of a line. Fail.
4614 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4619 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4621 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4622 if (fp->fInputIdx <= fAnchorStart) {
4623 // We are at the start input. Success.
4626 // Check whether character just before the current pos is a new-line
4627 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4628 UChar c = inputBuf[fp->fInputIdx - 1];
4630 // Not at the start of a line. Back-track out.
4631 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4636 case URX_BACKSLASH_B: // Test for word boundaries
4638 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4639 success ^= (UBool)(opValue != 0); // flip sense for \B
4641 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4647 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4649 UBool success = isUWordBoundary(fp->fInputIdx);
4650 success ^= (UBool)(opValue != 0); // flip sense for \B
4652 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658 case URX_BACKSLASH_D: // Test for decimal digit
4660 if (fp->fInputIdx >= fActiveLimit) {
4662 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4667 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4668 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4669 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4670 success ^= (UBool)(opValue != 0); // flip sense for \D
4672 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4678 case URX_BACKSLASH_G: // Test for position at end of previous match
4679 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4680 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4685 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4687 if (fp->fInputIdx >= fActiveLimit) {
4689 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4693 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4694 int8_t ctype = u_charType(c);
4695 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4696 success ^= (UBool)(opValue != 0); // flip sense for \H
4698 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4704 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4706 if (fp->fInputIdx >= fActiveLimit) {
4708 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4712 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4713 if (isLineTerminator(c)) {
4714 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4715 // Check for CR/LF sequence. Consume both together when found.
4717 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4719 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4723 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4729 case URX_BACKSLASH_V: // Any single code point line ending.
4731 if (fp->fInputIdx >= fActiveLimit) {
4733 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4737 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4738 UBool success = isLineTerminator(c);
4739 success ^= (UBool)(opValue != 0); // flip sense for \V
4741 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4748 case URX_BACKSLASH_X:
4749 // Match a Grapheme, as defined by Unicode TR 29.
4750 // Differs slightly from Perl, which consumes combining marks independently
4754 // Fail if at end of input
4755 if (fp->fInputIdx >= fActiveLimit) {
4757 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4761 // Examine (and consume) the current char.
4762 // Dispatch into a little state machine, based on the char.
4764 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4765 UnicodeSet **sets = fPattern->fStaticSets;
4766 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4767 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4768 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4769 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4770 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4771 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4772 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4778 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4779 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4780 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4781 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4782 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4783 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4784 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4788 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4789 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4790 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4791 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4792 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4796 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4797 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4798 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4799 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4803 // Combining characters are consumed here
4805 if (fp->fInputIdx >= fActiveLimit) {
4808 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4809 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4810 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4817 // Most control chars stand alone (don't combine with combining chars),
4818 // except for that CR/LF sequence is a single grapheme cluster.
4819 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4824 if (fp->fInputIdx >= fActiveLimit) {
4833 case URX_BACKSLASH_Z: // Test for end of Input
4834 if (fp->fInputIdx < fAnchorLimit) {
4835 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4844 case URX_STATIC_SETREF:
4846 // Test input character against one of the predefined sets
4847 // (Word Characters, for example)
4848 // The high bit of the op value is a flag for the match polarity.
4849 // 0: success if input char is in set.
4850 // 1: success if input char is not in set.
4851 if (fp->fInputIdx >= fActiveLimit) {
4853 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4857 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4858 opValue &= ~URX_NEG_SET;
4859 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4862 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4864 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4865 if (s8->contains(c)) {
4869 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4870 if (s->contains(c)) {
4875 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4881 case URX_STAT_SETREF_N:
4883 // Test input character for NOT being a member of one of
4884 // the predefined sets (Word Characters, for example)
4885 if (fp->fInputIdx >= fActiveLimit) {
4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4891 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4894 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4896 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4897 if (s8->contains(c) == FALSE) {
4901 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4902 if (s->contains(c) == FALSE) {
4906 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4913 if (fp->fInputIdx >= fActiveLimit) {
4915 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4919 U_ASSERT(opValue > 0 && opValue < sets->size());
4921 // There is input left. Pick up one char and test it for set membership.
4923 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4925 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4926 if (s8->contains(c)) {
4927 // The character is in the set. A Match.
4931 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4932 if (s->contains(c)) {
4933 // The character is in the set. A Match.
4938 // the character wasn't in the set.
4939 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4946 // . matches anything, but stops at end-of-line.
4947 if (fp->fInputIdx >= fActiveLimit) {
4948 // At end of input. Match failed. Backtrack out.
4950 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4954 // There is input left. Advance over one char, unless we've hit end-of-line
4956 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4957 if (isLineTerminator(c)) {
4958 // End of line in normal mode. . does not match.
4959 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4966 case URX_DOTANY_ALL:
4968 // . in dot-matches-all (including new lines) mode
4969 if (fp->fInputIdx >= fActiveLimit) {
4970 // At end of input. Match failed. Backtrack out.
4972 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4976 // There is input left. Advance over one char, except if we are
4977 // at a cr/lf, advance over both of them.
4979 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4980 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4981 // In the case of a CR/LF, we need to advance over both.
4982 if (inputBuf[fp->fInputIdx] == 0x0a) {
4983 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4990 case URX_DOTANY_UNIX:
4992 // '.' operator, matches all, but stops at end-of-line.
4993 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4994 if (fp->fInputIdx >= fActiveLimit) {
4995 // At end of input. Match failed. Backtrack out.
4997 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5001 // There is input left. Advance over one char, unless we've hit end-of-line
5003 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5005 // End of line in normal mode. '.' does not match the \n
5006 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5013 fp->fPatIdx = opValue;
5021 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5022 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5023 fp->fPatIdx = opValue; // Then JMP.
5027 // This opcode is used with (x)+, when x can match a zero length string.
5028 // Same as JMP_SAV, except conditional on the match having made forward progress.
5029 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5030 // data address of the input position at the start of the loop.
5032 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5033 int32_t stoOp = (int32_t)pat[opValue-1];
5034 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5035 int32_t frameLoc = URX_VAL(stoOp);
5036 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5037 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5038 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5039 if (prevInputIdx < fp->fInputIdx) {
5040 // The match did make progress. Repeat the loop.
5041 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5042 fp->fPatIdx = opValue;
5043 fp->fExtra[frameLoc] = fp->fInputIdx;
5045 // If the input position did not advance, we do nothing here,
5046 // execution will fall out of the loop.
5052 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5053 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5055 // Pick up the three extra operands that CTR_INIT has, and
5056 // skip the pattern location counter past
5057 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5059 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5060 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5061 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5062 U_ASSERT(minCount>=0);
5063 U_ASSERT(maxCount>=minCount || maxCount==-1);
5064 U_ASSERT(loopLoc>=fp->fPatIdx);
5066 if (minCount == 0) {
5067 fp = StateSave(fp, loopLoc+1, status);
5069 if (maxCount == -1) {
5070 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5071 } else if (maxCount == 0) {
5072 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5079 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5080 int32_t initOp = (int32_t)pat[opValue];
5081 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5082 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5083 int32_t minCount = (int32_t)pat[opValue+2];
5084 int32_t maxCount = (int32_t)pat[opValue+3];
5086 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5087 U_ASSERT(*pCounter == maxCount);
5090 if (*pCounter >= minCount) {
5091 if (maxCount == -1) {
5092 // Loop has no hard upper bound.
5093 // Check that it is progressing through the input, break if it is not.
5094 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5095 if (fp->fInputIdx == *pLastInputIdx) {
5098 *pLastInputIdx = fp->fInputIdx;
5101 fp = StateSave(fp, fp->fPatIdx, status);
5103 fp->fPatIdx = opValue + 4; // Loop back.
5107 case URX_CTR_INIT_NG:
5109 // Initialize a non-greedy loop
5110 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5111 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5113 // Pick up the three extra operands that CTR_INIT_NG has, and
5114 // skip the pattern location counter past
5115 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5117 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5118 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5119 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5120 U_ASSERT(minCount>=0);
5121 U_ASSERT(maxCount>=minCount || maxCount==-1);
5122 U_ASSERT(loopLoc>fp->fPatIdx);
5123 if (maxCount == -1) {
5124 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5127 if (minCount == 0) {
5128 if (maxCount != 0) {
5129 fp = StateSave(fp, fp->fPatIdx, status);
5131 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5136 case URX_CTR_LOOP_NG:
5138 // Non-greedy {min, max} loops
5139 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5140 int32_t initOp = (int32_t)pat[opValue];
5141 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5142 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5143 int32_t minCount = (int32_t)pat[opValue+2];
5144 int32_t maxCount = (int32_t)pat[opValue+3];
5147 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5148 // The loop has matched the maximum permitted number of times.
5149 // Break out of here with no action. Matching will
5150 // continue with the following pattern.
5151 U_ASSERT(*pCounter == maxCount);
5155 if (*pCounter < minCount) {
5156 // We haven't met the minimum number of matches yet.
5157 // Loop back for another one.
5158 fp->fPatIdx = opValue + 4; // Loop back.
5160 // We do have the minimum number of matches.
5162 // If there is no upper bound on the loop iterations, check that the input index
5163 // is progressing, and stop the loop if it is not.
5164 if (maxCount == -1) {
5165 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5166 if (fp->fInputIdx == *pLastInputIdx) {
5169 *pLastInputIdx = fp->fInputIdx;
5172 // Loop Continuation: we will fall into the pattern following the loop
5173 // (non-greedy, don't execute loop body first), but first do
5174 // a state save to the top of the loop, so that a match failure
5175 // in the following pattern will try another iteration of the loop.
5176 fp = StateSave(fp, opValue + 4, status);
5182 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5183 fData[opValue] = fStack->size();
5188 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5189 int32_t newStackSize = (int32_t)fData[opValue];
5190 U_ASSERT(newStackSize <= fStack->size());
5191 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5192 if (newFP == (int64_t *)fp) {
5196 for (i=0; i<fFrameSize; i++) {
5197 newFP[i] = ((int64_t *)fp)[i];
5199 fp = (REStackFrame *)newFP;
5200 fStack->setSize(newStackSize);
5206 U_ASSERT(opValue < fFrameSize);
5207 int64_t groupStartIdx = fp->fExtra[opValue];
5208 int64_t groupEndIdx = fp->fExtra[opValue+1];
5209 U_ASSERT(groupStartIdx <= groupEndIdx);
5210 int64_t inputIndex = fp->fInputIdx;
5211 if (groupStartIdx < 0) {
5212 // This capture group has not participated in the match thus far,
5213 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5216 UBool success = TRUE;
5217 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5218 if (inputIndex >= fActiveLimit) {
5223 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5228 if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5229 inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5230 // Capture group ended with an unpaired lead surrogate.
5231 // Back reference is not permitted to match lead only of a surrogatge pair.
5235 fp->fInputIdx = inputIndex;
5237 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5244 U_ASSERT(opValue < fFrameSize);
5245 int64_t groupStartIdx = fp->fExtra[opValue];
5246 int64_t groupEndIdx = fp->fExtra[opValue+1];
5247 U_ASSERT(groupStartIdx <= groupEndIdx);
5248 if (groupStartIdx < 0) {
5249 // This capture group has not participated in the match thus far,
5250 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5253 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5254 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5256 // Note: if the capture group match was of an empty string the backref
5257 // match succeeds. Verified by testing: Perl matches succeed
5258 // in this case, so we do too.
5260 UBool success = TRUE;
5262 UChar32 captureGroupChar = captureGroupItr.next();
5263 if (captureGroupChar == U_SENTINEL) {
5267 UChar32 inputChar = inputItr.next();
5268 if (inputChar == U_SENTINEL) {
5273 if (inputChar != captureGroupChar) {
5279 if (success && inputItr.inExpansion()) {
5280 // We otained a match by consuming part of a string obtained from
5281 // case-folding a single code point of the input text.
5282 // This does not count as an overall match.
5287 fp->fInputIdx = inputItr.getIndex();
5289 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5294 case URX_STO_INP_LOC:
5296 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5297 fp->fExtra[opValue] = fp->fInputIdx;
5303 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5305 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5306 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5307 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5308 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5309 if (savedInputIdx < fp->fInputIdx) {
5310 fp->fPatIdx = opValue; // JMP
5312 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5319 // Entering a lookahead block.
5320 // Save Stack Ptr, Input Pos.
5321 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5322 fData[opValue] = fStack->size();
5323 fData[opValue+1] = fp->fInputIdx;
5324 fActiveStart = fLookStart; // Set the match region change for
5325 fActiveLimit = fLookLimit; // transparent bounds.
5331 // Leaving a look-ahead block.
5332 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5333 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5334 int32_t stackSize = fStack->size();
5335 int32_t newStackSize = (int32_t)fData[opValue];
5336 U_ASSERT(stackSize >= newStackSize);
5337 if (stackSize > newStackSize) {
5338 // Copy the current top frame back to the new (cut back) top frame.
5339 // This makes the capture groups from within the look-ahead
5340 // expression available.
5341 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5343 for (i=0; i<fFrameSize; i++) {
5344 newFP[i] = ((int64_t *)fp)[i];
5346 fp = (REStackFrame *)newFP;
5347 fStack->setSize(newStackSize);
5349 fp->fInputIdx = fData[opValue+1];
5351 // Restore the active region bounds in the input string; they may have
5352 // been changed because of transparent bounds on a Region.
5353 fActiveStart = fRegionStart;
5354 fActiveLimit = fRegionLimit;
5359 if (fp->fInputIdx < fActiveLimit) {
5361 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5362 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5368 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5372 // Case-insensitive test input against a literal string.
5373 // Strings require two slots in the compiled pattern, one for the
5374 // offset to the string text, and one for the length.
5375 // The compiled string has already been case folded.
5377 const UChar *patternString = litText + opValue;
5379 op = (int32_t)pat[fp->fPatIdx];
5381 opType = URX_TYPE(op);
5382 opValue = URX_VAL(op);
5383 U_ASSERT(opType == URX_STRING_LEN);
5384 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5388 UBool success = TRUE;
5389 int32_t patternStringIdx = 0;
5390 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5391 while (patternStringIdx < patternStringLen) {
5392 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5393 cText = inputIterator.next();
5394 if (cText != cPattern) {
5396 if (cText == U_SENTINEL) {
5402 if (inputIterator.inExpansion()) {
5407 fp->fInputIdx = inputIterator.getIndex();
5409 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5416 // Entering a look-behind block.
5417 // Save Stack Ptr, Input Pos.
5418 // TODO: implement transparent bounds. Ticket #6067
5419 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5420 fData[opValue] = fStack->size();
5421 fData[opValue+1] = fp->fInputIdx;
5422 // Init the variable containing the start index for attempted matches.
5423 fData[opValue+2] = -1;
5424 // Save input string length, then reset to pin any matches to end at
5425 // the current position.
5426 fData[opValue+3] = fActiveLimit;
5427 fActiveLimit = fp->fInputIdx;
5434 // Positive Look-Behind, at top of loop checking for matches of LB expression
5435 // at all possible input starting positions.
5437 // Fetch the min and max possible match lengths. They are the operands
5438 // of this op in the pattern.
5439 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5440 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5441 U_ASSERT(minML <= maxML);
5442 U_ASSERT(minML >= 0);
5444 // Fetch (from data) the last input index where a match was attempted.
5445 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5446 int64_t &lbStartIdx = fData[opValue+2];
5447 if (lbStartIdx < 0) {
5448 // First time through loop.
5449 lbStartIdx = fp->fInputIdx - minML;
5450 if (lbStartIdx > 0) {
5451 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5454 // 2nd through nth time through the loop.
5455 // Back up start position for match by one.
5456 if (lbStartIdx == 0) {
5459 U16_BACK_1(inputBuf, 0, lbStartIdx);
5463 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5464 // We have tried all potential match starting points without
5465 // getting a match. Backtrack out, and out of the
5466 // Look Behind altogether.
5467 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5468 int64_t restoreInputLen = fData[opValue+3];
5469 U_ASSERT(restoreInputLen >= fActiveLimit);
5470 U_ASSERT(restoreInputLen <= fInputLength);
5471 fActiveLimit = restoreInputLen;
5475 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5476 // (successful match will fall off the end of the loop.)
5477 fp = StateSave(fp, fp->fPatIdx-3, status);
5478 fp->fInputIdx = lbStartIdx;
5483 // End of a look-behind block, after a successful match.
5485 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5486 if (fp->fInputIdx != fActiveLimit) {
5487 // The look-behind expression matched, but the match did not
5488 // extend all the way to the point that we are looking behind from.
5489 // FAIL out of here, which will take us back to the LB_CONT, which
5490 // will retry the match starting at another position or fail
5491 // the look-behind altogether, whichever is appropriate.
5492 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5496 // Look-behind match is good. Restore the orignal input string length,
5497 // which had been truncated to pin the end of the lookbehind match to the
5498 // position being looked-behind.
5499 int64_t originalInputLen = fData[opValue+3];
5500 U_ASSERT(originalInputLen >= fActiveLimit);
5501 U_ASSERT(originalInputLen <= fInputLength);
5502 fActiveLimit = originalInputLen;
5509 // Negative Look-Behind, at top of loop checking for matches of LB expression
5510 // at all possible input starting positions.
5512 // Fetch the extra parameters of this op.
5513 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5514 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5515 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5516 continueLoc = URX_VAL(continueLoc);
5517 U_ASSERT(minML <= maxML);
5518 U_ASSERT(minML >= 0);
5519 U_ASSERT(continueLoc > fp->fPatIdx);
5521 // Fetch (from data) the last input index where a match was attempted.
5522 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5523 int64_t &lbStartIdx = fData[opValue+2];
5524 if (lbStartIdx < 0) {
5525 // First time through loop.
5526 lbStartIdx = fp->fInputIdx - minML;
5527 if (lbStartIdx > 0) {
5528 U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5531 // 2nd through nth time through the loop.
5532 // Back up start position for match by one.
5533 if (lbStartIdx == 0) {
5534 lbStartIdx--; // Because U16_BACK is unsafe starting at 0.
5536 U16_BACK_1(inputBuf, 0, lbStartIdx);
5540 if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5541 // We have tried all potential match starting points without
5542 // getting a match, which means that the negative lookbehind as
5543 // a whole has succeeded. Jump forward to the continue location
5544 int64_t restoreInputLen = fData[opValue+3];
5545 U_ASSERT(restoreInputLen >= fActiveLimit);
5546 U_ASSERT(restoreInputLen <= fInputLength);
5547 fActiveLimit = restoreInputLen;
5548 fp->fPatIdx = continueLoc;
5552 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5553 // (successful match will cause a FAIL out of the loop altogether.)
5554 fp = StateSave(fp, fp->fPatIdx-4, status);
5555 fp->fInputIdx = lbStartIdx;
5560 // End of a negative look-behind block, after a successful match.
5562 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5563 if (fp->fInputIdx != fActiveLimit) {
5564 // The look-behind expression matched, but the match did not
5565 // extend all the way to the point that we are looking behind from.
5566 // FAIL out of here, which will take us back to the LB_CONT, which
5567 // will retry the match starting at another position or succeed
5568 // the look-behind altogether, whichever is appropriate.
5569 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5573 // Look-behind expression matched, which means look-behind test as
5576 // Restore the orignal input string length, which had been truncated
5577 // inorder to pin the end of the lookbehind match
5578 // to the position being looked-behind.
5579 int64_t originalInputLen = fData[opValue+3];
5580 U_ASSERT(originalInputLen >= fActiveLimit);
5581 U_ASSERT(originalInputLen <= fInputLength);
5582 fActiveLimit = originalInputLen;
5584 // Restore original stack position, discarding any state saved
5585 // by the successful pattern match.
5586 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5587 int32_t newStackSize = (int32_t)fData[opValue];
5588 U_ASSERT(fStack->size() > newStackSize);
5589 fStack->setSize(newStackSize);
5591 // FAIL, which will take control back to someplace
5592 // prior to entering the look-behind test.
5593 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5599 // Loop Initialization for the optimized implementation of
5600 // [some character set]*
5601 // This op scans through all matching input.
5602 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5604 U_ASSERT(opValue > 0 && opValue < sets->size());
5605 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5606 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5608 // Loop through input, until either the input is exhausted or
5609 // we reach a character that is not a member of the set.
5610 int32_t ix = (int32_t)fp->fInputIdx;
5612 if (ix >= fActiveLimit) {
5617 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5619 if (s8->contains(c) == FALSE) {
5620 U16_BACK_1(inputBuf, 0, ix);
5624 if (s->contains(c) == FALSE) {
5625 U16_BACK_1(inputBuf, 0, ix);
5631 // If there were no matching characters, skip over the loop altogether.
5632 // The loop doesn't run at all, a * op always succeeds.
5633 if (ix == fp->fInputIdx) {
5634 fp->fPatIdx++; // skip the URX_LOOP_C op.
5638 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5639 // must follow. It's operand is the stack location
5640 // that holds the starting input index for the match of this [set]*
5641 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5642 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5643 int32_t stackLoc = URX_VAL(loopcOp);
5644 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5645 fp->fExtra[stackLoc] = fp->fInputIdx;
5648 // Save State to the URX_LOOP_C op that follows this one,
5649 // so that match failures in the following code will return to there.
5650 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5651 fp = StateSave(fp, fp->fPatIdx, status);
5657 case URX_LOOP_DOT_I:
5658 // Loop Initialization for the optimized implementation of .*
5659 // This op scans through all remaining input.
5660 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5662 // Loop through input until the input is exhausted (we reach an end-of-line)
5663 // In DOTALL mode, we can just go straight to the end of the input.
5665 if ((opValue & 1) == 1) {
5666 // Dot-matches-All mode. Jump straight to the end of the string.
5667 ix = (int32_t)fActiveLimit;
5670 // NOT DOT ALL mode. Line endings do not match '.'
5671 // Scan forward until a line ending or end of input.
5672 ix = (int32_t)fp->fInputIdx;
5674 if (ix >= fActiveLimit) {
5679 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5680 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5681 if ((c == 0x0a) || // 0x0a is newline in both modes.
5682 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5683 isLineTerminator(c))) {
5684 // char is a line ending. Put the input pos back to the
5685 // line ending char, and exit the scanning loop.
5686 U16_BACK_1(inputBuf, 0, ix);
5693 // If there were no matching characters, skip over the loop altogether.
5694 // The loop doesn't run at all, a * op always succeeds.
5695 if (ix == fp->fInputIdx) {
5696 fp->fPatIdx++; // skip the URX_LOOP_C op.
5700 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5701 // must follow. It's operand is the stack location
5702 // that holds the starting input index for the match of this .*
5703 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5704 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5705 int32_t stackLoc = URX_VAL(loopcOp);
5706 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5707 fp->fExtra[stackLoc] = fp->fInputIdx;
5710 // Save State to the URX_LOOP_C op that follows this one,
5711 // so that match failures in the following code will return to there.
5712 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5713 fp = StateSave(fp, fp->fPatIdx, status);
5721 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5722 backSearchIndex = (int32_t)fp->fExtra[opValue];
5723 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5724 if (backSearchIndex == fp->fInputIdx) {
5725 // We've backed up the input idx to the point that the loop started.
5726 // The loop is done. Leave here without saving state.
5727 // Subsequent failures won't come back here.
5730 // Set up for the next iteration of the loop, with input index
5731 // backed up by one from the last time through,
5732 // and a state save to this instruction in case the following code fails again.
5733 // (We're going backwards because this loop emulates stack unwinding, not
5734 // the initial scan forward.)
5735 U_ASSERT(fp->fInputIdx > 0);
5737 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5739 if (prevC == 0x0a &&
5740 fp->fInputIdx > backSearchIndex &&
5741 inputBuf[fp->fInputIdx-1] == 0x0d) {
5742 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5743 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5744 // .*, stepping back over CRLF pair.
5745 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5750 fp = StateSave(fp, fp->fPatIdx-1, status);
5757 // Trouble. The compiled pattern contains an entry with an
5758 // unrecognized type tag.
5762 if (U_FAILURE(status)) {
5771 fLastMatchEnd = fMatchEnd;
5772 fMatchStart = startIdx;
5773 fMatchEnd = fp->fInputIdx;
5776 #ifdef REGEX_RUN_DEBUG
5779 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5781 printf("No match\n\n");
5786 fFrame = fp; // The active stack frame when the engine stopped.
5787 // Contains the capture group results that we need to
5794 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5798 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS