1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
12 // ICU Regular Expressions test, part of intltest.
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
50 #define SUPPORT_MUTATING_INPUT_STRING 0
52 //---------------------------------------------------------------------------
54 // Test class boilerplate
56 //---------------------------------------------------------------------------
// Default constructor of the regular-expression test suite.
57 RegexTest::RegexTest()
// Destructor of the regular-expression test suite.
62 RegexTest::~RegexTest()
// Test dispatcher: maps a numeric test index to a test name and, on request,
// runs the corresponding test method.
//   index - selects the test case.
//   exec  - when true the selected test is executed; in every case `name` is set.
//   name  - output; receives the display name of the selected test.
//   The fourth parameter (par) is unnamed and therefore unused.
68 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
// Suite banner is logged only when a test is actually going to run.
70 if (exec) logln("TestSuite RegexTest: ");
73 case 0: name = "Basic";
76 case 1: name = "API_Match";
77 if (exec) API_Match();
79 case 2: name = "API_Replace";
80 if (exec) API_Replace();
82 case 3: name = "API_Pattern";
83 if (exec) API_Pattern();
// The code guarded here is compiled only when file I/O support is enabled.
86 #if !UCONFIG_NO_FILE_IO
93 case 5: name = "Errors";
96 case 6: name = "PerlTests";
97 if (exec) PerlTests();
99 case 7: name = "Callbacks";
100 if (exec) Callbacks();
102 case 8: name = "FindProgressCallbacks";
103 if (exec) FindProgressCallbacks();
105 case 9: name = "Bug 6149";
108 case 10: name = "UTextBasic";
109 if (exec) UTextBasic();
111 case 11: name = "API_Match_UTF8";
112 if (exec) API_Match_UTF8();
114 case 12: name = "API_Replace_UTF8";
115 if (exec) API_Replace_UTF8();
117 case 13: name = "API_Pattern_UTF8";
118 if (exec) API_Pattern_UTF8();
120 case 14: name = "PerlTestsUTF8";
121 if (exec) PerlTestsUTF8();
123 case 15: name = "PreAllocatedUTextCAPI";
124 if (exec) PreAllocatedUTextCAPI();
126 case 16: name = "Bug 7651";
129 case 17: name = "Bug 7740";
132 case 18: name = "Bug 8479";
135 case 19: name = "Bug 7029";
138 case 20: name = "CheckInvBufSize";
139 if (exec) CheckInvBufSize();
141 case 21: name = "Bug 9283";
144 case 22: name = "Bug10459";
145 if (exec) Bug10459();
147 case 23: name = "TestCaseInsensitiveStarters";
148 if (exec) TestCaseInsensitiveStarters();
150 case 24: name = "TestBug11049";
151 if (exec) TestBug11049();
153 case 25: name = "TestBug11371";
154 if (exec) TestBug11371();
156 case 26: name = "TestBug11480";
157 if (exec) TestBug11480();
159 case 27: name = "NamedCapture";
160 if (exec) NamedCapture();
162 case 28: name = "NamedCaptureLimits";
163 if (exec) NamedCaptureLimits();
166 break; //needed to end loop
173 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
175 * @see utext_openUTF8
// Forward declaration: the definition appears later in this file, after the
// assert helpers that call it.
177 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
179 //---------------------------------------------------------------------------
181 // Error Checking / Reporting macros used in all of the tests.
183 //---------------------------------------------------------------------------
// Renders the contents of a UText into `buf` in a printable form for log output.
//   buf    - destination character buffer.
//   bufLen - size of `buf`; used as the limit for the write cursor.
//   text   - text to render; its native index is saved on entry and restored on exit.
185 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
186 int64_t oldIndex = utext_getNativeIndex(text);
187 utext_setNativeIndex(text, 0);
189 UChar32 c = utext_next32From(text, 0);
// Consume code points until the text is exhausted or the write cursor reaches the end of buf.
190 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
// NOTE(review): the upper bound is exclusive, so 0x7e ('~') does not take this
// branch - confirm that is intended.
191 if (0x000020<=c && c<0x00007e) {
// Hex form "U+XXXX" - presumably the fallback for code points outside the range above.
// NOTE(review): this writes at least six characters plus a terminator, while the
// loop guard only checks that bufPtr is still inside buf; verify it cannot overrun
// near the end of the buffer.
195 sprintf(bufPtr,"U+%04X", c);
// Moves the cursor to the last character just written (presumably a later
// increment steps past it - confirm).
196 bufPtr+= strlen(bufPtr)-1;
202 c = UTEXT_NEXT32(text);
// EBCDIC builds only: the buffer is passed through uprv_eastrncpy into a temporary
// and copied back (presumably an ASCII-to-EBCDIC conversion so the text prints
// correctly - confirm).
205 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
// NOTE(review): verify that ebuf is released after the copy-back.
206 char *ebuf = (char*)malloc(bufLen);
207 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
208 uprv_strncpy(buf, ebuf, bufLen);
// Put the caller's iteration position back.
211 utext_setNativeIndex(text, oldIndex);
// Scratch buffer written by extractToAssertBuf(); every call overwrites the
// previous contents.
215 static char ASSERT_BUF[1024];
// Builds a char rendering of `message` in ASSERT_BUF for use in assertion
// failure output (presumably returns ASSERT_BUF - confirm).
//   message - the string to render; an empty string yields a fixed placeholder.
217 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
218 if(message.length()==0) {
219 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
// Otherwise escape the message first via IntlTest::prettify.
222 IntlTest::prettify(message,buf);
223 if(buf.length()==0) {
224 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
// Extract into the buffer, leaving room for the terminator written at the end.
226 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
// An empty result from extract() falls back to appending one hex escape per character.
227 if(ASSERT_BUF[0]==0) {
229 for(int32_t i=0;i<buf.length();i++) {
// NOTE(review): each iteration appends to ASSERT_BUF with no visible check against
// sizeof(ASSERT_BUF); verify a long message cannot overflow the buffer.
231 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
// Force termination regardless of which path filled the buffer.
236 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
// Logs the printable form of a UText (truncated to a 200-byte local buffer)
// together with the call site and the expression text.
240 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
// Checks the in-scope variable `status`; on failure reports it and RETURNS from
// the enclosing function, so it can only be used where a bare `return;` is valid.
242 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
243 __FILE__, __LINE__, u_errorName(status)); return;}}
// Reports an error when `expr` is FALSE; does not return, so the test continues.
245 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
// Evaluates `expr` against a fresh local `status` (which shadows any outer
// variable of that name) and reports an error unless it ends up equal to `errcode`.
247 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
248 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
249 __LINE__, u_errorName(errcode), u_errorName(status));};}
// Like REGEX_CHECK_STATUS but also prints the caller-supplied line and does NOT return.
251 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
252 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
// Like REGEX_ASSERT but also prints the caller-supplied line and RETURNS on failure.
254 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
255 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
257 // expected: const char * , restricted to invariant characters.
258 // actual: const UnicodeString &
259 #define REGEX_ASSERT_UNISTR(expected, actual) { \
260 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
261 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
262 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
// Walks two UTexts in parallel, one code point at a time, starting from index 0
// (presumably returns TRUE when both hold the same sequence - confirm).
// Both texts have their native index reset, so the callers' iteration positions
// are not preserved.
265 static UBool testUTextEqual(UText *uta, UText *utb) {
268 utext_setNativeIndex(uta, 0);
269 utext_setNativeIndex(utb, 0);
271 ca = utext_next32(uta);
272 cb = utext_next32(utb);
// The loop ends once the first text is exhausted.
276 } while (ca != U_SENTINEL);
282 * @param expected expected text in UTF-8 (not platform) codepage
// Reports a test error when `actual` does not hold the same text as `expected`.
//   expected  - NUL-terminated UTF-8 string (opened with utext_openUTF8, length -1).
//   actual    - text under test; its native index is reset to 0 before comparing.
//   file/line - call-site location included in every message.
284 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
285 UErrorCode status = U_ZERO_ERROR;
286 UText expectedText = UTEXT_INITIALIZER;
287 utext_openUTF8(&expectedText, expected, -1, &status);
288 if(U_FAILURE(status)) {
// NOTE(review): strlen() yields size_t but is formatted with %d here and below -
// confirm this is safe for errln's formatting on all supported platforms.
289 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
// Sanity check: a non-empty expected string must not open as a zero-length UText.
292 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
293 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
296 utext_setNativeIndex(actual, 0);
297 if (!testUTextEqual(&expectedText, actual)) {
// On mismatch, render both texts in printable form for the failure message.
298 char buf[201 /*21*/];
299 char expectedBuf[201];
300 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
301 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
302 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
304 utext_close(&expectedText);
307 * @param expected invariant (platform local text) input
// Same check as assertUText(), except that `expected` is opened through
// regextst_openUTF8FromInvariant() instead of being used as UTF-8 directly.
//   expected  - NUL-terminated string in the platform's invariant character set.
//   actual    - text under test; its native index is reset to 0 before comparing.
//   file/line - call-site location included in every message.
310 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
311 UErrorCode status = U_ZERO_ERROR;
312 UText expectedText = UTEXT_INITIALIZER;
313 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
314 if(U_FAILURE(status)) {
// NOTE(review): strlen() yields size_t but is formatted with %d - confirm this is
// safe for errln's formatting on all supported platforms.
315 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
318 utext_setNativeIndex(actual, 0);
319 if (!testUTextEqual(&expectedText, actual)) {
// On mismatch, render both texts in printable form for the failure message.
320 char buf[201 /*21*/];
321 char expectedBuf[201];
322 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
323 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
324 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
326 utext_close(&expectedText);
330 * Assumes utf-8 input
// Call-site wrappers that forward __FILE__ and __LINE__ to assertUText() and
// assertUTextInvariant() respectively.
332 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
334 * Assumes Invariant input
336 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
339 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
340 * passed into utext_openUTF8. An error will be given if
341 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
// Capacity, in bytes, of the conversion buffer inv_buf.
344 #define INV_BUFSIZ 2048 /* increase this if too small */
// Offset into inv_buf at which the next converted string is placed.
346 static int64_t inv_next=0;
// The conversion buffer itself exists only on non-ASCII-family platforms.
348 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
349 static char inv_buf[INV_BUFSIZ];
// Opens a UTF-8 UText over an invariant-character string.
//   ut     - UText to open into (passed straight to utext_openUTF8).
//   inv    - source string in the compilation code page.
//   length - byte length of `inv`, or -1 to use strlen(inv).
//   status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when inv_buf has
//            no room left for the converted string.
// Returns the result of utext_openUTF8 on the paths that reach it.
352 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
353 if(length==-1) length=strlen(inv);
// ASCII-family platforms: the input bytes are handed to utext_openUTF8 unchanged.
354 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
356 return utext_openUTF8(ut, inv, length, status);
// Other platforms: check that the string plus one extra byte fits in what is left of inv_buf.
358 if(inv_next+length+1>INV_BUFSIZ) {
// NOTE(review): inv_next and length are int64_t, but this expression is printed
// with %d - confirm the format specifier (same for the "used=" message below).
359 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
360 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
361 *status = U_MEMORY_ALLOCATION_ERROR;
// Convert into the next free region of inv_buf (uprv_aestrncpy presumably maps
// the platform code page to ASCII - confirm).
365 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
366 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
370 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
// The UText refers to the converted bytes held in inv_buf.
373 return utext_openUTF8(ut, (const char*)buf, length, status);
378 //---------------------------------------------------------------------------
380 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
381 // for the LookingAt() and Match() functions.
384 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
386 // The expected results are UBool - TRUE or FALSE.
387 // The input text is unescaped. The pattern is not.
390 //---------------------------------------------------------------------------
// Runs the same lookingAt()/matches() expectation through both the
// UnicodeString-based and the UTF-8 UText-based helpers, tagging failures with
// the call-site line.
392 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
// Compiles `pat`, applies it to the unescaped form of `text`, and compares the
// results of lookingAt() and matches() with the expected values.
//   pat     - pattern, invariant characters only (converted with US_INV).
//   text    - input, invariant characters only; unescape() is applied before matching.
//   looking - expected result of lookingAt().
//   match   - expected result of matches().
//   line    - source line of the caller, used in failure messages.
394 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
// NOTE(review): `pattern` looks unused below - `patString` is what gets compiled. Confirm.
395 const UnicodeString pattern(pat, -1, US_INV);
396 const UnicodeString inputText(text, -1, US_INV);
397 UErrorCode status = U_ZERO_ERROR;
399 RegexPattern *REPattern = NULL;
400 RegexMatcher *REMatcher = NULL;
403 UnicodeString patString(pat, -1, US_INV);
404 REPattern = RegexPattern::compile(patString, 0, pe, status);
405 if (U_FAILURE(status)) {
406 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
407 line, u_errorName(status));
// Debugging hook: dumps the compiled pattern for the call made from source line 376.
410 if (line==376) { REPattern->dumpPattern();}
412 UnicodeString inputString(inputText);
413 UnicodeString unEscapedInput = inputString.unescape();
414 REMatcher = REPattern->matcher(unEscapedInput, status);
415 if (U_FAILURE(status)) {
416 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
417 line, u_errorName(status));
// First expectation: lookingAt().
422 actualmatch = REMatcher->lookingAt(status);
423 if (U_FAILURE(status)) {
424 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
425 line, u_errorName(status));
428 if (actualmatch != looking) {
429 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
// Second expectation: matches(). The status is cleared first so an earlier
// failure does not carry over into this call.
433 status = U_ZERO_ERROR;
434 actualmatch = REMatcher->matches(status);
435 if (U_FAILURE(status)) {
436 errln("RegexTest failure in matches() at line %d. Status = %s\n",
437 line, u_errorName(status));
440 if (actualmatch != match) {
441 errln("RegexTest: wrong return from matches() at line %d.\n", line);
// When the test has failed, dump the compiled pattern to help diagnose it.
445 if (retVal == FALSE) {
446 REPattern->dumpPattern();
// UTF-8 / UText counterpart of doRegexLMTest(): compiles the pattern from a
// UText, converts the unescaped input to UTF-8, and checks lookingAt() and
// matches() against the expected values.
//   pat     - pattern, invariant characters only.
//   text    - input, invariant characters only; unescape() is applied before conversion.
//   looking - expected result of lookingAt().
//   match   - expected result of matches().
//   line    - source line of the caller, used in failure messages.
// Returns TRUE without testing when the input cannot be converted to UTF-8.
455 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
456 UText pattern = UTEXT_INITIALIZER;
457 int32_t inputUTF8Length;
458 char *textChars = NULL;
459 UText inputText = UTEXT_INITIALIZER;
460 UErrorCode status = U_ZERO_ERROR;
462 RegexPattern *REPattern = NULL;
463 RegexMatcher *REMatcher = NULL;
466 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
467 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
468 if (U_FAILURE(status)) {
469 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
470 line, u_errorName(status));
474 UnicodeString inputString(text, -1, US_INV);
475 UnicodeString unEscapedInput = inputString.unescape();
// The converter is given the STOP callback, so unconvertible input surfaces as
// an error status.
476 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
477 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
// Preflight: extract into a NULL buffer of size 0 to learn the required length;
// U_BUFFER_OVERFLOW_ERROR is the expected outcome and is cleared below.
479 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
480 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
481 // UTF-8 does not allow unpaired surrogates, so this could actually happen
482 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
// NOTE(review): this early return happens after `pattern` was opened and
// REPattern was compiled; verify they are released on this path.
483 return TRUE; // not a failure of the Regex engine
485 status = U_ZERO_ERROR; // buffer overflow
// Heap buffer sized from the preflight length plus one for the terminator.
// NOTE(review): verify textChars is deleted on every exit path.
486 textChars = new char[inputUTF8Length+1];
487 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
488 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
// Creates a matcher with no input, then attaches the UText via reset().
// NOTE(review): status is only checked after the chained reset(); confirm that
// matcher(status) cannot return NULL here.
490 REMatcher = &REPattern->matcher(status)->reset(&inputText);
491 if (U_FAILURE(status)) {
492 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
493 line, u_errorName(status));
// First expectation: lookingAt().
498 actualmatch = REMatcher->lookingAt(status);
499 if (U_FAILURE(status)) {
500 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
501 line, u_errorName(status));
504 if (actualmatch != looking) {
505 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
// Second expectation: matches(), with the status cleared first.
509 status = U_ZERO_ERROR;
510 actualmatch = REMatcher->matches(status);
511 if (U_FAILURE(status)) {
512 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
513 line, u_errorName(status));
516 if (actualmatch != match) {
517 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
// When the test has failed, dump the compiled pattern to help diagnose it.
521 if (retVal == FALSE) {
522 REPattern->dumpPattern();
// Close both stack-allocated UTexts.
527 utext_close(&inputText);
528 utext_close(&pattern);
535 //---------------------------------------------------------------------------
537 // REGEX_ERR Macro + invocation function to simplify writing tests
538 // regex tests for incorrect patterns
541 // REGEX_ERR("pattern", expected error line, column, expected status);
543 //---------------------------------------------------------------------------
// Forwards to regex_err() with the call-site line appended. The macro body ends
// in a semicolon, so an invocation followed by ';' yields an extra empty statement.
544 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
// Compiles `pat` twice - from a UnicodeString and from a UTF-8 UText - and
// checks the resulting status and, on error, the reported error position.
//   pat            - pattern text.
//   errLine/errCol - expected UParseError line and offset when compilation fails.
//   expectedStatus - status the compile call is expected to produce.
//   line           - source line of the caller, used in failure messages.
546 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
547 UErrorCode expectedStatus, int32_t line) {
// NOTE(review): `pattern` looks unused below (patString is what gets compiled),
// and both strings are built without US_INV, unlike doRegexLMTest(). Confirm.
548 UnicodeString pattern(pat);
550 UErrorCode status = U_ZERO_ERROR;
552 RegexPattern *callerPattern = NULL;
555 // Compile the caller's pattern
557 UnicodeString patString(pat);
558 callerPattern = RegexPattern::compile(patString, 0, pe, status);
559 if (status != expectedStatus) {
560 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
// The error position is compared only when compilation reported something
// other than U_ZERO_ERROR.
562 if (status != U_ZERO_ERROR) {
563 if (pe.line != errLine || pe.offset != errCol) {
564 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
565 line, errLine, errCol, pe.line, pe.offset);
570 delete callerPattern;
573 // Compile again, using a UTF-8-based UText
// NOTE(review): no reset of `status` is visible between the two compilations, so
// a failure from the first compile may be passed into the calls below. Confirm
// this is intended.
575 UText patternText = UTEXT_INITIALIZER;
576 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
577 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
578 if (status != expectedStatus) {
579 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
581 if (status != U_ZERO_ERROR) {
582 if (pe.line != errLine || pe.offset != errCol) {
583 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
584 line, errLine, errCol, pe.line, pe.offset);
589 delete callerPattern;
590 utext_close(&patternText);
595 //---------------------------------------------------------------------------
597 // Basic Check for basic functionality of regex pattern matching.
598 // Avoid the use of REGEX_FIND test macro, which has
599 // substantial dependencies on basic Regex functionality.
601 //---------------------------------------------------------------------------
// Basic sanity tests of pattern matching. Each REGEX_TESTLM row gives a pattern,
// an input, the expected lookingAt() result and the expected matches() result,
// and is run through both the UnicodeString and the UTF-8 code paths.
602 void RegexTest::Basic() {
606 // Debug - slide failing test cases early
// NOTE(review): the statements down to the commented-out REGEX_FIND lines look
// like scratch debugging code (they print with printf and assert nothing);
// presumably disabled by a preprocessor guard - confirm.
610 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
612 UErrorCode status = U_ZERO_ERROR;
613 RegexPattern *pattern;
614 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
615 pattern->dumpPattern();
616 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
617 UBool result = m->find();
618 printf("result = %d\n", result);
619 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
620 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
627 // Pattern with parentheses
629 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
630 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
631 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
// Patterns using the * quantifier (on a group, on a single char, and on '.').
636 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
637 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
638 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
639 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
640 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
642 REGEX_TESTLM("a*", "", TRUE, TRUE);
643 REGEX_TESTLM("a*", "b", TRUE, FALSE);
// Patterns using '.'.
649 REGEX_TESTLM(".", "abc", TRUE, FALSE);
650 REGEX_TESTLM("...", "abc", TRUE, TRUE);
651 REGEX_TESTLM("....", "abc", FALSE, FALSE);
652 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
653 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
654 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
655 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
656 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
659 // Patterns with * applied to chars at end of literal string
661 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
662 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
665 // Supplemental chars match as single chars, not a pair of surrogates.
667 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
668 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
669 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
673 // UnicodeSets in the pattern
675 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
676 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
677 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
678 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
679 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
680 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
682 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
683 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
684 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
685 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
686 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
689 // OR operator in patterns
691 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
692 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
693 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
694 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
696 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
697 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
698 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
699 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
700 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
701 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
// Patterns using the + quantifier (one or more).
706 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
707 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
708 REGEX_TESTLM("b+", "", FALSE, FALSE);
709 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
710 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
711 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
// Patterns using the ? quantifier (zero or one).
716 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
717 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
718 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
719 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
720 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
721 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
722 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
723 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
724 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
727 // Escape sequences that become single literal chars, handled internally
728 // by ICU's Unescape.
731 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
732 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
733 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
734 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
735 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
736 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
737 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
738 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
739 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
740 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
742 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
743 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
745 // Escape of special chars in patterns
746 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
750 //---------------------------------------------------------------------------
752 // UTextBasic Check for quirks that are specific to the UText
755 //---------------------------------------------------------------------------
// Checks that RegexMatcher::inputText() yields the expected text after
// reset(UText*), and again after the matcher is reset with its own inputText().
756 void RegexTest::UTextBasic() {
// "abc" spelled as explicit byte values (presumably so the bytes are valid
// UTF-8 regardless of the compilation code page - confirm).
757 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
758 UErrorCode status = U_ZERO_ERROR;
759 UText pattern = UTEXT_INITIALIZER;
760 utext_openUTF8(&pattern, str_abc, -1, &status);
761 RegexMatcher matcher(&pattern, 0, status);
764 UText input = UTEXT_INITIALIZER;
765 utext_openUTF8(&input, str_abc, -1, &status);
767 matcher.reset(&input);
769 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
// Feed the matcher's own input text back into reset(); the text must be unchanged.
771 matcher.reset(matcher.inputText());
773 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
// NOTE(review): verify that `input` is closed as well as `pattern`.
775 utext_close(&pattern);
780 //---------------------------------------------------------------------------
782 // API_Match Test that the API for class RegexMatcher
783 // is present and nominally working, but excluding functions
784 // implementing replace operations.
786 //---------------------------------------------------------------------------
787 void RegexTest::API_Match() {
789 UErrorCode status=U_ZERO_ERROR;
793 // Debug - slide failing test cases early
802 // Simple pattern compilation
805 UnicodeString re("abc");
807 pat2 = RegexPattern::compile(re, flags, pe, status);
810 UnicodeString inStr1 = "abcdef this is a test";
811 UnicodeString instr2 = "not abc";
812 UnicodeString empty = "";
816 // Matcher creation and reset.
818 RegexMatcher *m1 = pat2->matcher(inStr1, status);
820 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
821 REGEX_ASSERT(m1->input() == inStr1);
823 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
824 REGEX_ASSERT(m1->input() == instr2);
826 REGEX_ASSERT(m1->input() == inStr1);
827 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
829 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
830 REGEX_ASSERT(m1->input() == empty);
831 REGEX_ASSERT(&m1->pattern() == pat2);
834 // reset(pos, status)
837 m1->reset(4, status);
839 REGEX_ASSERT(m1->input() == inStr1);
840 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
842 m1->reset(-1, status);
843 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
844 status = U_ZERO_ERROR;
846 m1->reset(0, status);
848 status = U_ZERO_ERROR;
850 int32_t len = m1->input().length();
851 m1->reset(len-1, status);
853 status = U_ZERO_ERROR;
855 m1->reset(len, status);
857 status = U_ZERO_ERROR;
859 m1->reset(len+1, status);
860 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
861 status = U_ZERO_ERROR;
864 // match(pos, status)
867 REGEX_ASSERT(m1->matches(4, status) == TRUE);
869 REGEX_ASSERT(m1->matches(3, status) == FALSE);
871 REGEX_ASSERT(m1->matches(5, status) == FALSE);
872 REGEX_ASSERT(m1->matches(4, status) == TRUE);
873 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
874 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
876 // Match() at end of string should fail, but should not
878 status = U_ZERO_ERROR;
879 len = m1->input().length();
880 REGEX_ASSERT(m1->matches(len, status) == FALSE);
883 // Match beyond end of string should fail with an error.
884 status = U_ZERO_ERROR;
885 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
888 // Successful match at end of string.
890 status = U_ZERO_ERROR;
891 RegexMatcher m("A?", 0, status); // will match zero length string.
894 len = inStr1.length();
895 REGEX_ASSERT(m.matches(len, status) == TRUE);
898 REGEX_ASSERT(m.matches(0, status) == TRUE);
904 // lookingAt(pos, status)
906 status = U_ZERO_ERROR;
907 m1->reset(instr2); // "not abc"
908 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
909 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
910 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
911 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
912 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
913 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
914 status = U_ZERO_ERROR;
915 len = m1->input().length();
916 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
918 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
928 // RegexMatcher::start();
929 // RegexMatcher::end();
930 // RegexMatcher::groupCount();
935 UErrorCode status=U_ZERO_ERROR;
937 UnicodeString re("01(23(45)67)(.*)");
938 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
940 UnicodeString data = "0123456789";
942 RegexMatcher *matcher = pat->matcher(data, status);
944 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
945 static const int32_t matchStarts[] = {0, 2, 4, 8};
946 static const int32_t matchEnds[] = {10, 8, 6, 10};
948 for (i=0; i<4; i++) {
949 int32_t actualStart = matcher->start(i, status);
951 if (actualStart != matchStarts[i]) {
952 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
953 __LINE__, i, matchStarts[i], actualStart);
955 int32_t actualEnd = matcher->end(i, status);
957 if (actualEnd != matchEnds[i]) {
958 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
959 __LINE__, i, matchEnds[i], actualEnd);
963 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
964 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
966 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
967 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
969 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
971 matcher->lookingAt(status);
972 REGEX_ASSERT(matcher->group(status) == "0123456789");
973 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
974 REGEX_ASSERT(matcher->group(1, status) == "234567" );
975 REGEX_ASSERT(matcher->group(2, status) == "45" );
976 REGEX_ASSERT(matcher->group(3, status) == "89" );
978 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
979 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
981 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
994 UErrorCode status=U_ZERO_ERROR;
996 UnicodeString re("abc");
997 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
999 UnicodeString data = ".abc..abc...abc..";
1000 // 012345678901234567
1002 RegexMatcher *matcher = pat->matcher(data, status);
1004 REGEX_ASSERT(matcher->find());
1005 REGEX_ASSERT(matcher->start(status) == 1);
1006 REGEX_ASSERT(matcher->find());
1007 REGEX_ASSERT(matcher->start(status) == 6);
1008 REGEX_ASSERT(matcher->find());
1009 REGEX_ASSERT(matcher->start(status) == 12);
1010 REGEX_ASSERT(matcher->find() == FALSE);
1011 REGEX_ASSERT(matcher->find() == FALSE);
1014 REGEX_ASSERT(matcher->find());
1015 REGEX_ASSERT(matcher->start(status) == 1);
1017 REGEX_ASSERT(matcher->find(0, status));
1018 REGEX_ASSERT(matcher->start(status) == 1);
1019 REGEX_ASSERT(matcher->find(1, status));
1020 REGEX_ASSERT(matcher->start(status) == 1);
1021 REGEX_ASSERT(matcher->find(2, status));
1022 REGEX_ASSERT(matcher->start(status) == 6);
1023 REGEX_ASSERT(matcher->find(12, status));
1024 REGEX_ASSERT(matcher->start(status) == 12);
1025 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1026 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1027 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1028 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1030 status = U_ZERO_ERROR;
1031 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032 status = U_ZERO_ERROR;
1033 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1035 REGEX_ASSERT(matcher->groupCount() == 0);
1043 // find, with \G in pattern (true if at the end of a previous match).
1048 UErrorCode status=U_ZERO_ERROR;
1050 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1051 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1053 UnicodeString data = ".abcabc.abc..";
1054 // 012345678901234567
1056 RegexMatcher *matcher = pat->matcher(data, status);
1058 REGEX_ASSERT(matcher->find());
1059 REGEX_ASSERT(matcher->start(status) == 0);
1060 REGEX_ASSERT(matcher->start(1, status) == -1);
1061 REGEX_ASSERT(matcher->start(2, status) == 1);
1063 REGEX_ASSERT(matcher->find());
1064 REGEX_ASSERT(matcher->start(status) == 4);
1065 REGEX_ASSERT(matcher->start(1, status) == 4);
1066 REGEX_ASSERT(matcher->start(2, status) == -1);
1074 // find with zero length matches, match position should bump ahead
1075 // to prevent loops.
1079 UErrorCode status=U_ZERO_ERROR;
1080 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1081 // using an always-true look-ahead.
1083 UnicodeString s(" ");
1086 if (m.find() == FALSE) {
1089 REGEX_ASSERT(m.start(status) == i);
1090 REGEX_ASSERT(m.end(status) == i);
1094 // Check that the bump goes over surrogate pairs OK
1095 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1099 if (m.find() == FALSE) {
1102 REGEX_ASSERT(m.start(status) == i);
1103 REGEX_ASSERT(m.end(status) == i);
1105 REGEX_ASSERT(i==10);
1108 // find() loop breaking test.
1109 // with pattern of /.?/, should see a series of one char matches, then a single
1110 // match of zero length at the end of the input string.
1112 UErrorCode status=U_ZERO_ERROR;
1113 RegexMatcher m(".?", 0, status);
1115 UnicodeString s(" ");
1118 if (m.find() == FALSE) {
1121 REGEX_ASSERT(m.start(status) == i);
1122 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1129 // Matchers with no input string behave as if they had an empty input string.
1133 UErrorCode status = U_ZERO_ERROR;
1134 RegexMatcher m(".?", 0, status);
1136 REGEX_ASSERT(m.find());
1137 REGEX_ASSERT(m.start(status) == 0);
1138 REGEX_ASSERT(m.input() == "");
1141 UErrorCode status = U_ZERO_ERROR;
1142 RegexPattern *p = RegexPattern::compile(".", 0, status);
1143 RegexMatcher *m = p->matcher(status);
1146 REGEX_ASSERT(m->find() == FALSE);
1147 REGEX_ASSERT(m->input() == "");
1156 UErrorCode status = U_ZERO_ERROR;
1157 UnicodeString testString("This is test data");
1158 RegexMatcher m(".*", testString, 0, status);
1160 REGEX_ASSERT(m.regionStart() == 0);
1161 REGEX_ASSERT(m.regionEnd() == testString.length());
1162 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1163 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1165 m.region(2,4, status);
1167 REGEX_ASSERT(m.matches(status));
1168 REGEX_ASSERT(m.start(status)==2);
1169 REGEX_ASSERT(m.end(status)==4);
1173 REGEX_ASSERT(m.regionStart() == 0);
1174 REGEX_ASSERT(m.regionEnd() == testString.length());
1176 UnicodeString shorterString("short");
1177 m.reset(shorterString);
1178 REGEX_ASSERT(m.regionStart() == 0);
1179 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1181 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1182 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184 REGEX_ASSERT(&m == &m.reset());
1185 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1187 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189 REGEX_ASSERT(&m == &m.reset());
1190 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1192 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1193 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195 REGEX_ASSERT(&m == &m.reset());
1196 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1198 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200 REGEX_ASSERT(&m == &m.reset());
1201 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1206 // hitEnd() and requireEnd()
1209 UErrorCode status = U_ZERO_ERROR;
1210 UnicodeString testString("aabb");
1211 RegexMatcher m1(".*", testString, 0, status);
1212 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1213 REGEX_ASSERT(m1.hitEnd() == TRUE);
1214 REGEX_ASSERT(m1.requireEnd() == FALSE);
1217 status = U_ZERO_ERROR;
1218 RegexMatcher m2("a*", testString, 0, status);
1219 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1220 REGEX_ASSERT(m2.hitEnd() == FALSE);
1221 REGEX_ASSERT(m2.requireEnd() == FALSE);
1224 status = U_ZERO_ERROR;
1225 RegexMatcher m3(".*$", testString, 0, status);
1226 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1227 REGEX_ASSERT(m3.hitEnd() == TRUE);
1228 REGEX_ASSERT(m3.requireEnd() == TRUE);
1234 // Compilation error on reset with UChar *
1235 // These were a hazard that people were stumbling over with runtime errors.
1236 // Changed them to compiler errors by adding private methods that more closely
1237 // matched the incorrect use of the functions.
1241 UErrorCode status = U_ZERO_ERROR;
1242 UChar ucharString[20];
1243 RegexMatcher m(".", 0, status);
1244 m.reset(ucharString); // should not compile.
1246 RegexPattern *p = RegexPattern::compile(".", 0, status);
1247 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1249 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1255 // Note: These tests will need to be changed when the regexp engine is
1256 // able to detect and cut short the exponential time behavior on
1257 // this type of match.
1260 UErrorCode status = U_ZERO_ERROR;
1261 // Enough 'a's in the string to cause the match to time out.
1262 // (Each additional 'a' doubles the time)
1263 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1264 RegexMatcher matcher("(a+)+b", testString, 0, status);
1266 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1267 matcher.setTimeLimit(100, status);
1268 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1269 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1270 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1273 UErrorCode status = U_ZERO_ERROR;
1274 // Few enough 'a's to slip in under the time limit.
1275 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1276 RegexMatcher matcher("(a+)+b", testString, 0, status);
1278 matcher.setTimeLimit(100, status);
1279 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1287 UErrorCode status = U_ZERO_ERROR;
1288 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1290 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1291 // of the '+', and makes the stack frames larger.
1292 RegexMatcher matcher("(A)+A$", testString, 0, status);
1294 // With the default stack, this match should fail to run
1295 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1296 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1298 // With unlimited stack, it should run
1299 status = U_ZERO_ERROR;
1300 matcher.setStackLimit(0, status);
1302 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1304 REGEX_ASSERT(matcher.getStackLimit() == 0);
1306 // With a limited stack, the match should fail
1307 status = U_ZERO_ERROR;
1308 matcher.setStackLimit(10000, status);
1309 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1310 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1311 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1314 // A pattern that doesn't save state should work with
1315 // a minimal sized stack
1317 UErrorCode status = U_ZERO_ERROR;
1318 UnicodeString testString = "abc";
1319 RegexMatcher matcher("abc", testString, 0, status);
1321 matcher.setStackLimit(30, status);
1323 REGEX_ASSERT(matcher.matches(status) == TRUE);
1325 REGEX_ASSERT(matcher.getStackLimit() == 30);
1327 // Negative stack sizes should fail
1328 status = U_ZERO_ERROR;
1329 matcher.setStackLimit(1000, status);
1331 matcher.setStackLimit(-1, status);
1332 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1333 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1344 //---------------------------------------------------------------------------
1346 // API_Replace API test for class RegexMatcher, testing the
1347 // Replace family of functions.
1349 //---------------------------------------------------------------------------
1350 void RegexTest::API_Replace() {
// Exercises RegexMatcher::replaceFirst() and replaceAll() with literal, empty,
// capture-group and escaped replacement text, followed by
// appendReplacement() / appendTail(). Results are checked with the
// REGEX_ASSERT / REGEX_ASSERT_FAIL macros.
1356 UErrorCode status=U_ZERO_ERROR;
// Literal pattern "abc", matched against input containing three occurrences.
1358 UnicodeString re("abc");
1359 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1361 UnicodeString data = ".abc..abc...abc..";
1362 // 012345678901234567
1363 RegexMatcher *matcher = pat->matcher(data, status);
1366 // Plain vanilla matches.
// replaceFirst() is expected to change only the first occurrence,
// replaceAll() every occurrence.
1369 dest = matcher->replaceFirst("yz", status);
1371 REGEX_ASSERT(dest == ".yz..abc...abc..");
1373 dest = matcher->replaceAll("yz", status);
1375 REGEX_ASSERT(dest == ".yz..yz...yz..");
1378 // Plain vanilla non-matches.
// With nothing to replace, both calls are expected to return the d2 text unchanged.
1380 UnicodeString d2 = ".abx..abx...abx..";
1382 dest = matcher->replaceFirst("yz", status);
1384 REGEX_ASSERT(dest == ".abx..abx...abx..");
1386 dest = matcher->replaceAll("yz", status);
1388 REGEX_ASSERT(dest == ".abx..abx...abx..");
1391 // Empty source string
1393 UnicodeString d3 = "";
1395 dest = matcher->replaceFirst("yz", status);
1397 REGEX_ASSERT(dest == "");
1399 dest = matcher->replaceAll("yz", status);
1401 REGEX_ASSERT(dest == "");
1404 // Empty substitution string
// An empty replacement is expected to delete the matched text.
1406 matcher->reset(data); // ".abc..abc...abc.."
1407 dest = matcher->replaceFirst("", status);
1409 REGEX_ASSERT(dest == "...abc...abc..");
1411 dest = matcher->replaceAll("", status);
1413 REGEX_ASSERT(dest == "........");
1416 // match whole string
1418 UnicodeString d4 = "abc";
1420 dest = matcher->replaceFirst("xyz", status);
1422 REGEX_ASSERT(dest == "xyz");
1424 dest = matcher->replaceAll("xyz", status);
1426 REGEX_ASSERT(dest == "xyz");
1429 // Capture Group, simple case
// Pattern "a(..)" against "abcdefg": capture group 1 is expected to hold "bc".
1431 UnicodeString re2("a(..)");
1432 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1434 UnicodeString d5 = "abcdefg";
1435 RegexMatcher *matcher2 = pat2->matcher(d5, status);
// "$1" in the replacement text refers to capture group 1.
1437 dest = matcher2->replaceFirst("$1$1", status);
1439 REGEX_ASSERT(dest == "bcbcdefg");
// A backslash-escaped '$' is expected to be emitted literally, while the
// unescaped "$1" is still substituted.
1441 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1443 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
// A '$' that is not followed by a group number is expected to produce a
// failure status; the status is cleared afterwards so later checks start clean.
1445 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1446 REGEX_ASSERT(U_FAILURE(status));
1447 status = U_ZERO_ERROR;
// Group number written with a supplementary-plane digit: after unescape(),
// '$' is followed by U+1D7CF, which is expected to select group 1 ("bc").
1449 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1450 replacement = replacement.unescape();
1451 dest = matcher2->replaceFirst(replacement, status);
1453 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
// The pattern has only one capture group, so "$5" is expected to fail with
// U_INDEX_OUTOFBOUNDS_ERROR.
1455 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1459 // Replacement String with \u hex escapes
// \u0043 in the replacement text is expected to come out as 'C'.
1462 UnicodeString src = "abc 1 abc 2 abc 3";
1463 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1464 matcher->reset(src);
1465 UnicodeString result = matcher->replaceAll(substitute, status);
1467 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
// Same check with an eight-digit \U escape for code point U+10000; the
// expected string is built by appending that code point explicitly.
1470 UnicodeString src = "abc !";
1471 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1472 matcher->reset(src);
1473 UnicodeString result = matcher->replaceAll(substitute, status);
1475 UnicodeString expected = UnicodeString("--");
1476 expected.append((UChar32)0x10000);
1477 expected.append("-- !");
1478 REGEX_ASSERT(result == expected);
1480 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() using a matcher for "ss(.*?)ee".
1485 status = U_ZERO_ERROR;
1486 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1487 RegexMatcher m("ss(.*?)ee", 0, status);
1489 UnicodeString result;
1491 // Multiple finds do NOT bump up the previous appendReplacement position.
1495 m.appendReplacement(result, "ooh", status);
1497 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1499 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1500 status = U_ZERO_ERROR;
1502 m.reset(10, status);
1505 m.appendReplacement(result, "ooh", status);
1507 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1509 // find() at interior of string, appendReplacement still starts at beginning.
1510 status = U_ZERO_ERROR;
1515 m.appendReplacement(result, "ooh", status);
1517 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
// appendTail() is expected to add the input text that follows the last match.
1519 m.appendTail(result);
1520 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1531 //---------------------------------------------------------------------------
1533 // API_Pattern Test that the API for class RegexPattern is
1534 // present and nominally working.
1536 //---------------------------------------------------------------------------
1537 void RegexTest::API_Pattern() {
// Exercises the RegexPattern API: default and copy construction, clone(),
// the == / != operators, flags(), the static matches() convenience function,
// split(), pattern() and the class-ID functions. Results are checked with
// REGEX_ASSERT.
1538 RegexPattern pata; // Test default constructor to not crash.
1541 REGEX_ASSERT(pata == patb);
1542 REGEX_ASSERT(pata == pata);
1544 UnicodeString re1("abc[a-l][m-z]");
1545 UnicodeString re2("def");
1546 UErrorCode status = U_ZERO_ERROR;
1549 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1550 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
// A compiled pattern equals itself and differs from a default-constructed one.
1552 REGEX_ASSERT(*pat1 == *pat1);
1553 REGEX_ASSERT(*pat1 != pata);
1557 REGEX_ASSERT(patb == *pat1);
// Copy constructor: the copy is expected to compare equal to its source.
1560 RegexPattern patc(*pat1);
1561 REGEX_ASSERT(patc == *pat1);
1562 REGEX_ASSERT(patb == patc);
// Note: this compares the two pointers, not the pattern objects they point to.
1563 REGEX_ASSERT(pat1 != pat2);
1565 REGEX_ASSERT(patb != patc);
1566 REGEX_ASSERT(patb == *pat2);
1568 // Compile with no flags.
1569 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1570 REGEX_ASSERT(*pat1a == *pat1);
1572 REGEX_ASSERT(pat1a->flags() == 0);
1574 // Patterns compiled with different flags should not compare equal
1575 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1578 REGEX_ASSERT(*pat1b != *pat1a);
1579 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1580 REGEX_ASSERT(pat1a->flags() == 0);
// clone(): the clone is expected to equal its source and differ from an
// unrelated pattern.
1584 RegexPattern *pat1c = pat1->clone();
1585 REGEX_ASSERT(*pat1c == *pat1);
1586 REGEX_ASSERT(*pat1c != *pat2);
1595 // Verify that a matcher created from a cloned pattern works.
1599 UErrorCode status = U_ZERO_ERROR;
1600 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1601 RegexPattern *pClone = pSource->clone();
1603 RegexMatcher *mFromClone = pClone->matcher(status);
// \p{L}+ is expected to find the two words of the input, then nothing more.
1605 UnicodeString s = "Hello World";
1606 mFromClone->reset(s);
1607 REGEX_ASSERT(mFromClone->find() == TRUE);
1608 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1609 REGEX_ASSERT(mFromClone->find() == TRUE);
1610 REGEX_ASSERT(mFromClone->group(status) == "World");
1611 REGEX_ASSERT(mFromClone->find() == FALSE);
1617 // matches convenience API
// The assertions below expect TRUE only when the pattern covers the whole
// input (".*u" is expected to be FALSE for "random input").
1619 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1621 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1623 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1625 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1627 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
// Called with a status that already holds a failure code: expected to return
// FALSE and leave that status value unchanged.
1629 status = U_INDEX_OUTOFBOUNDS_ERROR;
1630 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1631 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
// split() with a delimiter pattern of one or more spaces.
1637 status = U_ZERO_ERROR;
1638 pat1 = RegexPattern::compile(" +", pe, status);
1640 UnicodeString fields[10];
1643 n = pat1->split("Now is the time", fields, 10, status);
1646 REGEX_ASSERT(fields[0]=="Now");
1647 REGEX_ASSERT(fields[1]=="is");
1648 REGEX_ASSERT(fields[2]=="the");
1649 REGEX_ASSERT(fields[3]=="time");
1650 REGEX_ASSERT(fields[4]=="");
// Capacity of 2: the last available field is expected to receive the rest of
// the input.
1652 n = pat1->split("Now is the time", fields, 2, status);
1655 REGEX_ASSERT(fields[0]=="Now");
1656 REGEX_ASSERT(fields[1]=="is the time");
1657 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
// Capacity of 1: the whole input is expected in fields[0]; fields[1] is
// expected to keep the value "*".
1660 status = U_ZERO_ERROR;
1661 n = pat1->split("Now is the time", fields, 1, status);
1664 REGEX_ASSERT(fields[0]=="Now is the time");
1665 REGEX_ASSERT(fields[1]=="*");
1666 status = U_ZERO_ERROR;
// Delimiters at both ends of the input: an empty first field is expected.
1668 n = pat1->split(" Now is the time ", fields, 10, status);
1671 REGEX_ASSERT(fields[0]=="");
1672 REGEX_ASSERT(fields[1]=="Now");
1673 REGEX_ASSERT(fields[2]=="is");
1674 REGEX_ASSERT(fields[3]=="the");
1675 REGEX_ASSERT(fields[4]=="time");
1676 REGEX_ASSERT(fields[5]=="");
// Input consisting only of delimiter text.
1678 n = pat1->split(" ", fields, 10, status);
1681 REGEX_ASSERT(fields[0]=="");
1682 REGEX_ASSERT(fields[1]=="");
// Empty input: fields[0] is expected to keep the value "foo".
1685 n = pat1->split("", fields, 10, status);
1688 REGEX_ASSERT(fields[0]=="foo");
1692 // split, with a pattern with (capture)
// The text captured by the delimiter's group is expected to be returned as a
// field of its own, between the surrounding pieces.
1693 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1696 status = U_ZERO_ERROR;
1697 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1700 REGEX_ASSERT(fields[0]=="");
1701 REGEX_ASSERT(fields[1]=="a");
1702 REGEX_ASSERT(fields[2]=="Now is ");
1703 REGEX_ASSERT(fields[3]=="b");
1704 REGEX_ASSERT(fields[4]=="the time");
1705 REGEX_ASSERT(fields[5]=="c");
1706 REGEX_ASSERT(fields[6]=="");
1707 REGEX_ASSERT(status==U_ZERO_ERROR);
1709 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1712 REGEX_ASSERT(fields[0]==" ");
1713 REGEX_ASSERT(fields[1]=="a");
1714 REGEX_ASSERT(fields[2]=="Now is ");
1715 REGEX_ASSERT(fields[3]=="b");
1716 REGEX_ASSERT(fields[4]=="the time");
1717 REGEX_ASSERT(fields[5]=="c");
1718 REGEX_ASSERT(fields[6]=="");
1720 status = U_ZERO_ERROR;
// Capacity of 6: fields[6] lies beyond the capacity and is expected to keep
// the value "foo".
1722 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1725 REGEX_ASSERT(fields[0]==" ");
1726 REGEX_ASSERT(fields[1]=="a");
1727 REGEX_ASSERT(fields[2]=="Now is ");
1728 REGEX_ASSERT(fields[3]=="b");
1729 REGEX_ASSERT(fields[4]=="the time");
1730 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1731 REGEX_ASSERT(fields[6]=="foo");
1733 status = U_ZERO_ERROR;
// Capacity of 5: the last field is expected to hold the remaining input,
// including the unprocessed "<c>" delimiter.
1735 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1738 REGEX_ASSERT(fields[0]==" ");
1739 REGEX_ASSERT(fields[1]=="a");
1740 REGEX_ASSERT(fields[2]=="Now is ");
1741 REGEX_ASSERT(fields[3]=="b");
1742 REGEX_ASSERT(fields[4]=="the time<c>");
1743 REGEX_ASSERT(fields[5]=="foo");
1745 status = U_ZERO_ERROR;
// Same capacity, input without a trailing delimiter.
1747 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1750 REGEX_ASSERT(fields[0]==" ");
1751 REGEX_ASSERT(fields[1]=="a");
1752 REGEX_ASSERT(fields[2]=="Now is ");
1753 REGEX_ASSERT(fields[3]=="b");
1754 REGEX_ASSERT(fields[4]=="the time");
1755 REGEX_ASSERT(fields[5]=="foo");
1757 status = U_ZERO_ERROR;
// Capacity of 4: the capture "b" is not expected as a separate field; the
// last field holds the remaining text.
1758 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1761 REGEX_ASSERT(fields[0]==" ");
1762 REGEX_ASSERT(fields[1]=="a");
1763 REGEX_ASSERT(fields[2]=="Now is ");
1764 REGEX_ASSERT(fields[3]=="the time<c>");
1765 status = U_ZERO_ERROR;
// Delimiter that is itself one captured character ('-' or ',').
1768 pat1 = RegexPattern::compile("([-,])", pe, status);
1770 n = pat1->split("1-10,20", fields, 10, status);
1773 REGEX_ASSERT(fields[0]=="1");
1774 REGEX_ASSERT(fields[1]=="-");
1775 REGEX_ASSERT(fields[2]=="10");
1776 REGEX_ASSERT(fields[3]==",");
1777 REGEX_ASSERT(fields[4]=="20");
1780 // Test split of string with empty trailing fields
1781 pat1 = RegexPattern::compile(",", pe, status);
1783 n = pat1->split("a,b,c,", fields, 10, status);
1786 REGEX_ASSERT(fields[0]=="a");
1787 REGEX_ASSERT(fields[1]=="b");
1788 REGEX_ASSERT(fields[2]=="c");
1789 REGEX_ASSERT(fields[3]=="");
1791 n = pat1->split("a,,,", fields, 10, status);
1794 REGEX_ASSERT(fields[0]=="a");
1795 REGEX_ASSERT(fields[1]=="");
1796 REGEX_ASSERT(fields[2]=="");
1797 REGEX_ASSERT(fields[3]=="");
1800 // Split Separator with zero length match.
1801 pat1 = RegexPattern::compile(":?", pe, status);
1803 n = pat1->split("abc", fields, 10, status);
1806 REGEX_ASSERT(fields[0]=="");
1807 REGEX_ASSERT(fields[1]=="a");
1808 REGEX_ASSERT(fields[2]=="b");
1809 REGEX_ASSERT(fields[3]=="c");
1810 REGEX_ASSERT(fields[4]=="");
1815 // RegexPattern::pattern()
// A default-constructed pattern is expected to report an empty pattern
// string; a compiled one reports the source text it was compiled from.
1817 pat1 = new RegexPattern();
1818 REGEX_ASSERT(pat1->pattern() == "");
1821 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1823 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1828 // classID functions
// Dynamic and static class IDs are expected to agree within a class, be
// non-NULL, and differ between RegexPattern and RegexMatcher.
1830 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1832 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1833 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1834 UnicodeString Hello("Hello, world.");
1835 RegexMatcher *m = pat1->matcher(Hello, status);
1836 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1837 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1838 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1844 //---------------------------------------------------------------------------
1846 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1847 // is present and working, but excluding functions
1848 // implementing replace operations.
1850 //---------------------------------------------------------------------------
1851 void RegexTest::API_Match_UTF8() {
1853 UErrorCode status=U_ZERO_ERROR;
1857 // Debug - slide failing test cases early
1866 // Simple pattern compilation
1869 UText re = UTEXT_INITIALIZER;
1870 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1871 REGEX_VERBOSE_TEXT(&re);
1873 pat2 = RegexPattern::compile(&re, flags, pe, status);
1876 UText input1 = UTEXT_INITIALIZER;
1877 UText input2 = UTEXT_INITIALIZER;
1878 UText empty = UTEXT_INITIALIZER;
1879 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1880 REGEX_VERBOSE_TEXT(&input1);
1881 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1882 REGEX_VERBOSE_TEXT(&input2);
1883 utext_openUChars(&empty, NULL, 0, &status);
1885 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1886 int32_t input2Len = strlen("not abc");
1890 // Matcher creation and reset.
1892 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1894 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1895 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1896 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1898 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1899 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1900 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1902 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1903 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1905 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1906 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1909 // reset(pos, status)
1912 m1->reset(4, status);
1914 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1915 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1917 m1->reset(-1, status);
1918 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1919 status = U_ZERO_ERROR;
1921 m1->reset(0, status);
1923 status = U_ZERO_ERROR;
1925 m1->reset(input1Len-1, status);
1927 status = U_ZERO_ERROR;
1929 m1->reset(input1Len, status);
1931 status = U_ZERO_ERROR;
1933 m1->reset(input1Len+1, status);
1934 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1935 status = U_ZERO_ERROR;
1938 // match(pos, status)
1941 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1943 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1945 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1946 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1947 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1948 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1950 // Match() at end of string should fail, but should not
1952 status = U_ZERO_ERROR;
1953 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1956 // Match beyond end of string should fail with an error.
1957 status = U_ZERO_ERROR;
1958 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1959 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1961 // Successful match at end of string.
1963 status = U_ZERO_ERROR;
1964 RegexMatcher m("A?", 0, status); // will match zero length string.
1967 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1970 REGEX_ASSERT(m.matches(0, status) == TRUE);
1976 // lookingAt(pos, status)
1978 status = U_ZERO_ERROR;
1979 m1->reset(&input2); // "not abc"
1980 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1981 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1982 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1983 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1984 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1985 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1986 status = U_ZERO_ERROR;
1987 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1989 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1990 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1996 utext_close(&input1);
1997 utext_close(&input2);
1998 utext_close(&empty);
2004 // RegexMatcher::start();
2005 // RegexMatcher::end();
2006 // RegexMatcher::groupCount();
2011 UErrorCode status=U_ZERO_ERROR;
2012 UText re=UTEXT_INITIALIZER;
2013 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2014 utext_openUTF8(&re, str_01234567_pat, -1, &status);
2016 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2019 UText input = UTEXT_INITIALIZER;
2020 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2021 utext_openUTF8(&input, str_0123456789, -1, &status);
2023 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2025 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2026 static const int32_t matchStarts[] = {0, 2, 4, 8};
2027 static const int32_t matchEnds[] = {10, 8, 6, 10};
2029 for (i=0; i<4; i++) {
2030 int32_t actualStart = matcher->start(i, status);
2032 if (actualStart != matchStarts[i]) {
2033 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2034 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2036 int32_t actualEnd = matcher->end(i, status);
2038 if (actualEnd != matchEnds[i]) {
2039 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2040 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2044 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2045 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2047 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2048 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2050 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2052 matcher->lookingAt(status);
2055 UText destText = UTEXT_INITIALIZER;
2056 utext_openUnicodeString(&destText, &dest, &status);
2058 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2059 // Test shallow-clone API
2061 result = matcher->group((UText *)NULL, group_len, status);
2063 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2064 utext_close(result);
2065 result = matcher->group(0, &destText, group_len, status);
2067 REGEX_ASSERT(result == &destText);
2068 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2069 // destText is now immutable, reopen it
2070 utext_close(&destText);
2071 utext_openUnicodeString(&destText, &dest, &status);
2074 result = matcher->group(0, NULL, length, status);
2076 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2077 utext_close(result);
2078 result = matcher->group(0, &destText, length, status);
2080 REGEX_ASSERT(result == &destText);
2081 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2082 REGEX_ASSERT(length == 10);
2083 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2085 // Capture Group 1 == "234567"
2086 result = matcher->group(1, NULL, length, status);
2088 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2089 REGEX_ASSERT(length == 6);
2090 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2091 utext_close(result);
2093 result = matcher->group(1, &destText, length, status);
2095 REGEX_ASSERT(result == &destText);
2096 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2097 REGEX_ASSERT(length == 6);
2098 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2099 utext_close(result);
2101 // Capture Group 2 == "45"
2102 result = matcher->group(2, NULL, length, status);
2104 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2105 REGEX_ASSERT(length == 2);
2106 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2107 utext_close(result);
2109 result = matcher->group(2, &destText, length, status);
2111 REGEX_ASSERT(result == &destText);
2112 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2113 REGEX_ASSERT(length == 2);
2114 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2115 utext_close(result);
2117 // Capture Group 3 == "89"
2118 result = matcher->group(3, NULL, length, status);
2120 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2121 REGEX_ASSERT(length == 2);
2122 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2123 utext_close(result);
2125 result = matcher->group(3, &destText, length, status);
2127 REGEX_ASSERT(result == &destText);
2128 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2129 REGEX_ASSERT(length == 2);
2130 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2131 utext_close(result);
2133 // Capture Group number out of range.
2134 status = U_ZERO_ERROR;
2135 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136 status = U_ZERO_ERROR;
2137 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138 status = U_ZERO_ERROR;
2140 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2145 utext_close(&destText);
2146 utext_close(&input);
2156 UErrorCode status=U_ZERO_ERROR;
2157 UText re=UTEXT_INITIALIZER;
2158 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2159 utext_openUTF8(&re, str_abc, -1, &status);
2161 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2163 UText input = UTEXT_INITIALIZER;
2164 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2165 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2166 // 012345678901234567
2168 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2170 REGEX_ASSERT(matcher->find());
2171 REGEX_ASSERT(matcher->start(status) == 1);
2172 REGEX_ASSERT(matcher->find());
2173 REGEX_ASSERT(matcher->start(status) == 6);
2174 REGEX_ASSERT(matcher->find());
2175 REGEX_ASSERT(matcher->start(status) == 12);
2176 REGEX_ASSERT(matcher->find() == FALSE);
2177 REGEX_ASSERT(matcher->find() == FALSE);
2180 REGEX_ASSERT(matcher->find());
2181 REGEX_ASSERT(matcher->start(status) == 1);
2183 REGEX_ASSERT(matcher->find(0, status));
2184 REGEX_ASSERT(matcher->start(status) == 1);
2185 REGEX_ASSERT(matcher->find(1, status));
2186 REGEX_ASSERT(matcher->start(status) == 1);
2187 REGEX_ASSERT(matcher->find(2, status));
2188 REGEX_ASSERT(matcher->start(status) == 6);
2189 REGEX_ASSERT(matcher->find(12, status));
2190 REGEX_ASSERT(matcher->start(status) == 12);
2191 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2192 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2193 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2194 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2196 status = U_ZERO_ERROR;
2197 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198 status = U_ZERO_ERROR;
2199 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2201 REGEX_ASSERT(matcher->groupCount() == 0);
2206 utext_close(&input);
2212 // find, with \G in pattern (true if at the end of a previous match).
2217 UErrorCode status=U_ZERO_ERROR;
2218 UText re=UTEXT_INITIALIZER;
2219 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2220 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2222 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2225 UText input = UTEXT_INITIALIZER;
2226 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2227 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2228 // 012345678901234567
2230 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2232 REGEX_ASSERT(matcher->find());
2233 REGEX_ASSERT(matcher->start(status) == 0);
2234 REGEX_ASSERT(matcher->start(1, status) == -1);
2235 REGEX_ASSERT(matcher->start(2, status) == 1);
2237 REGEX_ASSERT(matcher->find());
2238 REGEX_ASSERT(matcher->start(status) == 4);
2239 REGEX_ASSERT(matcher->start(1, status) == 4);
2240 REGEX_ASSERT(matcher->start(2, status) == -1);
2246 utext_close(&input);
2251 // find with zero length matches, match position should bump ahead
2252 // to prevent loops.
2256 UErrorCode status=U_ZERO_ERROR;
2257 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2258 // using an always-true look-ahead.
2260 UText s = UTEXT_INITIALIZER;
2261 utext_openUTF8(&s, " ", -1, &status);
2264 if (m.find() == FALSE) {
2267 REGEX_ASSERT(m.start(status) == i);
2268 REGEX_ASSERT(m.end(status) == i);
2272 // Check that the bump goes over characters outside the BMP OK
2273 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2274 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2275 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2278 if (m.find() == FALSE) {
2281 REGEX_ASSERT(m.start(status) == i);
2282 REGEX_ASSERT(m.end(status) == i);
2284 REGEX_ASSERT(i==20);
2289 // find() loop breaking test.
2290 // with pattern of /.?/, should see a series of one char matches, then a single
2291 // match of zero length at the end of the input string.
2293 UErrorCode status=U_ZERO_ERROR;
2294 RegexMatcher m(".?", 0, status);
2296 UText s = UTEXT_INITIALIZER;
2297 utext_openUTF8(&s, " ", -1, &status);
2300 if (m.find() == FALSE) {
2303 REGEX_ASSERT(m.start(status) == i);
2304 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2313 // Matchers with no input string behave as if they had an empty input string.
2317 UErrorCode status = U_ZERO_ERROR;
2318 RegexMatcher m(".?", 0, status);
2320 REGEX_ASSERT(m.find());
2321 REGEX_ASSERT(m.start(status) == 0);
2322 REGEX_ASSERT(m.input() == "");
2325 UErrorCode status = U_ZERO_ERROR;
2326 RegexPattern *p = RegexPattern::compile(".", 0, status);
2327 RegexMatcher *m = p->matcher(status);
2330 REGEX_ASSERT(m->find() == FALSE);
2331 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2340 UErrorCode status = U_ZERO_ERROR;
2341 UText testPattern = UTEXT_INITIALIZER;
2342 UText testText = UTEXT_INITIALIZER;
2343 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2344 REGEX_VERBOSE_TEXT(&testPattern);
2345 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2346 REGEX_VERBOSE_TEXT(&testText);
2348 RegexMatcher m(&testPattern, &testText, 0, status);
2350 REGEX_ASSERT(m.regionStart() == 0);
2351 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2352 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2353 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2355 m.region(2,4, status);
2357 REGEX_ASSERT(m.matches(status));
2358 REGEX_ASSERT(m.start(status)==2);
2359 REGEX_ASSERT(m.end(status)==4);
2363 REGEX_ASSERT(m.regionStart() == 0);
2364 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2366 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2367 REGEX_VERBOSE_TEXT(&testText);
2369 REGEX_ASSERT(m.regionStart() == 0);
2370 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2372 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2373 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375 REGEX_ASSERT(&m == &m.reset());
2376 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2378 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380 REGEX_ASSERT(&m == &m.reset());
2381 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2383 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2384 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386 REGEX_ASSERT(&m == &m.reset());
2387 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2389 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391 REGEX_ASSERT(&m == &m.reset());
2392 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2394 utext_close(&testText);
2395 utext_close(&testPattern);
2399 // hitEnd() and requireEnd()
2402 UErrorCode status = U_ZERO_ERROR;
2403 UText testPattern = UTEXT_INITIALIZER;
2404 UText testText = UTEXT_INITIALIZER;
2405 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2406 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2407 utext_openUTF8(&testPattern, str_, -1, &status);
2408 utext_openUTF8(&testText, str_aabb, -1, &status);
2410 RegexMatcher m1(&testPattern, &testText, 0, status);
2411 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2412 REGEX_ASSERT(m1.hitEnd() == TRUE);
2413 REGEX_ASSERT(m1.requireEnd() == FALSE);
2416 status = U_ZERO_ERROR;
2417 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2418 utext_openUTF8(&testPattern, str_a, -1, &status);
2419 RegexMatcher m2(&testPattern, &testText, 0, status);
2420 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2421 REGEX_ASSERT(m2.hitEnd() == FALSE);
2422 REGEX_ASSERT(m2.requireEnd() == FALSE);
2425 status = U_ZERO_ERROR;
2426 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2427 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2428 RegexMatcher m3(&testPattern, &testText, 0, status);
2429 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2430 REGEX_ASSERT(m3.hitEnd() == TRUE);
2431 REGEX_ASSERT(m3.requireEnd() == TRUE);
2434 utext_close(&testText);
2435 utext_close(&testPattern);
2440 //---------------------------------------------------------------------------
2442 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2443 // Replace family of functions.
2445 //---------------------------------------------------------------------------
// API_Replace_UTF8: drives RegexMatcher::replaceFirst(), replaceAll(),
// appendReplacement() and appendTail() through their UText overloads, using
// UTF-8 backed UTexts for the pattern, the input and the replacement text.
//
// Most scenarios are checked twice: first with a NULL destination, where the
// UText returned by the call is compared and then closed by the test, and then
// with &destText as the destination, where the returned pointer is asserted
// to be &destText itself.
//
// Test strings are written as explicit byte arrays with the text in a trailing
// comment -- presumably to keep the bytes independent of the compiler's
// execution character set (the file header warns about ASCII/EBCDIC
// assumptions).
//
// NOTE(review): the embedded source line numbers skip values, so some lines
// of this function (for example the declarations of flags, pe, dest and
// result, status checks and closing braces) are not visible in this listing.
2446 void RegexTest::API_Replace_UTF8() {
2452 UErrorCode status=U_ZERO_ERROR;
2454 UText re=UTEXT_INITIALIZER;
2455 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2456 REGEX_VERBOSE_TEXT(&re);
2457 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2460 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2461 // 012345678901234567
2462 UText dataText = UTEXT_INITIALIZER;
2463 utext_openUTF8(&dataText, data, -1, &status);
2465 REGEX_VERBOSE_TEXT(&dataText);
// Create a matcher from the compiled pattern and point it at dataText.
2466 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2469 // Plain vanilla matches.
2472 UText destText = UTEXT_INITIALIZER;
// destText is opened over the UnicodeString dest; it is the caller-supplied
// destination used by the second call of each pair below.
2473 utext_openUnicodeString(&destText, &dest, &status);
2476 UText replText = UTEXT_INITIALIZER;
2478 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2479 utext_openUTF8(&replText, str_yz, -1, &status);
2480 REGEX_VERBOSE_TEXT(&replText);
2481 result = matcher->replaceFirst(&replText, NULL, status);
2483 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2484 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2485 utext_close(result);
2486 result = matcher->replaceFirst(&replText, &destText, status);
2488 REGEX_ASSERT(result == &destText);
2489 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2491 result = matcher->replaceAll(&replText, NULL, status);
2493 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2494 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2495 utext_close(result);
// Empty destText (replace its entire native range with nothing) before it is
// reused as a destination.
2497 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2498 result = matcher->replaceAll(&replText, &destText, status);
2500 REGEX_ASSERT(result == &destText);
2501 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2504 // Plain vanilla non-matches.
// The input contains "abx" where the pattern needs "abc", so every call is
// expected to return the input unchanged.
2506 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2507 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2508 matcher->reset(&dataText);
2510 result = matcher->replaceFirst(&replText, NULL, status);
2512 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2513 utext_close(result);
2514 result = matcher->replaceFirst(&replText, &destText, status);
2516 REGEX_ASSERT(result == &destText);
2517 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2519 result = matcher->replaceAll(&replText, NULL, status);
2521 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2522 utext_close(result);
2523 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2524 result = matcher->replaceAll(&replText, &destText, status);
2526 REGEX_ASSERT(result == &destText);
2527 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2530 // Empty source string
// Re-open dataText over a zero-length UTF-8 buffer (NULL pointer, length 0).
2532 utext_openUTF8(&dataText, NULL, 0, &status);
2533 matcher->reset(&dataText);
2535 result = matcher->replaceFirst(&replText, NULL, status);
2537 REGEX_ASSERT_UTEXT_UTF8("", result);
2538 utext_close(result);
2539 result = matcher->replaceFirst(&replText, &destText, status);
2541 REGEX_ASSERT(result == &destText);
2542 REGEX_ASSERT_UTEXT_UTF8("", result);
2544 result = matcher->replaceAll(&replText, NULL, status);
2546 REGEX_ASSERT_UTEXT_UTF8("", result);
2547 utext_close(result);
2548 result = matcher->replaceAll(&replText, &destText, status);
2550 REGEX_ASSERT(result == &destText);
2551 REGEX_ASSERT_UTEXT_UTF8("", result);
2554 // Empty substitution string
2556 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2557 matcher->reset(&dataText);
// Re-open replText over a zero-length UTF-8 buffer; the expected strings
// below show each match simply being removed.
2559 utext_openUTF8(&replText, NULL, 0, &status);
2560 result = matcher->replaceFirst(&replText, NULL, status);
2562 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2563 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2564 utext_close(result);
2565 result = matcher->replaceFirst(&replText, &destText, status);
2567 REGEX_ASSERT(result == &destText);
2568 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2570 result = matcher->replaceAll(&replText, NULL, status);
2572 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2573 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2574 utext_close(result);
2575 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2576 result = matcher->replaceAll(&replText, &destText, status);
2578 REGEX_ASSERT(result == &destText);
2579 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2582 // match whole string
2584 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2585 utext_openUTF8(&dataText, str_abc, -1, &status);
2586 matcher->reset(&dataText);
2588 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2589 utext_openUTF8(&replText, str_xyz, -1, &status);
2590 result = matcher->replaceFirst(&replText, NULL, status);
2592 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2593 utext_close(result);
2594 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2595 result = matcher->replaceFirst(&replText, &destText, status);
2597 REGEX_ASSERT(result == &destText);
2598 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2600 result = matcher->replaceAll(&replText, NULL, status);
2602 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2603 utext_close(result);
2604 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2605 result = matcher->replaceAll(&replText, &destText, status);
2607 REGEX_ASSERT(result == &destText);
2608 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2611 // Capture Group, simple case
// A second pattern, a(..), with one capture group, is compiled and matched
// against "abcdefg"; the expected strings below show group 1 being "bc".
2613 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2614 utext_openUTF8(&re, str_add, -1, &status);
2615 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2618 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2619 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2620 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2623 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2624 utext_openUTF8(&replText, str_11, -1, &status);
2625 result = matcher2->replaceFirst(&replText, NULL, status);
2627 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2628 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2629 utext_close(result);
2630 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2631 result = matcher2->replaceFirst(&replText, &destText, status);
2633 REGEX_ASSERT(result == &destText);
2634 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
// A backslash-escaped dollar sign in the replacement is expected to come out
// as a literal "$1", while the unescaped $1 expands to capture group 1.
2636 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2637 utext_openUTF8(&replText, str_v, -1, &status);
2638 REGEX_VERBOSE_TEXT(&replText);
2639 result = matcher2->replaceFirst(&replText, NULL, status);
2641 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2642 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2643 utext_close(result);
2644 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2645 result = matcher2->replaceFirst(&replText, &destText, status);
2647 REGEX_ASSERT(result == &destText);
2648 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
// Escaped dollar signs that are not followed by a group number are expected
// to come out as plain "$" characters.
2650 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2651 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2652 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2653 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2654 result = matcher2->replaceFirst(&replText, NULL, status);
2656 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2657 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2658 utext_close(result);
2659 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2660 result = matcher2->replaceFirst(&replText, &destText, status);
2662 REGEX_ASSERT(result == &destText);
2663 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2665 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2666 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2667 // 012345678901234567890123456
// Overwrite the "xxxx" placeholder (bytes 22..25) with F0 9D 9F 8F, the UTF-8
// encoding of U+1D7CF. The expected result below shows "$" followed by that
// supplementary-plane digit being substituted like $1.
2668 supplDigitChars[22] = 0xF0;
2669 supplDigitChars[23] = 0x9D;
2670 supplDigitChars[24] = 0x9F;
2671 supplDigitChars[25] = 0x8F;
2672 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2674 result = matcher2->replaceFirst(&replText, NULL, status);
2676 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2677 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2678 utext_close(result);
2679 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2680 result = matcher2->replaceFirst(&replText, &destText, status);
2682 REGEX_ASSERT(result == &destText);
2683 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
// $5 names a capture group that the one-group pattern a(..) does not have.
// Both call forms are checked with REGEX_ASSERT_FAIL against
// U_INDEX_OUTOFBOUNDS_ERROR (macro defined elsewhere; presumably it asserts
// that the expression leaves that error code in status -- confirm).
2684 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
2685 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2686 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2687 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2688 utext_close(result);
2689 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2690 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2691 REGEX_ASSERT(result == &destText);
2692 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2695 // Replacement String with \u hex escapes
// The replacement text holds the six characters of a backslash-u escape for
// U+0043; the expected output below contains the single letter "C" instead.
2698 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2699 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2700 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2701 utext_openUTF8(&replText, str_u0043, -1, &status);
2702 matcher->reset(&dataText);
2704 result = matcher->replaceAll(&replText, NULL, status);
2706 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2707 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2708 utext_close(result);
2709 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2710 result = matcher->replaceAll(&replText, &destText, status);
2712 REGEX_ASSERT(result == &destText);
2713 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
// NOTE(review): str_abc is declared a second time here; presumably each
// scenario sits in its own brace scope whose braces are not visible in this
// listing -- confirm.
2716 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2717 utext_openUTF8(&dataText, str_abc, -1, &status);
2718 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2719 utext_openUTF8(&replText, str_U00010000, -1, &status);
2720 matcher->reset(&dataText);
// NOTE(review): the "xxxx" placeholder bytes of expected are presumably
// overwritten with the UTF-8 form of U+10000 on lines not shown here (the
// embedded numbers jump from 2722 to 2729) -- confirm.
2722 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2729 result = matcher->replaceAll(&replText, NULL, status);
2731 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2732 utext_close(result);
2733 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2734 result = matcher->replaceAll(&replText, &destText, status);
2736 REGEX_ASSERT(result == &destText);
2737 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2739 // TODO: need more thorough testing of capture substitutions.
// appendReplacement() / appendTail() checks: pattern ss(.*?)ee, replacement
// "ooh", with output accumulated in resultText (opened over the local
// UnicodeString named result).
2744 status = U_ZERO_ERROR;
2745 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2746 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2747 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2748 utext_openUTF8(&re, str_ssee, -1, &status);
2749 utext_openUTF8(&dataText, str_blah, -1, &status);
2750 utext_openUTF8(&replText, str_ooh, -1, &status);
2752 RegexMatcher m(&re, 0, status);
2755 UnicodeString result;
2756 UText resultText = UTEXT_INITIALIZER;
2757 utext_openUnicodeString(&resultText, &result, &status);
2759 // Multiple finds do NOT bump up the previous appendReplacement position.
2763 m.appendReplacement(&resultText, &replText, status);
2765 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2766 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2768 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2769 status = U_ZERO_ERROR;
2771 utext_openUnicodeString(&resultText, &result, &status);
2772 m.reset(10, status);
2775 m.appendReplacement(&resultText, &replText, status);
2777 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2778 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2780 // find() at interior of string, appendReplacement still starts at beginning.
2781 status = U_ZERO_ERROR;
2783 utext_openUnicodeString(&resultText, &result, &status);
2787 m.appendReplacement(&resultText, &replText, status);
2789 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2790 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
// appendTail() is expected to add the input text that follows the last
// match (" fin"), as the expected string below shows.
2792 m.appendTail(&resultText, status);
2793 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2794 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2796 utext_close(&resultText);
// Close the stack-allocated UTexts used throughout the function.
2804 utext_close(&dataText);
2805 utext_close(&replText);
2806 utext_close(&destText);
2811 //---------------------------------------------------------------------------
2813 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2814 // present and nominally working.
2816 //---------------------------------------------------------------------------
// API_Pattern_UTF8: smoke test for the RegexPattern API using patterns that
// are supplied as UTF-8 UTexts. The visible assertions cover default
// construction, equality and inequality, copy construction, compile() with
// and without flags, clone(), the static matches() convenience function,
// split() (UnicodeString and UText forms), and pattern()/patternText().
//
// NOTE(review): the embedded source line numbers skip values, so some lines
// of this function (for example the declarations of patb, pe and n, several
// assignments, status checks, deletes and closing braces) are not visible in
// this listing.
2817 void RegexTest::API_Pattern_UTF8() {
2818 RegexPattern pata; // Test default constructor to not crash.
2821 REGEX_ASSERT(pata == patb);
2822 REGEX_ASSERT(pata == pata);
2824 UText re1 = UTEXT_INITIALIZER;
2825 UText re2 = UTEXT_INITIALIZER;
2826 UErrorCode status = U_ZERO_ERROR;
2829 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2830 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2831 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2832 utext_openUTF8(&re2, str_def, -1, &status);
2834 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2835 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2837 REGEX_ASSERT(*pat1 == *pat1);
2838 REGEX_ASSERT(*pat1 != pata);
2842 REGEX_ASSERT(patb == *pat1);
// Copy construction: the copy must compare equal to its source.
2845 RegexPattern patc(*pat1);
2846 REGEX_ASSERT(patc == *pat1);
2847 REGEX_ASSERT(patb == patc);
// NOTE(review): this compares the two pointers, not the pattern objects.
2848 REGEX_ASSERT(pat1 != pat2);
2850 REGEX_ASSERT(patb != patc);
2851 REGEX_ASSERT(patb == *pat2);
2853 // Compile with no flags.
2854 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2855 REGEX_ASSERT(*pat1a == *pat1);
2857 REGEX_ASSERT(pat1a->flags() == 0);
2859 // Compile with different flags should be not equal
2860 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2863 REGEX_ASSERT(*pat1b != *pat1a);
2864 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2865 REGEX_ASSERT(pat1a->flags() == 0);
// clone(): the clone equals its source and differs from an unrelated pattern.
2869 RegexPattern *pat1c = pat1->clone();
2870 REGEX_ASSERT(*pat1c == *pat1);
2871 REGEX_ASSERT(*pat1c != *pat2);
2883 // Verify that a matcher created from a cloned pattern works.
2887 UErrorCode status = U_ZERO_ERROR;
2888 UText pattern = UTEXT_INITIALIZER;
2889 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2890 utext_openUTF8(&pattern, str_pL, -1, &status);
2892 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2893 RegexPattern *pClone = pSource->clone();
2895 RegexMatcher *mFromClone = pClone->matcher(status);
2898 UText input = UTEXT_INITIALIZER;
2899 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2900 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2901 mFromClone->reset(&input);
// The letter-run pattern should find exactly the two words and then stop.
2902 REGEX_ASSERT(mFromClone->find() == TRUE);
2903 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2904 REGEX_ASSERT(mFromClone->find() == TRUE);
2905 REGEX_ASSERT(mFromClone->group(status) == "World");
2906 REGEX_ASSERT(mFromClone->find() == FALSE);
2910 utext_close(&input);
2911 utext_close(&pattern);
2915 // matches convenience API
2918 UErrorCode status = U_ZERO_ERROR;
2919 UText pattern = UTEXT_INITIALIZER;
2920 UText input = UTEXT_INITIALIZER;
2922 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2923 utext_openUTF8(&input, str_randominput, -1, &status);
2925 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2926 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2927 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
// NOTE(review): from here on the matches() calls pass string literals rather
// than &pattern / &input, so the UTexts re-opened just before each assertion
// are not what is being matched -- confirm whether the UText overload was
// intended.
2930 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2931 utext_openUTF8(&pattern, str_abc, -1, &status);
2932 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2935 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2936 utext_openUTF8(&pattern, str_nput, -1, &status);
2937 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2940 utext_openUTF8(&pattern, str_randominput, -1, &status);
2941 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2944 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2945 utext_openUTF8(&pattern, str_u, -1, &status);
2946 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2949 utext_openUTF8(&input, str_abc, -1, &status);
2950 utext_openUTF8(&pattern, str_abc, -1, &status);
// With status already holding a failure code, matches() is expected to
// return FALSE and leave that code untouched.
2951 status = U_INDEX_OUTOFBOUNDS_ERROR;
2952 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2953 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2955 utext_close(&input);
2956 utext_close(&pattern);
// split() tests with a whitespace delimiter pattern. Slots of fields[] past
// the pieces produced by a call are checked against values left by an earlier
// call or against sentinel strings.
// NOTE(review): the "foo" sentinel assignments and any assertions on the
// returned count n are not visible in this listing.
2963 status = U_ZERO_ERROR;
2964 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2965 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2966 pat1 = RegexPattern::compile(&re1, pe, status);
2968 UnicodeString fields[10];
2971 n = pat1->split("Now is the time", fields, 10, status);
2974 REGEX_ASSERT(fields[0]=="Now");
2975 REGEX_ASSERT(fields[1]=="is");
2976 REGEX_ASSERT(fields[2]=="the");
2977 REGEX_ASSERT(fields[3]=="time");
2978 REGEX_ASSERT(fields[4]=="");
// With an output capacity of 2, the second field receives the whole remainder.
2980 n = pat1->split("Now is the time", fields, 2, status);
2983 REGEX_ASSERT(fields[0]=="Now");
2984 REGEX_ASSERT(fields[1]=="is the time");
2985 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2988 status = U_ZERO_ERROR;
2989 n = pat1->split("Now is the time", fields, 1, status);
2992 REGEX_ASSERT(fields[0]=="Now is the time");
2993 REGEX_ASSERT(fields[1]=="*");
2994 status = U_ZERO_ERROR;
2996 n = pat1->split(" Now is the time ", fields, 10, status);
2999 REGEX_ASSERT(fields[0]=="");
3000 REGEX_ASSERT(fields[1]=="Now");
3001 REGEX_ASSERT(fields[2]=="is");
3002 REGEX_ASSERT(fields[3]=="the");
3003 REGEX_ASSERT(fields[4]=="time");
3004 REGEX_ASSERT(fields[5]=="");
3005 REGEX_ASSERT(fields[6]=="");
3008 n = pat1->split(" ", fields, 10, status);
3011 REGEX_ASSERT(fields[0]=="");
3012 REGEX_ASSERT(fields[1]=="");
3013 REGEX_ASSERT(fields[2]=="*");
3016 n = pat1->split("", fields, 10, status);
3019 REGEX_ASSERT(fields[0]=="foo");
3023 // split, with a pattern with (capture)
// The expected fields below show the text captured by the group ("a", "b",
// "c") being emitted as fields between the surrounding pieces.
3024 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3025 pat1 = RegexPattern::compile(&re1, pe, status);
3028 status = U_ZERO_ERROR;
3029 fields[6] = fields[7] = "*";
3030 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3033 REGEX_ASSERT(fields[0]=="");
3034 REGEX_ASSERT(fields[1]=="a");
3035 REGEX_ASSERT(fields[2]=="Now is ");
3036 REGEX_ASSERT(fields[3]=="b");
3037 REGEX_ASSERT(fields[4]=="the time");
3038 REGEX_ASSERT(fields[5]=="c");
3039 REGEX_ASSERT(fields[6]=="");
3040 REGEX_ASSERT(fields[7]=="*");
3041 REGEX_ASSERT(status==U_ZERO_ERROR);
3043 fields[6] = fields[7] = "*";
3044 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3047 REGEX_ASSERT(fields[0]==" ");
3048 REGEX_ASSERT(fields[1]=="a");
3049 REGEX_ASSERT(fields[2]=="Now is ");
3050 REGEX_ASSERT(fields[3]=="b");
3051 REGEX_ASSERT(fields[4]=="the time");
3052 REGEX_ASSERT(fields[5]=="c");
3053 REGEX_ASSERT(fields[6]=="");
3054 REGEX_ASSERT(fields[7]=="*");
// The remaining capture-pattern cases shrink the output capacity (6, 5, 5, 4)
// and check what lands in the last usable slot.
3056 status = U_ZERO_ERROR;
3058 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3061 REGEX_ASSERT(fields[0]==" ");
3062 REGEX_ASSERT(fields[1]=="a");
3063 REGEX_ASSERT(fields[2]=="Now is ");
3064 REGEX_ASSERT(fields[3]=="b");
3065 REGEX_ASSERT(fields[4]=="the time");
3066 REGEX_ASSERT(fields[5]==" ");
3067 REGEX_ASSERT(fields[6]=="foo");
3069 status = U_ZERO_ERROR;
3071 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3074 REGEX_ASSERT(fields[0]==" ");
3075 REGEX_ASSERT(fields[1]=="a");
3076 REGEX_ASSERT(fields[2]=="Now is ");
3077 REGEX_ASSERT(fields[3]=="b");
3078 REGEX_ASSERT(fields[4]=="the time<c>");
3079 REGEX_ASSERT(fields[5]=="foo");
3081 status = U_ZERO_ERROR;
3083 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3086 REGEX_ASSERT(fields[0]==" ");
3087 REGEX_ASSERT(fields[1]=="a");
3088 REGEX_ASSERT(fields[2]=="Now is ");
3089 REGEX_ASSERT(fields[3]=="b");
3090 REGEX_ASSERT(fields[4]=="the time");
3091 REGEX_ASSERT(fields[5]=="foo");
3093 status = U_ZERO_ERROR;
3094 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3097 REGEX_ASSERT(fields[0]==" ");
3098 REGEX_ASSERT(fields[1]=="a");
3099 REGEX_ASSERT(fields[2]=="Now is ");
3100 REGEX_ASSERT(fields[3]=="the time<c>");
3101 status = U_ZERO_ERROR;
// Delimiter pattern that is itself a capture group: the matched "-" and ","
// appear as fields of their own.
3104 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3105 pat1 = RegexPattern::compile(&re1, pe, status);
3107 n = pat1->split("1-10,20", fields, 10, status);
3110 REGEX_ASSERT(fields[0]=="1");
3111 REGEX_ASSERT(fields[1]=="-");
3112 REGEX_ASSERT(fields[2]=="10");
3113 REGEX_ASSERT(fields[3]==",");
3114 REGEX_ASSERT(fields[4]=="20");
3119 // split of a UText based string, with library allocating output UTexts.
3122 status = U_ZERO_ERROR;
3123 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3124 UnicodeString stringToSplit("first:second:third");
3125 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
// All output slots start out NULL; after the call the first five hold UTexts
// (three pieces plus the two captured ":" delimiters) and slot 5 is still NULL.
3128 UText *splits[10] = {NULL};
3129 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3131 REGEX_ASSERT(numFields == 5);
3132 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3134 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3135 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3136 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3137 REGEX_ASSERT(splits[5] == NULL);
// Close the output UTexts held in splits[].
3139 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3141 utext_close(splits[i]);
3145 utext_close(textToSplit);
3150 // RegexPattern::pattern() and patternText()
// A default-constructed pattern reports an empty pattern string and text.
3152 pat1 = new RegexPattern();
3153 REGEX_ASSERT(pat1->pattern() == "");
3154 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3156 const char *helloWorldInvariant = "(Hello, world)*";
3157 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3158 pat1 = RegexPattern::compile(&re1, pe, status);
3160 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3161 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3168 //---------------------------------------------------------------------------
3170 // Extended A more thorough check for features of regex patterns
3171 // The test cases are in a separate data file,
3172 // source/tests/testdata/regextst.txt
3173 // A description of the test data format is included in that file.
3175 //---------------------------------------------------------------------------
// getPath: builds the path of a test-data file by copying the directory
// returned by IntlTest::getSourceTestData() into buffer and appending
// filename to it.
//   buffer   - caller-supplied output array (declared as char[2048]).
//   filename - name appended directly after the directory string.
// If getSourceTestData() reports a failure, an error is logged with errln().
//
// NOTE(review): strcpy/strcat do not check the 2048-byte capacity, so the
// combined directory + filename length must be known to fit -- confirm.
// NOTE(review): the return type line, the return statements and the closing
// braces are not visible in this listing (the embedded line numbers skip).
3178 RegexTest::getPath(char buffer[2048], const char *filename) {
3179 UErrorCode status=U_ZERO_ERROR;
3180 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3181 if (U_FAILURE(status)) {
3182 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3186 strcpy(buffer, testDataDirectory);
3187 strcat(buffer, filename);
// Data-driven regex test. Reads regextst.txt, and for every line that is not
// blank or comment-only splits it into three fields - a quoted pattern, a run
// of flag letters, and a quoted match string - then passes them to regex_find().
3191 void RegexTest::Extended() {
3193 const char *srcPath;
3194 UErrorCode status = U_ZERO_ERROR;
3195 int32_t lineNum = 0;
3198 // Open and read the test data file.
3200 srcPath=getPath(tdd, "regextst.txt");
3202 return; /* something went wrong, error already output */
// The file is converted to UTF-16; "utf-8" is the encoding passed as the default.
3206 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3207 if (U_FAILURE(status)) {
3208 return; /* something went wrong, error already output */
3212 // Put the test data into a UnicodeString
// NOTE(review): presumably the aliasing (non-copying) constructor, in which case
// testData must outlive testString - confirm against the UnicodeString API.
3214 UnicodeString testString(FALSE, testData, len);
// quotedStuffMat: optional whitespace, an opening ' " or / delimiter (group 1),
// the lazily matched contents (group 2), then the same delimiter again.
3216 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
// commentMat: optional whitespace followed by an optional #-comment up to end of input.
3217 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
// flagsMat: group 1 is the run of recognized flag characters; a non-empty
// group 2 is reported below as a bad flag.
3218 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
// lineMat: one match per input line; group 1 is the line without its terminator.
3220 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3221 UnicodeString testPattern; // The pattern for test from the test file.
3222 UnicodeString testFlags; // the flags for a test.
3223 UnicodeString matchString; // The marked up string to be used as input
// A single status check covers construction of all four matchers above.
3225 if (U_FAILURE(status)){
3226 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3232 // Loop over the test data file, once per line.
3234 while (lineMat.find()) {
// Report any error left in status, then clear it before handling this line.
3236 if (U_FAILURE(status)) {
3237 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3240 status = U_ZERO_ERROR;
3241 UnicodeString testLine = lineMat.group(1, status);
3242 if (testLine.length() == 0) {
3247 // Parse the test line. Skip blank and comment only lines.
3248 // Separate out the three main fields - pattern, flags, target.
3251 commentMat.reset(testLine);
3252 if (commentMat.lookingAt(status)) {
3253 // This line is a comment, or blank.
3258 // Pull out the pattern field, remove it from the test file line.
3260 quotedStuffMat.reset(testLine);
3261 if (quotedStuffMat.lookingAt(status)) {
3262 testPattern = quotedStuffMat.group(2, status);
// Drop everything consumed so far, including leading whitespace and both delimiters.
3263 testLine.remove(0, quotedStuffMat.end(0, status));
3265 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3271 // Pull out the flags from the test file line.
3273 flagsMat.reset(testLine);
3274 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3275 testFlags = flagsMat.group(1, status);
3276 if (flagsMat.group(2, status).length() > 0) {
3277 errln("Bad Match flag at line %d. Scanning %c\n",
3278 lineNum, flagsMat.group(2, status).charAt(0));
3281 testLine.remove(0, flagsMat.end(0, status));
3284 // Pull out the match string, as a whole.
3285 // We'll process the <tags> later.
3287 quotedStuffMat.reset(testLine);
3288 if (quotedStuffMat.lookingAt(status)) {
3289 matchString = quotedStuffMat.group(2, status);
3290 testLine.remove(0, quotedStuffMat.end(0, status));
3292 errln("Bad match string at test file line %d", lineNum);
3297 // The only thing left from the input line should be an optional trailing comment.
3299 commentMat.reset(testLine);
3300 if (commentMat.lookingAt(status) == FALSE) {
3301 errln("Line %d: unexpected characters at end of test line.", lineNum);
// All three fields parsed cleanly: run the actual test for this line.
3308 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3317 //---------------------------------------------------------------------------
3319 // regex_find(pattern, flags, inputString, lineNumber)
3321 // Function to run a single test from the Extended (data driven) tests.
3322 // See file test/testdata/regextst.txt for a description of the
3323 // pattern and inputString fields, and the allowed flags.
3324 // lineNumber is the source line in regextst.txt of the test.
3326 //---------------------------------------------------------------------------
3329 // Set a value into a UVector at position specified by a decimal number in
3330 // a UnicodeString. This is a utility function needed by the actual test function,
// Store `val` in `vec` at the position given by the decimal digits in `index`.
//   vec    vector to update; it is grown as needed
//   val    value to store
//   index  string of decimal digits naming the slot
// NOTE(review): the accumulation of the digit values into idx is presumably a
// plain base-10 conversion - confirm.
3332 static void set(UVector &vec, int32_t val, UnicodeString index) {
3333 UErrorCode status=U_ZERO_ERROR;
// Walk the index string one UTF-16 unit at a time, taking each unit's digit value.
3335 for (int32_t i=0; i<index.length(); i++) {
3336 int32_t d=u_charDigitValue(index.charAt(i));
// Pad with -1 until slot idx exists, then overwrite that slot.
3340 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3341 vec.setElementAt(val, idx);
// Store `val` in `vec` at integer position `idx`, growing the vector first if needed.
// Slots created by the growth are filled with -1.
3344 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3345 UErrorCode status=U_ZERO_ERROR;
// Pad with -1 until slot idx exists.
3346 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3347 vec.setElementAt(val, idx);
// Translate an offset into the UnicodeString form of the input into the
// corresponding native index of `utext`.
//   utext         text to walk; its iteration position is changed by this call
//   unistrOffset  offset in the UnicodeString form of the same text
//   nativeIndex   receives the native index reached
// Callers treat a FALSE result as "no native index could be found".
3350 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3352 UBool couldFind = TRUE;
// Always start the walk from the beginning of the text.
3353 UTEXT_SETNATIVEINDEX(utext, 0);
// Step forward one code point at a time until the requested offset is reached.
// NOTE(review): i presumably advances by each code point's UTF-16 length, and a
// U_SENTINEL result presumably marks failure - confirm.
3355 while (i < unistrOffset) {
3356 UChar32 c = UTEXT_NEXT32(utext);
3357 if (c != U_SENTINEL) {
// Report wherever the walk ended up.
3364 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
// Run one test case from the data-driven Extended tests.
//   pattern      the regular expression under test
//   flags        test flag letters; they select compile options, the match
//                function to use, and which results are checked
//   inputString  escaped input text; <n>...</n> tags mark expected capture
//                group boundaries and <r>...</r> marks the matching region
//   srcPath      path of the test data file, used only in messages
//   line         line number of the test in the data file, used only in messages
// Each case is run on the UnicodeString input and, when the pattern and input
// convert to UTF-8, again through UText on the UTF-8 form. Expected positions
// for the UTF-8 run are the native indices corresponding to the tag offsets.
3369 void RegexTest::regex_find(const UnicodeString &pattern,
3370 const UnicodeString &flags,
3371 const UnicodeString &inputString,
3372 const char *srcPath,
3374 UnicodeString unEscapedInput;
3375 UnicodeString deTaggedInput;
3377 int32_t patternUTF8Length, inputUTF8Length;
3378 char *patternChars = NULL, *inputChars = NULL;
3379 UText patternText = UTEXT_INITIALIZER;
3380 UText inputText = UTEXT_INITIALIZER;
3381 UConverter *UTF8Converter = NULL;
3383 UErrorCode status = U_ZERO_ERROR;
3385 RegexPattern *parsePat = NULL;
3386 RegexMatcher *parseMatcher = NULL;
3387 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3388 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
// Expected group boundaries, indexed by group number; -1 marks an unset slot.
3389 UVector groupStarts(status);
3390 UVector groupEnds(status);
3391 UVector groupStartsUTF8(status);
3392 UVector groupEndsUTF8(status);
3393 UBool isMatch = FALSE, isUTF8Match = FALSE;
3394 UBool failed = FALSE;
3397 UBool useMatchesFunc = FALSE;
3398 UBool useLookingAtFunc = FALSE;
// -1 means no <r> tag was seen for that end of the region.
3399 int32_t regionStart = -1;
3400 int32_t regionEnd = -1;
3401 int32_t regionStartUTF8 = -1;
3402 int32_t regionEndUTF8 = -1;
3406 // Compile the caller's pattern
// Flag letters are compared by code point value rather than char literal.
3408 uint32_t bflags = 0;
3409 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3410 bflags |= UREGEX_CASE_INSENSITIVE;
3412 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3413 bflags |= UREGEX_COMMENTS;
3415 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3416 bflags |= UREGEX_DOTALL;
3418 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3419 bflags |= UREGEX_MULTILINE;
3422 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3423 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3425 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3426 bflags |= UREGEX_UNIX_LINES;
3428 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3429 bflags |= UREGEX_LITERAL;
3433 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3434 if (status != U_ZERO_ERROR) {
3435 #if UCONFIG_NO_BREAK_ITERATION==1
3436 // 'v' test flag means that the test pattern should not compile if ICU was configured
3437 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3438 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3439 goto cleanupAndReturn;
3442 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3443 // Expected pattern compilation error.
3444 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3445 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3447 goto cleanupAndReturn;
3449 // Unexpected pattern compilation error.
3450 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3451 goto cleanupAndReturn;
// Second compile of the same pattern, this time from a UTF-8 UText.
// The converter is set to stop on unconvertible input instead of substituting.
3455 UTF8Converter = ucnv_open("UTF8", &status);
3456 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
// Preflight to get the UTF-8 length, then convert into a buffer of that size + 1.
3458 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3459 status = U_ZERO_ERROR; // buffer overflow
3460 patternChars = new char[patternUTF8Length+1];
3461 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3462 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3464 if (status == U_ZERO_ERROR) {
3465 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3467 if (status != U_ZERO_ERROR) {
3468 #if UCONFIG_NO_BREAK_ITERATION==1
3469 // 'v' test flag means that the test pattern should not compile if ICU was configured
3470 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3471 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3472 goto cleanupAndReturn;
3475 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3476 // Expected pattern compilation error.
3477 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3478 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3480 goto cleanupAndReturn;
3482 // Unexpected pattern compilation error.
3483 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3484 goto cleanupAndReturn;
3489 if (UTF8Pattern == NULL) {
3490 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3491 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3492 status = U_ZERO_ERROR;
3495 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3496 callerPattern->dumpPattern();
// Reaching this point with 'E' set means the expected compile failure did not happen.
3499 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3500 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3501 goto cleanupAndReturn;
3506 // Number of times find() should be called on the test string, default to 1
// A digit flag 2-9 overrides the default; more than one digit flag is an error.
3509 for (i=2; i<=9; i++) {
3510 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3511 if (numFinds != 1) {
3512 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3513 goto cleanupAndReturn;
3519 // 'M' flag. Use matches() instead of find()
3520 if (flags.indexOf((UChar)0x4d) >= 0) {
3521 useMatchesFunc = TRUE;
// 'L' flag. Use lookingAt() instead of find()
3523 if (flags.indexOf((UChar)0x4c) >= 0) {
3524 useLookingAtFunc = TRUE;
3528 // Find the tags in the input data, remove them, and record the group boundary
// The tag pattern matches <n>, </n>, <r> and </r>; group 1 is the optional '/',
// group 2 is either "r" or the decimal group number.
3531 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3532 REGEX_CHECK_STATUS_L(line);
3534 unEscapedInput = inputString.unescape();
3535 parseMatcher = parsePat->matcher(unEscapedInput, status);
3536 REGEX_CHECK_STATUS_L(line);
3537 while(parseMatcher->find()) {
// Copy the text preceding the tag and drop the tag itself, so that
// deTaggedInput.length() is the tag's offset in the tag-free input.
3538 parseMatcher->appendReplacement(deTaggedInput, "", status);
3540 UnicodeString groupNum = parseMatcher->group(2, status);
3541 if (groupNum == "r") {
3542 // <r> or </r>, a region specification within the string
3543 if (parseMatcher->group(1, status) == "/") {
3544 regionEnd = deTaggedInput.length();
3546 regionStart = deTaggedInput.length();
3549 // <digits> or </digits>, a group match boundary tag.
3550 if (parseMatcher->group(1, status) == "/") {
3551 set(groupEnds, deTaggedInput.length(), groupNum);
3553 set(groupStarts, deTaggedInput.length(), groupNum);
3557 parseMatcher->appendTail(deTaggedInput);
3558 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
// If either region tag is present, both must be, and in the right order.
3559 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3560 errln("mismatched <r> tags");
3562 goto cleanupAndReturn;
3566 // Configure the matcher according to the flags specified with this test.
3568 matcher = callerPattern->matcher(deTaggedInput, status);
3569 REGEX_CHECK_STATUS_L(line);
3570 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3571 matcher->setTrace(TRUE);
// Build the UTF-8 matcher over a UTF-8 copy of the tag-free input,
// using the same preflight-then-convert sequence as for the pattern.
3574 if (UTF8Pattern != NULL) {
3575 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3576 status = U_ZERO_ERROR; // buffer overflow
3577 inputChars = new char[inputUTF8Length+1];
3578 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3579 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3581 if (status == U_ZERO_ERROR) {
3582 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3583 REGEX_CHECK_STATUS_L(line);
3586 if (UTF8Matcher == NULL) {
3587 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3588 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3589 status = U_ZERO_ERROR;
3594 // Generate native indices for UTF8 versions of region and capture group info
3596 if (UTF8Matcher != NULL) {
3597 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3598 UTF8Matcher->setTrace(TRUE);
3600 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3601 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3603 // Fill out the native index UVector info.
3604 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3605 for (i=0; i<groupStarts.size(); i++) {
3606 int32_t start = groupStarts.elementAti(i);
3607 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3610 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3611 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3613 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3615 setInt(groupStartsUTF8, startUTF8, i);
3618 int32_t end = groupEnds.elementAti(i);
3619 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3622 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3623 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3625 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3627 setInt(groupEndsUTF8, endUTF8, i);
// Apply the region, if the input carried <r> tags, to both matchers.
3632 if (regionStart>=0) {
3633 matcher->region(regionStart, regionEnd, status);
3634 REGEX_CHECK_STATUS_L(line);
3635 if (UTF8Matcher != NULL) {
3636 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3637 REGEX_CHECK_STATUS_L(line);
3640 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3641 matcher->useAnchoringBounds(FALSE);
3642 if (UTF8Matcher != NULL) {
3643 UTF8Matcher->useAnchoringBounds(FALSE);
3646 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3647 matcher->useTransparentBounds(TRUE);
3648 if (UTF8Matcher != NULL) {
3649 UTF8Matcher->useTransparentBounds(TRUE);
3656 // Do a find on the de-tagged input using the caller's pattern
3657 // TODO: error on count>1 and not find().
3658 // error on both matches() and lookingAt().
3660 for (i=0; i<numFinds; i++) {
3661 if (useMatchesFunc) {
3662 isMatch = matcher->matches(status);
3663 if (UTF8Matcher != NULL) {
3664 isUTF8Match = UTF8Matcher->matches(status);
3666 } else if (useLookingAtFunc) {
3667 isMatch = matcher->lookingAt(status);
3668 if (UTF8Matcher != NULL) {
3669 isUTF8Match = UTF8Matcher->lookingAt(status);
3672 isMatch = matcher->find();
3673 if (UTF8Matcher != NULL) {
3674 isUTF8Match = UTF8Matcher->find();
3678 matcher->setTrace(FALSE);
3680 UTF8Matcher->setTrace(FALSE);
3682 if (U_FAILURE(status)) {
// The format has two conversions (%d then %s), so `line` must be passed first.
3683 errln("Error at line %d. ICU ErrorCode is %s", line, u_errorName(status));
3687 // Match up the groups from the find() with the groups from the tags
3690 // number of tags should match number of groups from find operation.
3691 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3692 // G option in test means that capture group data is not available in the
3693 // expected results, so the check needs to be suppressed.
// An empty groupStarts means the test data had no group tags, i.e. no match is expected.
3694 if (isMatch == FALSE && groupStarts.size() != 0) {
3695 dataerrln("Error at line %d: Match expected, but none found.", line);
3697 goto cleanupAndReturn;
3698 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3699 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3701 goto cleanupAndReturn;
3703 if (isMatch && groupStarts.size() == 0) {
3704 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3707 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3708 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3712 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3713 // Only check for match / no match. Don't check capture groups.
3714 goto cleanupAndReturn;
3717 REGEX_CHECK_STATUS_L(line);
// Compare every group's start and end with the expected values; groups with
// no recorded expectation are expected to report -1.
3718 for (i=0; i<=matcher->groupCount(); i++) {
3719 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3720 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3721 if (matcher->start(i, status) != expectedStart) {
3722 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3723 line, i, expectedStart, matcher->start(i, status));
3725 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3726 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3727 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3728 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3730 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3733 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3734 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3735 if (matcher->end(i, status) != expectedEnd) {
3736 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3737 line, i, expectedEnd, matcher->end(i, status));
3739 // Error on end position; keep going; real error is probably yet to come as group
3740 // end positions work from end of the input data towards the front.
3741 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3742 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3743 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3745 // Error on end position; keep going; real error is probably yet to come as group
3746 // end positions work from end of the input data towards the front.
// The test data must not tag more groups than the pattern defines.
3749 if ( matcher->groupCount()+1 < groupStarts.size()) {
3750 errln("Error at line %d: Expected %d capture groups, found %d.",
3751 line, groupStarts.size()-1, matcher->groupCount());
3754 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3755 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3756 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3760 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3761 matcher->requireEnd() == TRUE) {
3762 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3764 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3765 UTF8Matcher->requireEnd() == TRUE) {
3766 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3770 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3771 matcher->requireEnd() == FALSE) {
3772 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3774 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3775 UTF8Matcher->requireEnd() == FALSE) {
3776 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3780 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3781 matcher->hitEnd() == TRUE) {
3782 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3784 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3785 UTF8Matcher->hitEnd() == TRUE) {
3786 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3790 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3791 matcher->hitEnd() == FALSE) {
3792 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3794 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3795 UTF8Matcher->hitEnd() == FALSE) {
3796 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
// Echo the pattern, flags and input of the test case.
3803 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3804 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3805 // callerPattern->dump();
// Release everything allocated above. Each UText is closed before the char
// buffer it was opened on is freed.
3807 delete parseMatcher;
3812 delete callerPattern;
3814 utext_close(&inputText);
3815 delete[] inputChars;
3816 utext_close(&patternText);
3817 delete[] patternChars;
3818 ucnv_close(UTF8Converter);
3824 //---------------------------------------------------------------------------
3826 // Errors Check for error handling in patterns.
3828 //---------------------------------------------------------------------------
// Check that malformed patterns are rejected with the expected error.
// NOTE(review): REGEX_ERR is a macro defined elsewhere; its arguments appear to
// be (pattern, expected error line, expected error offset, expected status) -
// the two-line pattern "abc\ndef(*2)" below expects line 2 - confirm.
3829 void RegexTest::Errors() {
3830 // \escape sequences that aren't implemented yet.
3831 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3833 // Missing close parentheses
3834 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3835 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3836 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3838 // Extra close paren
3839 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3840 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
// Unclosed open parens report the same error code as extra close parens.
3841 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3843 // Look-ahead, Look-behind
3844 // TODO: add tests for unbounded length look-behinds.
3845 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3847 // Attempt to use non-default flags
// Compiling with a flag combination that includes UREGEX_CANON_EQ is expected
// to fail with U_REGEX_UNIMPLEMENTED.
3850 UErrorCode status = U_ZERO_ERROR;
3851 int32_t flags = UREGEX_CANON_EQ |
3852 UREGEX_COMMENTS | UREGEX_DOTALL |
3854 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3855 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3860 // Quantifiers are allowed only after something that can be quantified.
3861 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3862 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3863 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3865 // Mal-formed {min,max} quantifiers
3866 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3867 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3868 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3869 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3870 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3871 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3872 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3873 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3874 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
// A quantifier at the very start of the pattern has nothing to apply to.
3877 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3879 // Invalid Back Reference \0
3880 // For ICU 3.8 and earlier
3881 // For ICU versions newer than 3.8, \0 introduces an octal escape.
// With \0 read as the start of an octal escape, the pattern below is expected
// to fail as a bad escape sequence.
3883 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3888 //-------------------------------------------------------------------------------
3890 // Read a text data file, convert it to UChars, and return the data
3891 // in one big UChar * buffer, which the caller must delete.
3893 //--------------------------------------------------------------------------------
// Read a whole file, convert it to UChars and return the converted buffer.
//   fileName     path of the file to read (opened in binary mode)
//   ulen         receives the length reported by ucnv_toUChars
//   defEncoding  encoding to assume when the data carries no Unicode signature
//   status       ICU error code, in/out; set to U_FILE_ACCESS_ERROR when the
//                file cannot be opened
// The result buffer is allocated with new UChar[]; the comment preceding this
// function says the caller must delete it.
3894 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3895 const char *defEncoding, UErrorCode &status) {
3896 UChar *retPtr = NULL;
3897 char *fileBuf = NULL;
3898 UConverter* conv = NULL;
3902 if (U_FAILURE(status)) {
3909 f = fopen(fileName, "rb");
3911 dataerrln("Error opening test data file %s\n", fileName);
3912 status = U_FILE_ACCESS_ERROR;
// Determine the file size by seeking to the end, then read the whole file.
// NOTE(review): the buffer is allocated from fileSize before fileSize is
// validated; ftell() reports failure as -1, and the <= 0 check only happens
// after the allocation and read.
3921 fseek( f, 0, SEEK_END);
3922 fileSize = ftell(f);
3923 fileBuf = new char[fileSize];
3924 fseek(f, 0, SEEK_SET);
3925 amt_read = fread(fileBuf, 1, fileSize, f);
// A short read and an empty file are both treated as errors.
3926 if (amt_read != fileSize || fileSize <= 0) {
3927 errln("Error reading test data file.");
3928 goto cleanUpAndReturn;
3932 // Look for a Unicode Signature (BOM) on the data just read
3934 int32_t signatureLength;
3935 const char * fileBufC;
3936 const char* encoding;
3939 encoding = ucnv_detectUnicodeSignature(
3940 fileBuf, fileSize, &signatureLength, &status);
// Signature found: skip over it and convert only the bytes that follow.
3941 if(encoding!=NULL ){
3942 fileBufC += signatureLength;
3943 fileSize -= signatureLength;
// No signature: fall back to the caller's default encoding. A default of
// "utf-8" without a signature is reported as an error.
3945 encoding = defEncoding;
3946 if (strcmp(encoding, "utf-8") == 0) {
3947 errln("file %s is missing its BOM", fileName);
3952 // Open a converter to take the rule file to UTF-16
3954 conv = ucnv_open(encoding, &status);
3955 if (U_FAILURE(status)) {
3956 goto cleanUpAndReturn;
3960 // Convert the rules to UChar.
3961 // Preflight first to determine required buffer size.
3963 ulen = ucnv_toUChars(conv,
3969 if (status == U_BUFFER_OVERFLOW_ERROR) {
3970 // Buffer Overflow is expected from the preflight operation.
3971 status = U_ZERO_ERROR;
// One extra UChar beyond the preflighted length.
3973 retPtr = new UChar[ulen+1];
3986 if (U_FAILURE(status)) {
3987 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3996 //-------------------------------------------------------------------------------
3998 // PerlTests - Run Perl's regular expression tests
3999 // The input file for this test is re_tests, the standard regular
4000 // expression test data distributed with the Perl source code.
4002 // Here is Perl's description of the test data file:
4004 // # The tests are in a separate file 't/op/re_tests'.
4005 // # Each line in that file is a separate test.
4006 // # There are five columns, separated by tabs.
4008 // # Column 1 contains the pattern, optionally enclosed in C<''>.
4009 // # Modifiers can be put after the closing C<'>.
4011 // # Column 2 contains the string to be matched.
4013 // # Column 3 contains the expected result:
4014 // # y expect a match
4015 // # n expect no match
4016 // # c expect an error
4017 // # B test exposes a known bug in Perl, should be skipped
4018 // # b test exposes a known bug in Perl, should be skipped if noamp
4020 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4022 // # Column 4 contains a string, usually C<$&>.
4024 // # Column 5 contains the expected result of double-quote
4025 // # interpolating that string after the match, or start of error message.
4027 // # Column 6, if present, contains a reason why the test is skipped.
4028 // # This is printed with "skipped", for harness to pick up.
4030 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4032 // # If you want to add a regular expression test that can't be expressed
4033 // # in this format, don't add it here: put it in op/pat.t instead.
4035 // For ICU, if field 3 contains an 'i', the test will be skipped.
4036 // The test exposes some known incompatibility between ICU and Perl regexps.
4037 // (The i is in addition to whatever was there before.)
4039 //-------------------------------------------------------------------------------
4040 void RegexTest::PerlTests() {
4042 const char *srcPath;
4043 UErrorCode status = U_ZERO_ERROR;
4047 // Open and read the test data file.
4049 srcPath=getPath(tdd, "re_tests.txt");
4051 return; /* something went wrong, error already output */
4055 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4056 if (U_FAILURE(status)) {
4057 return; /* something went wrong, error already output */
4061 // Put the test data into a UnicodeString
4063 UnicodeString testDataString(FALSE, testData, len);
4066 // Regex to break the input file into lines, and strip the new lines.
4067 // One line per match, capture group one is the desired data.
4069 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4070 if (U_FAILURE(status)) {
4071 dataerrln("RegexPattern::compile() error");
4074 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4077 // Regex to split a test file line into fields.
4078 // There are six fields, separated by tabs.
4080 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4083 // Regex to identify test patterns with flag settings, and to separate them.
4084 // Test patterns with flags look like 'pattern'i
4085 // Test patterns without flags are not quoted: pattern
4086 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4088 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4089 RegexMatcher* flagMat = flagPat->matcher(status);
4092 // The Perl tests reference several perl-isms, which are evaluated/substituted
4093 // in the test data. Not being perl, this must be done explicitly. Here
4094 // are string constants and REs for these constructs.
4096 UnicodeString nulnulSrc("${nulnul}");
4097 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4098 nulnul = nulnul.unescape();
4100 UnicodeString ffffSrc("${ffff}");
4101 UnicodeString ffff("\\uffff", -1, US_INV);
4102 ffff = ffff.unescape();
4104 // regexp for $-[0], $+[2], etc.
4105 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4106 RegexMatcher *groupsMat = groupsPat->matcher(status);
4108 // regexp for $0, $1, $2, etc.
4109 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4110 RegexMatcher *cgMat = cgPat->matcher(status);
4114 // Main Loop for the Perl Tests, runs once per line from the
4117 int32_t lineNum = 0;
4118 int32_t skippedUnimplementedCount = 0;
4119 while (lineMat->find()) {
4123 // Get a line, break it into its fields, do the Perl
4124 // variable substitutions.
4126 UnicodeString line = lineMat->group(1, status);
4127 UnicodeString fields[7];
4128 fieldPat->split(line, fields, 7, status);
4130 flagMat->reset(fields[0]);
4131 flagMat->matches(status);
4132 UnicodeString pattern = flagMat->group(2, status);
4133 pattern.findAndReplace("${bang}", "!");
4134 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4135 pattern.findAndReplace(ffffSrc, ffff);
4138 // Identify patterns that include match flag settings,
4139 // split off the flags, remove the extra quotes.
4141 UnicodeString flagStr = flagMat->group(3, status);
4142 if (U_FAILURE(status)) {
4143 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4147 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4148 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4149 const UChar UChar_m = 0x6d;
4150 const UChar UChar_x = 0x78;
4151 const UChar UChar_y = 0x79;
4152 if (flagStr.indexOf(UChar_i) != -1) {
4153 flags |= UREGEX_CASE_INSENSITIVE;
4155 if (flagStr.indexOf(UChar_m) != -1) {
4156 flags |= UREGEX_MULTILINE;
4158 if (flagStr.indexOf(UChar_x) != -1) {
4159 flags |= UREGEX_COMMENTS;
4163 // Compile the test pattern.
4165 status = U_ZERO_ERROR;
4166 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4167 if (status == U_REGEX_UNIMPLEMENTED) {
4169 // Test of a feature that is planned for ICU, but not yet implemented.
4171 skippedUnimplementedCount++;
4173 status = U_ZERO_ERROR;
4177 if (U_FAILURE(status)) {
4178 // Some tests are supposed to generate errors.
4179 // Only report an error for tests that are supposed to succeed.
4180 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4181 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4183 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4185 status = U_ZERO_ERROR;
4190 if (fields[2].indexOf(UChar_i) >= 0) {
4191 // ICU should skip this test.
4196 if (fields[2].indexOf(UChar_c) >= 0) {
4197 // This pattern should have caused a compilation error, but didn't.
4198 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4204 // replace the Perl variables that appear in some of the
4205 // match data strings.
4207 UnicodeString matchString = fields[1];
4208 matchString.findAndReplace(nulnulSrc, nulnul);
4209 matchString.findAndReplace(ffffSrc, ffff);
4211 // Replace any \n in the match string with an actual new-line char.
4212 // Don't do full unescape, as this unescapes more than Perl does, which
4213 // causes other spurious failures in the tests.
4214 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4219 // Run the test, check for expected match/don't match result.
4221 RegexMatcher *testMat = testPat->matcher(matchString, status);
4222 UBool found = testMat->find();
4223 UBool expected = FALSE;
4224 if (fields[2].indexOf(UChar_y) >=0) {
4227 if (expected != found) {
4228 errln("line %d: Expected %smatch, got %smatch",
4229 lineNum, expected?"":"no ", found?"":"no " );
4233 // Don't try to check expected results if there is no match.
4234 // (Some have stuff in the expected fields)
4242 // Interpret the Perl expression from the fourth field of the data file,
4243 // building up an ICU string from the results of the ICU match.
4244 // The Perl expression will contain references to the results of
4245 // a regex match, including the matched string, capture group strings,
4246 // group starting and ending indices, etc.
4248 UnicodeString resultString;
4249 UnicodeString perlExpr = fields[3];
4250 #if SUPPORT_MUTATING_INPUT_STRING
4251 groupsMat->reset(perlExpr);
4252 cgMat->reset(perlExpr);
4255 while (perlExpr.length() > 0) {
4256 #if !SUPPORT_MUTATING_INPUT_STRING
4257 // Preferred usage. Reset after any modification to input string.
4258 groupsMat->reset(perlExpr);
4259 cgMat->reset(perlExpr);
4262 if (perlExpr.startsWith("$&")) {
4263 resultString.append(testMat->group(status));
4264 perlExpr.remove(0, 2);
4267 else if (groupsMat->lookingAt(status)) {
4269 UnicodeString digitString = groupsMat->group(2, status);
4271 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4272 UnicodeString plusOrMinus = groupsMat->group(1, status);
4273 int32_t matchPosition;
4274 if (plusOrMinus.compare("+") == 0) {
4275 matchPosition = testMat->end(groupNum, status);
4277 matchPosition = testMat->start(groupNum, status);
4279 if (matchPosition != -1) {
4280 ICU_Utility::appendNumber(resultString, matchPosition);
4282 perlExpr.remove(0, groupsMat->end(status));
4285 else if (cgMat->lookingAt(status)) {
4287 UnicodeString digitString = cgMat->group(1, status);
4289 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4290 if (U_SUCCESS(status)) {
4291 resultString.append(testMat->group(groupNum, status));
4292 status = U_ZERO_ERROR;
4294 perlExpr.remove(0, cgMat->end(status));
4297 else if (perlExpr.startsWith("@-")) {
4299 for (i=0; i<=testMat->groupCount(); i++) {
4301 resultString.append(" ");
4303 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4305 perlExpr.remove(0, 2);
4308 else if (perlExpr.startsWith("@+")) {
4310 for (i=0; i<=testMat->groupCount(); i++) {
4312 resultString.append(" ");
4314 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4316 perlExpr.remove(0, 2);
4319 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4320 // or as an escaped sequence (e.g. \n)
4321 if (perlExpr.length() > 1) {
4322 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4324 UChar c = perlExpr.charAt(0);
4326 case 'n': c = '\n'; break;
4327 // add any other escape sequences that show up in the test expected results.
4329 resultString.append(c);
4330 perlExpr.remove(0, 1);
4334 // Any characters from the perl expression that we don't explicitly
4335 // recognize before here are assumed to be literals and copied
4336 // as-is to the expected results.
4337 resultString.append(perlExpr.charAt(0));
4338 perlExpr.remove(0, 1);
4341 if (U_FAILURE(status)) {
4342 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4348 // Expected Results Compare
4350 UnicodeString expectedS(fields[4]);
4351 expectedS.findAndReplace(nulnulSrc, nulnul);
4352 expectedS.findAndReplace(ffffSrc, ffff);
4353 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4356 if (expectedS.compare(resultString) != 0) {
4357 err("Line %d: Incorrect perl expression results.", lineNum);
4358 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4366 // All done. Clean up allocated stuff.
4384 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4389 //-------------------------------------------------------------------------------
4391 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4392 // (instead of using UnicodeStrings) to test the alternate engine.
4393 // The input file for this test is re_tests, the standard regular
4394 // expression test data distributed with the Perl source code.
4395 // See PerlTests() for more information.
4397 //-------------------------------------------------------------------------------
4398 void RegexTest::PerlTestsUTF8() {
// Runs the Perl re_tests.txt data through the UText-based regex code paths:
// for every data line the pattern and the subject string are converted to
// UTF-8, wrapped in UTexts, compiled and matched, and the outcome is compared
// with the expectations recorded in the other fields of that line.
4400 const char *srcPath;
4401 UErrorCode status = U_ZERO_ERROR;
4403 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
// patternChars / inputChars are UTF-8 scratch buffers shared by all test
// lines. They start out empty (capacity 0), are regrown only when extract()
// reports an overflow, and are released once at the end of the function.
4404 UText patternText = UTEXT_INITIALIZER;
4405 char *patternChars = NULL;
4406 int32_t patternLength;
4407 int32_t patternCapacity = 0;
4408 UText inputText = UTEXT_INITIALIZER;
4409 char *inputChars = NULL;
4410 int32_t inputLength;
4411 int32_t inputCapacity = 0;
// Install the STOP from-Unicode callback on the converter; per the ICU
// converter documentation this makes unconvertible input end the conversion
// with an error instead of emitting a substitution character.
4413 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4416 // Open and read the test data file.
4418 srcPath=getPath(tdd, "re_tests.txt");
4420 return; /* something went wrong, error already output */
// The data file is read as ISO-8859-1 and converted to UChars.
4424 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4425 if (U_FAILURE(status)) {
4426 return; /* something went wrong, error already output */
4430 // Put the test data into a UnicodeString
// NOTE(review): this looks like the read-only aliasing UnicodeString
// constructor, in which case testData must outlive testDataString - confirm.
4432 UnicodeString testDataString(FALSE, testData, len);
4435 // Regex to break the input file into lines, and strip the new lines.
4436 // One line per match, capture group one is the desired data.
4438 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4439 if (U_FAILURE(status)) {
4440 dataerrln("RegexPattern::compile() error");
4443 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4446 // Regex to split a test file line into fields.
4447 // There are six fields, separated by tabs.
4449 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4452 // Regex to identify test patterns with flag settings, and to separate them.
4453 // Test patterns with flags look like 'pattern'i
4454 // Test patterns without flags are not quoted: pattern
4455 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4457 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4458 RegexMatcher* flagMat = flagPat->matcher(status);
4461 // The Perl tests reference several perl-isms, which are evaluated/substituted
4462 // in the test data. Not being perl, this must be done explicitly. Here
4463 // are string constants and REs for these constructs.
4465 UnicodeString nulnulSrc("${nulnul}");
4466 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4467 nulnul = nulnul.unescape();
4469 UnicodeString ffffSrc("${ffff}");
4470 UnicodeString ffff("\\uffff", -1, US_INV);
4471 ffff = ffff.unescape();
4473 // regexp for $-[0], $+[2], etc.
4474 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4475 RegexMatcher *groupsMat = groupsPat->matcher(status);
4477 // regexp for $0, $1, $2, etc.
4478 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4479 RegexMatcher *cgMat = cgPat->matcher(status);
4483 // Main Loop for the Perl Tests, runs once per line from the
4486 int32_t lineNum = 0;
4487 int32_t skippedUnimplementedCount = 0;
4488 while (lineMat->find()) {
4492 // Get a line, break it into its fields, do the Perl
4493 // variable substitutions.
4495 UnicodeString line = lineMat->group(1, status);
4496 UnicodeString fields[7];
4497 fieldPat->split(line, fields, 7, status);
// fields[0] is the (possibly quoted and flagged) pattern; flagMat separates
// the pattern text (group 2) from any trailing flag letters (group 3).
4499 flagMat->reset(fields[0]);
4500 flagMat->matches(status);
4501 UnicodeString pattern = flagMat->group(2, status);
4502 pattern.findAndReplace("${bang}", "!");
// In the pattern, ${nulnul} is replaced by backslash-u escape text (left for
// the regex compiler to interpret) rather than by literal NUL characters as
// is done for the match string further down.
4503 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4504 pattern.findAndReplace(ffffSrc, ffff);
4507 // Identify patterns that include match flag settings,
4508 // split off the flags, remove the extra quotes.
4510 UnicodeString flagStr = flagMat->group(3, status);
// NOTE(review): the message below names ucnv_toUChars, but the status tested
// here was last passed to the split / flag-matcher calls above - the text
// looks copied from elsewhere; confirm before relying on it when debugging.
4511 if (U_FAILURE(status)) {
4512 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4516 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4517 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4518 const UChar UChar_m = 0x6d;
4519 const UChar UChar_x = 0x78;
4520 const UChar UChar_y = 0x79;
// Translate the Perl flag letters into the corresponding ICU regex flags.
4521 if (flagStr.indexOf(UChar_i) != -1) {
4522 flags |= UREGEX_CASE_INSENSITIVE;
4524 if (flagStr.indexOf(UChar_m) != -1) {
4525 flags |= UREGEX_MULTILINE;
4527 if (flagStr.indexOf(UChar_x) != -1) {
4528 flags |= UREGEX_COMMENTS;
4532 // Put the pattern in a UTF-8 UText
4534 status = U_ZERO_ERROR;
// First try to convert into the buffer left over from earlier lines. When it
// is too small, extract() sets U_BUFFER_OVERFLOW_ERROR and its return value
// is used as the required length: the buffer is regrown to that length plus
// one (presumably room for a terminating NUL) and the conversion repeated.
4535 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4536 if (status == U_BUFFER_OVERFLOW_ERROR) {
4537 status = U_ZERO_ERROR;
4538 delete[] patternChars;
4539 patternCapacity = patternLength + 1;
4540 patternChars = new char[patternCapacity];
4541 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4543 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4546 // Compile the test pattern.
4548 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4549 if (status == U_REGEX_UNIMPLEMENTED) {
4551 // Test of a feature that is planned for ICU, but not yet implemented.
4553 skippedUnimplementedCount++;
4555 status = U_ZERO_ERROR;
4559 if (U_FAILURE(status)) {
4560 // Some tests are supposed to generate errors.
4561 // Only report an error for tests that are supposed to succeed.
4562 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4563 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4565 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4567 status = U_ZERO_ERROR;
4572 if (fields[2].indexOf(UChar_i) >= 0) {
4573 // ICU should skip this test.
4578 if (fields[2].indexOf(UChar_c) >= 0) {
4579 // This pattern should have caused a compilation error, but didn't.
4580 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4587 // replace the Perl variables that appear in some of the
4588 // match data strings.
4590 UnicodeString matchString = fields[1];
4591 matchString.findAndReplace(nulnulSrc, nulnul);
4592 matchString.findAndReplace(ffffSrc, ffff);
4594 // Replace any \n in the match string with an actual new-line char.
4595 // Don't do full unescape, as this unescapes more than Perl does, which
4596 // causes other spurious failures in the tests.
4597 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4600 // Put the input in a UTF-8 UText
4602 status = U_ZERO_ERROR;
// Same convert / regrow-on-overflow / convert-again sequence as used for the
// pattern buffer above, applied to the subject string.
4603 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4604 if (status == U_BUFFER_OVERFLOW_ERROR) {
4605 status = U_ZERO_ERROR;
4606 delete[] inputChars;
4607 inputCapacity = inputLength + 1;
4608 inputChars = new char[inputCapacity];
4609 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4611 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4614 // Run the test, check for expected match/don't match result.
// The matcher is created without input and then reset onto the UTF-8 UText;
// the address of the object returned by reset() is what testMat holds.
4616 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4617 UBool found = testMat->find();
4618 UBool expected = FALSE;
4619 if (fields[2].indexOf(UChar_y) >=0) {
4622 if (expected != found) {
4623 errln("line %d: Expected %smatch, got %smatch",
4624 lineNum, expected?"":"no ", found?"":"no " );
4628 // Don't try to check expected results if there is no match.
4629 // (Some have stuff in the expected fields)
4637 // Interpret the Perl expression from the fourth field of the data file,
4638 // building up an ICU string from the results of the ICU match.
4639 // The Perl expression will contain references to the results of
4640 // a regex match, including the matched string, capture group strings,
4641 // group starting and ending indices, etc.
4643 UnicodeString resultString;
4644 UnicodeString perlExpr = fields[3];
// perlExpr is consumed from the front, one construct per iteration:
//   $&            text of the whole match
//   $-[n], $+[n]  start / end offset of group n
//   $n            text of capture group n
//   @-, @+        start / end offsets of groups 0..groupCount()
//   backslash c   c taken literally (backslash n yields a newline)
//   anything else copied to the result unchanged
4646 while (perlExpr.length() > 0) {
// Both helper matchers are re-pointed at perlExpr on every pass because the
// string is shortened at the end of each branch.
4647 groupsMat->reset(perlExpr);
4648 cgMat->reset(perlExpr);
4650 if (perlExpr.startsWith("$&")) {
4651 resultString.append(testMat->group(status));
4652 perlExpr.remove(0, 2);
4655 else if (groupsMat->lookingAt(status)) {
4657 UnicodeString digitString = groupsMat->group(2, status);
4659 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4660 UnicodeString plusOrMinus = groupsMat->group(1, status);
4661 int32_t matchPosition;
4662 if (plusOrMinus.compare("+") == 0) {
4663 matchPosition = testMat->end(groupNum, status);
4665 matchPosition = testMat->start(groupNum, status);
// A position of -1 contributes nothing to the result string.
4667 if (matchPosition != -1) {
4668 ICU_Utility::appendNumber(resultString, matchPosition);
4670 perlExpr.remove(0, groupsMat->end(status));
4673 else if (cgMat->lookingAt(status)) {
4675 UnicodeString digitString = cgMat->group(1, status);
4677 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
// status is cleared right after the group() call - presumably so that a
// reference to a group the pattern does not have is tolerated; confirm.
4678 if (U_SUCCESS(status)) {
4679 resultString.append(testMat->group(groupNum, status));
4680 status = U_ZERO_ERROR;
4682 perlExpr.remove(0, cgMat->end(status));
4685 else if (perlExpr.startsWith("@-")) {
4687 for (i=0; i<=testMat->groupCount(); i++) {
4689 resultString.append(" ");
4691 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4693 perlExpr.remove(0, 2);
4696 else if (perlExpr.startsWith("@+")) {
4698 for (i=0; i<=testMat->groupCount(); i++) {
4700 resultString.append(" ");
4702 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4704 perlExpr.remove(0, 2);
4707 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4708 // or as an escaped sequence (e.g. \n)
4709 if (perlExpr.length() > 1) {
4710 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4712 UChar c = perlExpr.charAt(0);
4714 case 'n': c = '\n'; break;
4715 // add any other escape sequences that show up in the test expected results.
4717 resultString.append(c);
4718 perlExpr.remove(0, 1);
4722 // Any characters from the perl expression that we don't explicitly
4723 // recognize before here are assumed to be literals and copied
4724 // as-is to the expected results.
4725 resultString.append(perlExpr.charAt(0));
4726 perlExpr.remove(0, 1);
4729 if (U_FAILURE(status)) {
4730 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4736 // Expected Results Compare
// The expected text from fields[4] gets the same placeholder and newline
// substitutions as the match string before it is compared.
4738 UnicodeString expectedS(fields[4]);
4739 expectedS.findAndReplace(nulnulSrc, nulnul);
4740 expectedS.findAndReplace(ffffSrc, ffff);
4741 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4744 if (expectedS.compare(resultString) != 0) {
4745 err("Line %d: Incorrect perl expression results.", lineNum);
4746 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4754 // All done. Clean up allocated stuff.
// Close the stack-allocated UTexts before freeing the UTF-8 buffers they
// were opened on.
4771 utext_close(&patternText);
4772 utext_close(&inputText);
4774 delete [] patternChars;
4775 delete [] inputChars;
4778 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4783 //--------------------------------------------------------------
4785 // Bug6149 Verify limits to heap expansion for backtrack stack.
4786 // Use this pattern,
4787 // "(a?){1,8000000}"
4788 // Note: the upper bound was originally unbounded, but that case now has loop-breaking enabled.
4789 // This test is likely to be fragile, as further optimizations stop
4790 // more cases of pointless looping in the match engine.
4792 //---------------------------------------------------------------
4793 void RegexTest::Bug6149() {
// Regression test: matching a pattern with a huge bounded repeat must fail
// with U_REGEX_STACK_OVERFLOW rather than succeed.
4794 UnicodeString pattern("(a?){1,8000000}");
4795 UnicodeString s("xyz");
4797 UErrorCode status = U_ZERO_ERROR;
4799 RegexMatcher matcher(pattern, s, flags, status);
4800 UBool result = false;
// matches() is expected both to set status to U_REGEX_STACK_OVERFLOW and to
// report no match.
4801 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4802 REGEX_ASSERT(result == FALSE);
4807 // Callbacks() Test the callback function.
4808 // When set, callbacks occur periodically during matching operations,
4809 // giving the application code the ability to abort the operation
4810 // before its normal completion.
4813 struct callBackContext {
// Bookkeeping handed to testCallBackFn through the callback's context pointer.
// reset() sets the call limit (maxCalls) and zeroes the call counter and the
// last-seen step count.
4818 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
// Match callback installed by Callbacks().
//   context - the callBackContext supplied when the callback was set.
//   steps   - step count passed in by the match engine.
// Returns TRUE while numCalls is below maxCalls; Callbacks() expects a FALSE
// return to end the operation with U_REGEX_STOPPED_BY_CALLER.
4822 static UBool U_CALLCONV
4823 testCallBackFn(const void *context, int32_t steps) {
4824 callBackContext *info = (callBackContext *)context;
// Each invocation must carry a step count exactly one greater than the
// previous one; anything else is reported as a test failure.
4825 if (info->lastSteps+1 != steps) {
4826 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4828 info->lastSteps = steps;
4830 return (info->numCalls < info->maxCalls);
4834 void RegexTest::Callbacks() {
// Exercises RegexMatcher::setMatchCallback / getMatchCallback: first the
// getter on a matcher with no callback, then a callback-equipped matcher on
// inputs of increasing cost, ending with runs that the callback aborts.
4836 // Getter returns NULLs if no callback has been set
4838 // The variables that the getter will fill in.
4839 // Init to non-null values so that the action of the getter can be seen.
4840 const void *returnedContext = &returnedContext;
4841 URegexMatchCallback *returnedFn = &testCallBackFn;
4843 UErrorCode status = U_ZERO_ERROR;
4844 RegexMatcher matcher("x", 0, status);
4846 matcher.getMatchCallback(returnedFn, returnedContext, status);
4848 REGEX_ASSERT(returnedFn == NULL);
4849 REGEX_ASSERT(returnedContext == NULL);
// Install testCallBackFn with cbInfo as its context and verify that the
// getter hands back exactly that function and context.
4854 callBackContext cbInfo = {this, 0, 0, 0};
4855 const void *returnedContext;
4856 URegexMatchCallback *returnedFn;
4857 UErrorCode status = U_ZERO_ERROR;
4858 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4860 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4862 matcher.getMatchCallback(returnedFn, returnedContext, status);
4864 REGEX_ASSERT(returnedFn == testCallBackFn);
4865 REGEX_ASSERT(returnedContext == &cbInfo);
4867 // A short-running match shouldn't invoke the callback
4868 status = U_ZERO_ERROR;
4870 UnicodeString s = "xxx";
4872 REGEX_ASSERT(matcher.matches(status));
4874 REGEX_ASSERT(cbInfo.numCalls == 0);
4876 // A medium-length match that runs long enough to invoke the
4877 // callback, but not so long that the callback aborts it.
4878 status = U_ZERO_ERROR;
4880 s = "aaaaaaaaaaaaaaaaaaab";
4882 REGEX_ASSERT(matcher.matches(status)==FALSE);
4884 REGEX_ASSERT(cbInfo.numCalls > 0);
4886 // A longer running match that the callback function will abort.
4887 status = U_ZERO_ERROR;
4889 s = "aaaaaaaaaaaaaaaaaaaaaaab";
// An aborted matches() must report no match, set U_REGEX_STOPPED_BY_CALLER,
// and have invoked the callback exactly 4 times.
4891 REGEX_ASSERT(matcher.matches(status)==FALSE);
4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4893 REGEX_ASSERT(cbInfo.numCalls == 4);
4895 // A longer running find that the callback function will abort.
4896 status = U_ZERO_ERROR;
4898 s = "aaaaaaaaaaaaaaaaaaaaaaab";
// Same expectations as above, this time through find().
4900 REGEX_ASSERT(matcher.find(status)==FALSE);
4901 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4902 REGEX_ASSERT(cbInfo.numCalls == 4);
4910 // FindProgressCallbacks() Test the find "progress" callback function.
4911 // When set, the find progress callback will be invoked during find operations
4912 // after each return from a match attempt, giving the application the opportunity
4913 // to terminate a long-running find operation before its normal completion.
4916 struct progressCallBackContext {
// Bookkeeping handed to testProgressCallBackFn through the callback's context
// pointer. reset() sets the call limit (maxCalls) and zeroes the call counter
// and the last reported match index.
4921 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4924 // call-back function for find().
4925 // Return TRUE to continue the find().
4926 // Return FALSE to stop the find().
//   context    - the progressCallBackContext supplied when the callback was set.
//   matchIndex - index reported by find(); remembered in lastIndex so that
//                FindProgressCallbacks() can resume a stopped find from it.
// Returns TRUE while numCalls is below maxCalls, FALSE otherwise.
4928 static UBool U_CALLCONV
4929 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4930 progressCallBackContext *info = (progressCallBackContext *)context;
4932 info->lastIndex = matchIndex;
4933 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4934 return (info->numCalls < info->maxCalls);
4938 void RegexTest::FindProgressCallbacks() {
// Exercises RegexMatcher::setFindProgressCallback / getFindProgressCallback:
// the getter on a matcher with no callback, then finds that never trigger the
// callback, trigger it without being stopped, are stopped by it, and finally
// are stopped and then resumed from the last reported index.
4940 // Getter returns NULLs if no callback has been set
4942 // The variables that the getter will fill in.
4943 // Init to non-null values so that the action of the getter can be seen.
4944 const void *returnedContext = &returnedContext;
4945 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4947 UErrorCode status = U_ZERO_ERROR;
4948 RegexMatcher matcher("x", 0, status);
4950 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4952 REGEX_ASSERT(returnedFn == NULL);
4953 REGEX_ASSERT(returnedContext == NULL);
// Install testProgressCallBackFn with cbInfo as its context and verify that
// the getter hands back exactly that function and context.
4958 progressCallBackContext cbInfo = {this, 0, 0, 0};
4959 const void *returnedContext;
4960 URegexFindProgressCallback *returnedFn;
4961 UErrorCode status = U_ZERO_ERROR;
4962 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4964 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4966 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4968 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4969 REGEX_ASSERT(returnedContext == &cbInfo);
4971 // A find that matches on the initial position does NOT invoke the callback.
4972 status = U_ZERO_ERROR;
4974 UnicodeString s = "aaxxx";
// NOTE(review): setTrace(TRUE) turns on matcher tracing; confirm that this
// call is only compiled in under a debugging-only preprocessor guard.
4977 matcher.setTrace(TRUE);
4979 REGEX_ASSERT(matcher.find(0, status));
4981 REGEX_ASSERT(cbInfo.numCalls == 0);
4983 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4984 // but not so many times that we interrupt the operation.
4985 status = U_ZERO_ERROR;
4986 s = "aaaaaaaaaaaaaaaaaaab";
4987 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4989 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4991 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4993 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4994 status = U_ZERO_ERROR;
4995 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4996 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
// The stopped find() must report no match, set U_REGEX_STOPPED_BY_CALLER, and
// have invoked the callback exactly as many times as the limit allows.
4998 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4999 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5000 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
5002 // Now a match that will succeed, but after an interruption
5003 status = U_ZERO_ERROR;
5004 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
5005 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
5007 REGEX_ASSERT(matcher.find(0, status)==FALSE);
5008 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5009 // Now retry the match from where left off
// Only maxCalls is raised here (no reset()), so lastIndex still holds the
// index most recently reported to the callback.
5010 cbInfo.maxCalls = 100; // No callback limit
5011 status = U_ZERO_ERROR;
5012 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5020 //---------------------------------------------------------------------------
5022 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5023 // UTexts. The pure-C implementation of UText
5024 // has no mutable backing stores, but we can
5025 // use UnicodeString here to test the functionality.
5027 //---------------------------------------------------------------------------
5028 void RegexTest::PreAllocatedUTextCAPI () {
// Checks the uregex_* C functions that write into a caller-supplied UText:
// uregex_getUText, uregex_groupUText, uregex_replaceFirstUText and
// uregex_replaceAllUText. In every case the returned pointer is expected to
// be the caller's own destination UText (&bufferText).
5029 UErrorCode status = U_ZERO_ERROR;
5030 URegularExpression *re;
5031 UText patternText = UTEXT_INITIALIZER;
5032 UnicodeString buffer;
5033 UText bufferText = UTEXT_INITIALIZER;
// bufferText is opened over the UnicodeString `buffer`, giving the C API a
// destination UText to write its results into.
5035 utext_openUnicodeString(&bufferText, &buffer, &status);
5038 * getText() and getUText()
5041 UText text1 = UTEXT_INITIALIZER;
5042 UText text2 = UTEXT_INITIALIZER;
5043 UChar text2Chars[20];
5046 status = U_ZERO_ERROR;
5047 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
// NOTE(review): text2 is opened twice in a row - here over a UTF-8 copy and
// two lines below over text2Chars. Confirm that the first open is needed.
5048 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
// NOTE(review): sizeof(text2) measures the UText struct, not the
// text2Chars[20] array being filled; the copy limit probably should be
// derived from text2Chars. The 7-character literal fits either way - confirm.
5049 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5050 utext_openUChars(&text2, text2Chars, -1, &status);
5052 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5053 re = uregex_openUText(&patternText, 0, NULL, &status);
5055 /* First set a UText */
5056 uregex_setUText(re, &text1, &status);
5057 resultText = uregex_getUText(re, &bufferText, &status);
5059 REGEX_ASSERT(resultText == &bufferText);
// Rewind both texts so that the comparison starts at the beginning of each.
5060 utext_setNativeIndex(resultText, 0);
5061 utext_setNativeIndex(&text1, 0);
5062 REGEX_ASSERT(testUTextEqual(resultText, &text1));
// Fetching the text a second time must give the same result.
5064 resultText = uregex_getUText(re, &bufferText, &status);
5066 REGEX_ASSERT(resultText == &bufferText);
5067 utext_setNativeIndex(resultText, 0);
5068 utext_setNativeIndex(&text1, 0);
5069 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5071 /* Then set a UChar * */
5072 uregex_setText(re, text2Chars, 7, &status);
5073 resultText = uregex_getUText(re, &bufferText, &status);
5075 REGEX_ASSERT(resultText == &bufferText);
5076 utext_setNativeIndex(resultText, 0);
5077 utext_setNativeIndex(&text2, 0);
5078 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5081 utext_close(&text1);
5082 utext_close(&text2);
5094 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5095 // 012345678901234567890123456789012345678901234567
5098 status = U_ZERO_ERROR;
5099 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5102 uregex_setText(re, text1, -1, &status);
5103 result = uregex_find(re, 0, &status);
5104 REGEX_ASSERT(result==TRUE);
5106 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5107 status = U_ZERO_ERROR;
5108 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5110 REGEX_ASSERT(actual == &bufferText);
// The assertions show the returned UText covering the whole 47-unit input,
// positioned at the start of the group, with the group's length in `length`.
5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5112 REGEX_ASSERT(length == 16);
5113 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5115 /* Capture group #1. Should succeed, matching " interior ". */
5116 status = U_ZERO_ERROR;
5117 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5119 REGEX_ASSERT(actual == &bufferText);
5120 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5121 REGEX_ASSERT(length == 10);
5122 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5124 /* Capture group out of range. Error. */
5125 status = U_ZERO_ERROR;
5126 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
// Even on failure the destination UText pointer is expected to be returned.
5127 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5128 REGEX_ASSERT(actual == &bufferText);
5139 UText replText = UTEXT_INITIALIZER;
5141 status = U_ZERO_ERROR;
5142 utext_openUnicodeString(&bufferText, &buffer, &status);
5144 status = U_ZERO_ERROR;
5145 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
// NOTE(review): UPRV_LENGTHOF presumably already yields an element count, so
// the extra /2 would limit the copy to half of text2's capacity. Harmless for
// this short literal but probably unintended - confirm.
5146 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5147 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5149 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5152 /* Normal case, with match */
5153 uregex_setText(re, text1, -1, &status);
// Empty the destination (replace its whole native range with nothing) so that
// output from an earlier step cannot leak into this result.
5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5157 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5159 REGEX_ASSERT(result == &bufferText);
5160 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5162 /* No match. Text should copy to output with no changes. */
5163 uregex_setText(re, text2, -1, &status);
5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5167 REGEX_ASSERT(result == &bufferText);
5168 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5170 /* Unicode escapes */
5171 uregex_setText(re, text1, -1, &status);
// The replacement text mixes an escaped backslash, two Unicode escapes, a
// group reference, an escaped dollar sign and an escaped ordinary letter; the
// expected output below shows how each one is rendered.
5172 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5173 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5174 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5176 REGEX_ASSERT(result == &bufferText);
5177 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5180 utext_close(&replText);
5190 UText replText = UTEXT_INITIALIZER;
5193 status = U_ZERO_ERROR;
// sizeof(...)/2 is used here as the UChar capacity of the arrays, i.e. it
// relies on UChar being two bytes wide.
5194 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5195 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5196 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5198 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5201 /* Normal case, with match */
5202 uregex_setText(re, text1, -1, &status);
5203 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5204 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5206 REGEX_ASSERT(result == &bufferText);
5207 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5209 /* No match. Text should copy to output with no changes. */
5210 uregex_setText(re, text2, -1, &status);
5211 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5212 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5214 REGEX_ASSERT(result == &bufferText);
5215 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5218 utext_close(&replText);
5223 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5224 * so we don't need to test it here.
// Release the function-wide UTexts opened at the top.
5227 utext_close(&bufferText);
5228 utext_close(&patternText);
5232 //--------------------------------------------------------------
5234 // NamedCapture Check basic named capture group functionality
5236 //--------------------------------------------------------------
5237 void RegexTest::NamedCapture() {
5238 UErrorCode status = U_ZERO_ERROR;
5239 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5240 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5242 int32_t group = pat->groupNumberFromName("five", -1, status);
5244 REGEX_ASSERT(5 == group);
5245 group = pat->groupNumberFromName("three", -1, status);
5247 REGEX_ASSERT(3 == group);
5249 status = U_ZERO_ERROR;
5250 group = pat->groupNumberFromName(UnicodeString("six"), status);
5252 REGEX_ASSERT(6 == group);
5254 status = U_ZERO_ERROR;
5255 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5256 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5258 status = U_ZERO_ERROR;
5260 // After copying a pattern, named capture should still work in the copy.
5261 RegexPattern *copiedPat = new RegexPattern(*pat);
5262 REGEX_ASSERT(*copiedPat == *pat);
5263 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5265 group = copiedPat->groupNumberFromName("five", -1, status);
5267 REGEX_ASSERT(5 == group);
5268 group = copiedPat->groupNumberFromName("three", -1, status);
5270 REGEX_ASSERT(3 == group);
5273 // ReplaceAll with named capture group.
5274 status = U_ZERO_ERROR;
5275 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5276 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5278 // m.pattern().dumpPattern();
5279 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5281 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5284 // ReplaceAll, allowed capture group numbers.
5285 text = UnicodeString("abcmxyz");
5286 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5289 status = U_ZERO_ERROR;
5290 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5292 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5294 status = U_ZERO_ERROR;
5295 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5297 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5299 status = U_ZERO_ERROR;
5300 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5302 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5304 status = U_ZERO_ERROR;
5305 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5307 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5309 status = U_ZERO_ERROR;
5310 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5312 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5314 status = U_ZERO_ERROR;
5315 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5316 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5318 status = U_ZERO_ERROR;
5319 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5320 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5321 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5323 status = U_ZERO_ERROR;
5324 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5325 REGEX_CHECK_STATUS; // that push group num out of range.
5326 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5328 status = U_ZERO_ERROR;
5329 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5331 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5333 status = U_ZERO_ERROR;
5334 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5336 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5338 status = U_ZERO_ERROR;
5339 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5340 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5342 status = U_ZERO_ERROR;
5343 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5344 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5346 status = U_ZERO_ERROR;
5347 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5348 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5350 status = U_ZERO_ERROR;
5351 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5352 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5356 // Repeat the above replaceAll() tests using the plain C API, which
5357 // has a separate implementation internally.
5358 // TODO: factor out the test data.
5360 status = U_ZERO_ERROR;
5361 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5363 text = UnicodeString("abcmxyz");
5364 uregex_setText(re, text.getBuffer(), text.length(), &status);
5367 UChar resultBuf[100];
5368 int32_t resultLength;
5371 status = U_ZERO_ERROR;
5372 repl = UnicodeString("<$0>");
5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5375 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5377 status = U_ZERO_ERROR;
5378 repl = UnicodeString("<$1>");
5379 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5381 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5383 status = U_ZERO_ERROR;
5384 repl = UnicodeString("<${one}>");
5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5387 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5389 status = U_ZERO_ERROR;
5390 repl = UnicodeString("<$2>");
5391 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5393 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5395 status = U_ZERO_ERROR;
5396 repl = UnicodeString("<$3>");
5397 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5399 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5401 status = U_ZERO_ERROR;
5402 repl = UnicodeString("<$4>");
5403 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5404 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5406 status = U_ZERO_ERROR;
5407 repl = UnicodeString("<$04>");
5408 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5410 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5412 status = U_ZERO_ERROR;
5413 repl = UnicodeString("<$000016>");
5414 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5416 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5418 status = U_ZERO_ERROR;
5419 repl = UnicodeString("<$3$2$1${one}>");
5420 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5422 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5424 status = U_ZERO_ERROR;
5425 repl = UnicodeString("$3$2$1${one}");
5426 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5428 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5430 status = U_ZERO_ERROR;
5431 repl = UnicodeString("<${noSuchName}>");
5432 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5433 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5435 status = U_ZERO_ERROR;
5436 repl = UnicodeString("<${invalid-name}>");
5437 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5438 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5440 status = U_ZERO_ERROR;
5441 repl = UnicodeString("<${one");
5442 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5443 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5445 status = U_ZERO_ERROR;
5446 repl = UnicodeString("$not a capture group");
5447 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5448 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5453 //--------------------------------------------------------------
5455 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5456 // The point is not so much what the exact limit is,
5457 // but that a largish number doesn't hit bad non-linear performance,
5458 // and that exceeding the limit fails cleanly.
//
//   Phase 1: append "(?<nn1>)", "(?<nn2>)", ... for every nn in [1, goodLimit),
//            compile the pattern, then look up each name "nnN" and assert
//            that it maps to group number N.
//   Phase 2: append groups for every nn in [1, failLimit), compile again and
//            assert that the status is U_REGEX_PATTERN_TOO_BIG.
//
//   NOTE(review): the declarations of nnbuf and nn, the condition guarding
//   the "Skipping test" message, and any cleanup between the two phases are
//   not visible in this excerpt.
5460 //--------------------------------------------------------------
5461 void RegexTest::NamedCaptureLimits() {
// NOTE(review): "exhuastive" in the message below is a typo for "exhaustive".
5463 logln("Skipping test. Runs in exhuastive mode only.");
5466 const int32_t goodLimit = 1000000; // Pattern with this many groups builds successfully.
5467 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5469 UnicodeString pattern;
// Phase 1: build the pattern text one empty named group at a time.
5472 for (nn=1; nn<goodLimit; nn++) {
5473 sprintf(nnbuf, "(?<nn%d>)", nn);
5474 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5476 UErrorCode status = U_ZERO_ERROR;
5477 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
// Every group name must resolve to the number it was created with.
5479 for (nn=1; nn<goodLimit; nn++) {
5480 sprintf(nnbuf, "nn%d", nn);
5481 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5482 REGEX_ASSERT(nn == groupNum);
5483 if (nn != groupNum) {
// Phase 2: a pattern with failLimit-1 more groups appended must be rejected.
5490 for (nn=1; nn<failLimit; nn++) {
5491 sprintf(nnbuf, "(?<nn%d>)", nn);
5492 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5494 status = U_ZERO_ERROR;
5495 pat = RegexPattern::compile(pattern, 0, status);
5496 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5501 //--------------------------------------------------------------
5503 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
//
//   Compiles pattern1 and then pattern2, creates a matcher for each over the
//   same input string s, and asserts that find() succeeds with the match
//   starting at index 0 (s begins with "#ff").
//
//   NOTE(review): the declaration of pe (the parse-error argument passed to
//   compile()) and any cleanup of REPattern / REMatcher are not visible in
//   this excerpt.
5505 //---------------------------------------------------------------
5506 void RegexTest::Bug7651() {
5507 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5508 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5509 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5510 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5511 UnicodeString s("#ff @abcd This is test");
5512 RegexPattern *REPattern = NULL;
5513 RegexMatcher *REMatcher = NULL;
5514 UErrorCode status = U_ZERO_ERROR;
// First pass: pattern1.
5517 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5519 REMatcher = REPattern->matcher(s, status);
5521 REGEX_ASSERT(REMatcher->find());
5522 REGEX_ASSERT(REMatcher->start(status) == 0);
5525 status = U_ZERO_ERROR;
// Second pass: pattern2, the one described above as exceeding the default stack depth.
5527 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5529 REMatcher = REPattern->matcher(s, status);
5531 REGEX_ASSERT(REMatcher->find());
5532 REGEX_ASSERT(REMatcher->start(status) == 0);
5535 status = U_ZERO_ERROR;
// Regression test for bug 7740: calling RegexMatcher::group() while the
// incoming status already holds a failure code used to segfault.
// The test matches "(a)" at the start of "abcdef", presets status to
// U_ILLEGAL_ARGUMENT_ERROR, calls group(1, status), and asserts that the
// status is unchanged and the returned string is empty.
5538 void RegexTest::Bug7740() {
5539 UErrorCode status = U_ZERO_ERROR;
5540 UnicodeString pattern = "(a)";
5541 UnicodeString text = "abcdef";
5542 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5544 REGEX_ASSERT(m->lookingAt(status));
// Deliberately enter group() with a pre-existing error.
5546 status = U_ILLEGAL_ARGUMENT_ERROR;
5547 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5548 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5549 REGEX_ASSERT(s == "");
5553 // Bug 8479: was crashing with a bogus UnicodeString as input.
//
// Creates a matcher for "\Aboo\z" (DOTALL | CASE_INSENSITIVE), resets it to
// the input str, and asserts that matches() reports U_ILLEGAL_ARGUMENT_ERROR.
// NOTE(review): the declaration and setup of str are not visible in this
// excerpt; per the bug description it is presumably a bogus UnicodeString.
5555 void RegexTest::Bug8479() {
5556 UErrorCode status = U_ZERO_ERROR;
5558 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
// Only exercise the matcher if it was constructed successfully.
5560 if (U_SUCCESS(status))
5564 pMatcher->reset(str);
5565 status = U_ZERO_ERROR;
5566 pMatcher->matches(status);
5567 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
// Regression test for bug 7029: RegexMatcher::split() with the pattern ".".
// The delimiter pattern is an unescaped '.', so each of the seven characters
// of "abc.def" matches it as a delimiter; the test asserts that split()
// reports 8 fields when given a destination array of capacity 10.
5574 void RegexTest::Bug7029() {
5575 UErrorCode status = U_ZERO_ERROR;
5577 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5578 UnicodeString text = "abc.def";
5579 UnicodeString splits[10];
5581 int32_t numFields = pMatcher->split(text, splits, 10, status);
5583 REGEX_ASSERT(numFields == 8);
5588 // This test is checking for the existence of any supplementary characters that case-fold
5589 // to a BMP character.
5591 // At the time of this writing there are none. If any should appear in a subsequent release
5592 // of Unicode, the code in regular expressions compilation that determines the longest
5593 // possible match for a literal string will need to be enhanced.
5595 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5596 // for details on what to do in case of a failure of this test.
//
// NOTE(review): the declarations of index and c, and the loop's exit
// condition, are not visible in this excerpt.
5598 void RegexTest::Bug9283() {
5599 #if !UCONFIG_NO_NORMALIZATION
5600 UErrorCode status = U_ZERO_ERROR;
// Characters with the CWCF (Changes_When_Casefolded) property, restricted to
// the supplementary range U+10000..U+10FFFF.
5601 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5605 for (index=0; ; index++) {
5606 c = supplementalsWithCaseFolding.charAt(index);
// A folded length of at least 2 means the folding is not a single BMP code unit.
5610 UnicodeString cf = UnicodeString(c).foldCase();
5611 REGEX_ASSERT(cf.length() >= 2);
5613 #endif /* #if !UCONFIG_NO_NORMALIZATION */
// Sanity check on the invariant-character buffer bookkeeping: when inv_next
// has reached or passed INV_BUFSIZ, reports an error asking for the
// INV_BUFSIZ #define to be increased. The logln() call reports the current
// size and usage.
// NOTE(review): the line separating the two branches is not visible in this
// excerpt; the logln() is presumably the else branch.
5617 void RegexTest::CheckInvBufSize() {
5618 if(inv_next>=INV_BUFSIZ) {
5619 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5620 __FILE__, INV_BUFSIZ, inv_next);
5622 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
// Regression test for bug 10459: uregex_group() called before any match
// operation on a URegularExpression that was opened from a UText.
// The test opens the pattern "(txt)" and the text "txt" as UTexts, attaches
// the text, calls uregex_group() for group 0 without matching first, and
// asserts U_REGEX_INVALID_STATE with a returned length of 0.
// NOTE(review): the declaration of buf is not visible in this excerpt.
5627 void RegexTest::Bug10459() {
5628 UErrorCode status = U_ZERO_ERROR;
5629 UnicodeString patternString("(txt)");
5630 UnicodeString txtString("txt");
5632 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5634 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5637 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5640 uregex_setUText(icu_re, utext_txt, &status);
5643 // The bug was that calling uregex_group() before doing a matching operation
5644 // was causing a segfault. Only for Regular Expressions created from UText.
5645 // It should set an U_REGEX_INVALID_STATE.
5648 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5649 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5650 REGEX_ASSERT(len == 0);
// Release the regular expression and both UTexts opened above.
5652 uregex_close(icu_re);
5653 utext_close(utext_pat);
5654 utext_close(utext_txt);
// Cross-checks RegexCompile::findCaseInsensitiveStarters() against the
// current Unicode data. For every code point cp with the Case_Sensitive
// property, the case-insensitive closure of {cp} is computed; for each
// string in that closure, the starter set of the string's first character
// must contain cp.
// NOTE(review): the statements inside the two "if (!...)" guards are not
// visible in this excerpt; they presumably skip to the next iteration.
5657 void RegexTest::TestCaseInsensitiveStarters() {
5658 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5659 // become stale because of new Unicode characters.
5660 // If it is stale, rerun the generation tool
5661 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5662 // and replace the embedded data in i18n/regexcmp.cpp
5664 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5665 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
// Everything that is case-insensitively equivalent to cp, including strings.
5668 UnicodeSet s(cp, cp);
5669 s.closeOver(USET_CASE_INSENSITIVE);
5670 UnicodeSetIterator setIter(s);
5671 while (setIter.next()) {
5672 if (!setIter.isString()) {
5675 const UnicodeString &str = setIter.getString();
5676 UChar32 firstChar = str.char32At(0);
5677 UnicodeSet starters;
5678 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
// NOTE(review): the starters were computed for firstChar and the member being
// checked is cp, but the message prints them the other way round - confirm
// the intended argument order.
5679 if (!starters.contains(cp)) {
5680 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
// Regression test for bug 11049. Runs TestCase11049() four times: two
// patterns ("A|B|C" and "C"), each against a text that ends with an escaped
// surrogate pair (no match expected) and a text that ends with "C" (match
// expected). __LINE__ is passed so failures identify the calling line.
5688 void RegexTest::TestBug11049() {
5689 // Original bug report: pattern with match start consisting of one of several individual characters,
5690 // and the text being matched ending with a supplementary character. find() would read past the
5691 // end of the input text when searching for potential match starting points.
5693 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5694 // detect the bad read.
5696 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5697 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5699 // Test again with a pattern starting with a single character,
5700 // which takes a different code path than starting with an OR expression,
5701 // but with similar logic.
5702 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5703 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5706 // Run a single test case from TestBug11049(). Internal function.
//
//   pattern      Regular expression source; unescape()d before compiling.
//   data         Input text; unescape()d before use.
//   expectMatch  The result that find() is expected to return.
//   lineNumber   Caller's line number, reported in failure messages.
//
// The text is first presented as UTF-16 in a heap buffer sized to exactly
// dataString.length() code units, then again as UTF-8.
// NOTE(review): the calls that attach ut to the matcher, and the closing of
// ut, are not visible in this excerpt.
5707 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5708 UErrorCode status = U_ZERO_ERROR;
5709 UnicodeString patternString = UnicodeString(pattern).unescape();
5710 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5712 UnicodeString dataString = UnicodeString(data).unescape();
// Exact-size allocation with no terminator or slack, so a read past the end
// of the text is a read past the end of the allocation.
5713 UChar *exactBuffer = new UChar[dataString.length()];
5714 dataString.extract(exactBuffer, dataString.length(), status);
5715 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5717 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5720 UBool result = matcher->find();
5721 if (result != expectMatch) {
5722 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5723 __FILE__, lineNumber, expectMatch, result, pattern, data);
5726 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5727 // off-by-one on find() with match at the last code point.
5728 // The original char * data (invariant charset) is at least as long as the equivalent UTF-8
5729 // because string.unescape() will only shrink it, so strlen(data)+1 bytes are relied on to suffice.
5730 char * utf8Buffer = new char[uprv_strlen(data)+1];
5731 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
// Reuse the existing UText object for the UTF-8 text.
5733 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5736 result = matcher->find();
5737 if (result != expectMatch) {
5738 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5739 __FILE__, lineNumber, expectMatch, result, pattern, data);
5741 delete [] utf8Buffer;
5744 delete [] exactBuffer;
// Regression test for bug 11371: oversized patterns must fail to compile with
// U_REGEX_PATTERN_TOO_BIG. Three patterns are tried:
//   1. "()" repeated 8,000,000 times;
//   2. "(" + "A++" repeated 20,000,000 times + "){0}B++";
//   3. literal text appended until the pattern is at least 0x00ffffff code
//      units long, followed by "X? trailing string".
// NOTE(review): the condition guarding the "Skipping test" message, and any
// reset of patternString before the third case, are not visible in this
// excerpt.
5748 void RegexTest::TestBug11371() {
// NOTE(review): "exhuastive" in the message below is a typo for "exhaustive".
5750 logln("Skipping test. Runs in exhuastive mode only.");
5753 UErrorCode status = U_ZERO_ERROR;
5754 UnicodeString patternString;
// Case 1: a very large number of empty capture groups.
5756 for (int i=0; i<8000000; i++) {
5757 patternString.append(UnicodeString("()"));
5759 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5760 if (status != U_REGEX_PATTERN_TOO_BIG) {
5761 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762 __FILE__, __LINE__, u_errorName(status));
// Case 2: one group holding a very long run of possessive quantifiers.
5765 status = U_ZERO_ERROR;
5766 patternString = "(";
5767 for (int i=0; i<20000000; i++) {
5768 patternString.append(UnicodeString("A++"));
5770 patternString.append(UnicodeString("){0}B++"));
5771 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5772 if (status != U_REGEX_PATTERN_TOO_BIG) {
5773 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5774 __FILE__, __LINE__, u_errorName(status));
5777 // Pattern with too much string data, such that string indexes overflow operand data field size
5778 // in compiled instruction.
5779 status = U_ZERO_ERROR;
5781 while (patternString.length() < 0x00ffffff) {
5782 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5784 patternString.append(UnicodeString("X? trailing string"));
5785 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5786 if (status != U_REGEX_PATTERN_TOO_BIG) {
5787 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5788 __FILE__, __LINE__, u_errorName(status));
// Regression test for bug 11480: fetching a capture group that did not
// participate in the match. The pattern "(A)|(B)" is matched against "A", so
// group 1 matches and group 2 does not. Checked through both the plain C API
// (uregex_group) and the C++ UText API (RegexMatcher::group).
// NOTE(review): the call that attaches ut to matcher, and the cleanup of re,
// ut and group, are not visible in this excerpt.
5792 void RegexTest::TestBug11480() {
5793 // C API, get capture group of a group that does not participate in the match.
5794 // (Returns a zero length string, with nul termination,
5795 // indistinguishable from a group with a zero length match.)
5797 UErrorCode status = U_ZERO_ERROR;
5798 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5800 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5801 uregex_setText(re, text.getBuffer(), text.length(), &status);
5803 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
// The first four elements are preset to 13. The destination passed below is
// buf+1, so buf[0] guards the position before it and buf[2] the position
// after the expected NUL.
5804 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5805 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5806 REGEX_ASSERT(length == 0);
5807 REGEX_ASSERT(buf[0] == 13);
5808 REGEX_ASSERT(buf[1] == 0);
5809 REGEX_ASSERT(buf[2] == 13);
5812 // UText C++ API, length of match is 0 for non-participating matches.
5813 UText ut = UTEXT_INITIALIZER;
5814 utext_openUnicodeString(&ut, &text, &status);
5815 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5818 REGEX_ASSERT(matcher.lookingAt(0, status));
5820 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
// groupLen starts at a sentinel value so the assert below shows that group() set it.
5821 int64_t groupLen = -666;
5822 UText group = UTEXT_INITIALIZER;
5823 matcher.group(1, &group, groupLen, status);
5825 REGEX_ASSERT(groupLen == 1);
5826 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5828 // Capture group 2, the (B), does not participate in the match.
5829 matcher.group(2, &group, groupLen, status);
5831 REGEX_ASSERT(groupLen == 0);
5832 REGEX_ASSERT(matcher.start(2, status) == -1);
5837 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */