source/test/intltest/regextst.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 2002-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8
   9 //
  10 //   regextst.cpp
  11 //
  12 //      ICU Regular Expressions test, part of intltest.
  13 //
  14
  15 /*
  16      NOTE!!
  17
  18      PLEASE be careful about ASCII assumptions in this test.
  19      This test is one of the worst repeat offenders.
  20      If you have questions, contact someone on the ICU PMC
  21      who has access to an EBCDIC system.
  22
  23  */
  24
  25 #include "intltest.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31
  32 #include "unicode/localpointer.h"
  33 #include "unicode/regex.h"
  34 #include "unicode/uchar.h"
  35 #include "unicode/ucnv.h"
  36 #include "unicode/uniset.h"
  37 #include "unicode/uregex.h"
  38 #include "unicode/usetiter.h"
  39 #include "unicode/ustring.h"
  40 #include "unicode/utext.h"
  41
  42 #include "regextst.h"
  43 #include "regexcmp.h"
  44 #include "uvector.h"
  45 #include "util.h"
  46 #include "cmemory.h"
  47 #include "cstring.h"
  48 #include "uinvchar.h"
  49
  50 #define SUPPORT_MUTATING_INPUT_STRING   0
  51
  52 //---------------------------------------------------------------------------
  53 //
  54 //  Test class boilerplate
  55 //
  56 //---------------------------------------------------------------------------
  57 RegexTest::RegexTest()
  58 {
  59 }
  60
  61
  62 RegexTest::~RegexTest()
  63 {
  64 }
  65
  66
  67
  68 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  69 {
  70     if (exec) logln("TestSuite RegexTest: ");
  71     switch (index) {
  72
  73         case 0: name = "Basic";
  74             if (exec) Basic();
  75             break;
  76         case 1: name = "API_Match";
  77             if (exec) API_Match();
  78             break;
  79         case 2: name = "API_Replace";
  80             if (exec) API_Replace();
  81             break;
  82         case 3: name = "API_Pattern";
  83             if (exec) API_Pattern();
  84             break;
  85         case 4:
  86 #if !UCONFIG_NO_FILE_IO
  87             name = "Extended";
  88             if (exec) Extended();
  89 #else
  90             name = "skip";
  91 #endif
  92             break;
  93         case 5: name = "Errors";
  94             if (exec) Errors();
  95             break;
  96         case 6: name = "PerlTests";
  97             if (exec) PerlTests();
  98             break;
  99         case 7: name = "Callbacks";
 100             if (exec) Callbacks();
 101             break;
 102         case 8: name = "FindProgressCallbacks";
 103             if (exec) FindProgressCallbacks();
 104             break;
 105         case 9: name = "Bug 6149";
 106              if (exec) Bug6149();
 107              break;
 108         case 10: name = "UTextBasic";
 109           if (exec) UTextBasic();
 110           break;
 111         case 11: name = "API_Match_UTF8";
 112           if (exec) API_Match_UTF8();
 113           break;
 114         case 12: name = "API_Replace_UTF8";
 115           if (exec) API_Replace_UTF8();
 116           break;
 117         case 13: name = "API_Pattern_UTF8";
 118           if (exec) API_Pattern_UTF8();
 119           break;
 120         case 14: name = "PerlTestsUTF8";
 121           if (exec) PerlTestsUTF8();
 122           break;
 123         case 15: name = "PreAllocatedUTextCAPI";
 124           if (exec) PreAllocatedUTextCAPI();
 125           break;
 126         case 16: name = "Bug 7651";
 127              if (exec) Bug7651();
 128              break;
 129         case 17: name = "Bug 7740";
 130             if (exec) Bug7740();
 131             break;
 132         case 18: name = "Bug 8479";
 133             if (exec) Bug8479();
 134             break;
 135         case 19: name = "Bug 7029";
 136             if (exec) Bug7029();
 137             break;
 138         case 20: name = "CheckInvBufSize";
 139             if (exec) CheckInvBufSize();
 140             break;
 141         case 21: name = "Bug 9283";
 142             if (exec) Bug9283();
 143             break;
 144         case 22: name = "Bug10459";
 145             if (exec) Bug10459();
 146             break;
 147         case 23: name = "TestCaseInsensitiveStarters";
 148             if (exec) TestCaseInsensitiveStarters();
 149             break;
 150         case 24: name = "TestBug11049";
 151             if (exec) TestBug11049();
 152             break;
 153         case 25: name = "TestBug11371";
 154             if (exec) TestBug11371();
 155             break;
 156         case 26: name = "TestBug11480";
 157             if (exec) TestBug11480();
 158             break;
 159         case 27: name = "NamedCapture";
 160             if (exec) NamedCapture();
 161             break;
 162         case 28: name = "NamedCaptureLimits";
 163             if (exec) NamedCaptureLimits();
 164             break;
 165         default: name = "";
 166             break; //needed to end loop
 167     }
 168 }
 169
 170
 171
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 *
 * @param ut      UText to open; passed through to utext_openUTF8
 * @param inv     invariant-character text in the compilation codepage
 * @param length  number of bytes in inv, or -1 to use strlen(inv)
 * @param status  ICU error code; on non-ASCII platforms it is set to
 *                U_MEMORY_ALLOCATION_ERROR when the conversion buffer is exhausted
 * @return        the value returned by utext_openUTF8, or NULL when the
 *                conversion buffer is exhausted
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 178
 179 //---------------------------------------------------------------------------
 180 //
 181 //   Error Checking / Reporting macros used in all of the tests.
 182 //
 183 //---------------------------------------------------------------------------
 184
 185 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 186   int64_t oldIndex = utext_getNativeIndex(text);
 187   utext_setNativeIndex(text, 0);
 188   char *bufPtr = buf;
 189   UChar32 c = utext_next32From(text, 0);
 190   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 191     if (0x000020<=c && c<0x00007e) {
 192       *bufPtr = c;
 193     } else {
 194 #if 0
 195       sprintf(bufPtr,"U+%04X", c);
 196       bufPtr+= strlen(bufPtr)-1;
 197 #else
 198       *bufPtr = '%';
 199 #endif
 200     }
 201     bufPtr++;
 202     c = UTEXT_NEXT32(text);
 203   }
 204   *bufPtr = 0;
 205 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 206   char *ebuf = (char*)malloc(bufLen);
 207   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 208   uprv_strncpy(buf, ebuf, bufLen);
 209   free((void*)ebuf);
 210 #endif
 211   utext_setNativeIndex(text, oldIndex);
 212 }
 213
 214
 215 static char ASSERT_BUF[1024];
 216
 217 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 218   if(message.length()==0) {
 219     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 220   } else {
 221     UnicodeString buf;
 222     IntlTest::prettify(message,buf);
 223     if(buf.length()==0) {
 224       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 225     } else {
 226       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 227       if(ASSERT_BUF[0]==0) {
 228         ASSERT_BUF[0]=0;
 229         for(int32_t i=0;i<buf.length();i++) {
 230           UChar ch = buf[i];
 231           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 232         }
 233       }
 234     }
 235   }
 236   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 237   return ASSERT_BUF;
 238 }
 239
// Logs a printable rendering (at most 199 characters) of a UText expression,
// tagged with the file, line and the expression's source text.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope variable `status` holds a failure, reports it with
// dataerrln() and returns from the enclosing (void) function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports an error when expr evaluates to FALSE.  Does not return; the
// enclosing test keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr with a fresh local `status` (which hides any outer variable of
// that name) and reports an error unless status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line number
// and does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports the caller-supplied line number and
// returns from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports an error when `actual` differs from `expected`.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
 263
 264
 265 static UBool testUTextEqual(UText *uta, UText *utb) {
 266     UChar32 ca = 0;
 267     UChar32 cb = 0;
 268     utext_setNativeIndex(uta, 0);
 269     utext_setNativeIndex(utb, 0);
 270     do {
 271         ca = utext_next32(uta);
 272         cb = utext_next32(utb);
 273         if (ca != cb) {
 274             break;
 275         }
 276     } while (ca != U_SENTINEL);
 277     return ca == cb;
 278 }
 279
 280
 281 /**
 282  * @param expected expected text in UTF-8 (not platform) codepage
 283  */
 284 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 285     UErrorCode status = U_ZERO_ERROR;
 286     UText expectedText = UTEXT_INITIALIZER;
 287     utext_openUTF8(&expectedText, expected, -1, &status);
 288     if(U_FAILURE(status)) {
 289       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 290       return;
 291     }
 292     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 293       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 294       return;
 295     }
 296     utext_setNativeIndex(actual, 0);
 297     if (!testUTextEqual(&expectedText, actual)) {
 298         char buf[201 /*21*/];
 299         char expectedBuf[201];
 300         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 301         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 302         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 303     }
 304     utext_close(&expectedText);
 305 }
 306 /**
 307  * @param expected invariant (platform local text) input
 308  */
 309
 310 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 311     UErrorCode status = U_ZERO_ERROR;
 312     UText expectedText = UTEXT_INITIALIZER;
 313     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 314     if(U_FAILURE(status)) {
 315       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 316       return;
 317     }
 318     utext_setNativeIndex(actual, 0);
 319     if (!testUTextEqual(&expectedText, actual)) {
 320         char buf[201 /*21*/];
 321         char expectedBuf[201];
 322         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 323         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 324         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 325     }
 326     utext_close(&expectedText);
 327 }
 328
/**
 * Asserts that the UText `actual` holds the text `expected`.
 * Assumes utf-8 input for `expected`.  Forwards to assertUText() together
 * with the file and line of the call site.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` holds the text `expected`.
 * Assumes Invariant input for `expected`.  Forwards to assertUTextInvariant()
 * together with the file and line of the call site.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 337
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small.  The buffer itself is only compiled in when the
 * platform charset family is not ASCII (e.g. on EBCDIC systems).
 */

#define INV_BUFSIZ 2048 /* increase this if too small */

// Running total of the bytes passed through regextst_openUTF8FromInvariant().
// On non-ASCII platforms it is also the offset in inv_buf at which the next
// converted string is stored.
static int64_t inv_next=0;

#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
static char inv_buf[INV_BUFSIZ];
#endif
 351
 352 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 353   if(length==-1) length=strlen(inv);
 354 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 355   inv_next+=length;
 356   return utext_openUTF8(ut, inv, length, status);
 357 #else
 358   if(inv_next+length+1>INV_BUFSIZ) {
 359     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 360             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 361     *status = U_MEMORY_ALLOCATION_ERROR;
 362     return NULL;
 363   }
 364
 365   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 366   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 367   inv_next+=length;
 368
 369 #if 0
 370   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 371 #endif
 372
 373   return utext_openUTF8(ut, (const char*)buf, length, status);
 374 #endif
 375 }
 376
 377
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
//                       for the LookingAt() and  Match() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice: once through doRegexLMTest() and
//          once through doRegexLMTestUTF8().  Both receive the line number
//          of the REGEX_TESTLM invocation for use in failure messages.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 393
 394 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 395     const UnicodeString pattern(pat, -1, US_INV);
 396     const UnicodeString inputText(text, -1, US_INV);
 397     UErrorCode          status  = U_ZERO_ERROR;
 398     UParseError         pe;
 399     RegexPattern        *REPattern = NULL;
 400     RegexMatcher        *REMatcher = NULL;
 401     UBool               retVal     = TRUE;
 402
 403     UnicodeString patString(pat, -1, US_INV);
 404     REPattern = RegexPattern::compile(patString, 0, pe, status);
 405     if (U_FAILURE(status)) {
 406         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 407             line, u_errorName(status));
 408         return FALSE;
 409     }
 410     if (line==376) { REPattern->dumpPattern();}
 411
 412     UnicodeString inputString(inputText);
 413     UnicodeString unEscapedInput = inputString.unescape();
 414     REMatcher = REPattern->matcher(unEscapedInput, status);
 415     if (U_FAILURE(status)) {
 416         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 417             line, u_errorName(status));
 418         return FALSE;
 419     }
 420
 421     UBool actualmatch;
 422     actualmatch = REMatcher->lookingAt(status);
 423     if (U_FAILURE(status)) {
 424         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 425             line, u_errorName(status));
 426         retVal =  FALSE;
 427     }
 428     if (actualmatch != looking) {
 429         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 430         retVal = FALSE;
 431     }
 432
 433     status = U_ZERO_ERROR;
 434     actualmatch = REMatcher->matches(status);
 435     if (U_FAILURE(status)) {
 436         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 437             line, u_errorName(status));
 438         retVal = FALSE;
 439     }
 440     if (actualmatch != match) {
 441         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 442         retVal = FALSE;
 443     }
 444
 445     if (retVal == FALSE) {
 446         REPattern->dumpPattern();
 447     }
 448
 449     delete REPattern;
 450     delete REMatcher;
 451     return retVal;
 452 }
 453
 454
 455 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 456     UText               pattern    = UTEXT_INITIALIZER;
 457     int32_t             inputUTF8Length;
 458     char                *textChars = NULL;
 459     UText               inputText  = UTEXT_INITIALIZER;
 460     UErrorCode          status     = U_ZERO_ERROR;
 461     UParseError         pe;
 462     RegexPattern        *REPattern = NULL;
 463     RegexMatcher        *REMatcher = NULL;
 464     UBool               retVal     = TRUE;
 465
 466     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 467     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 468     if (U_FAILURE(status)) {
 469         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 470             line, u_errorName(status));
 471         return FALSE;
 472     }
 473
 474     UnicodeString inputString(text, -1, US_INV);
 475     UnicodeString unEscapedInput = inputString.unescape();
 476     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 477     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 478
 479     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 480     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 481         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 482         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 483         return TRUE; // not a failure of the Regex engine
 484     }
 485     status = U_ZERO_ERROR; // buffer overflow
 486     textChars = new char[inputUTF8Length+1];
 487     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 488     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 489
 490     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 491     if (U_FAILURE(status)) {
 492         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 493             line, u_errorName(status));
 494         return FALSE;
 495     }
 496
 497     UBool actualmatch;
 498     actualmatch = REMatcher->lookingAt(status);
 499     if (U_FAILURE(status)) {
 500         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 501             line, u_errorName(status));
 502         retVal =  FALSE;
 503     }
 504     if (actualmatch != looking) {
 505         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 506         retVal = FALSE;
 507     }
 508
 509     status = U_ZERO_ERROR;
 510     actualmatch = REMatcher->matches(status);
 511     if (U_FAILURE(status)) {
 512         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 513             line, u_errorName(status));
 514         retVal = FALSE;
 515     }
 516     if (actualmatch != match) {
 517         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 518         retVal = FALSE;
 519     }
 520
 521     if (retVal == FALSE) {
 522         REPattern->dumpPattern();
 523     }
 524
 525     delete REPattern;
 526     delete REMatcher;
 527     utext_close(&inputText);
 528     utext_close(&pattern);
 529     delete[] textChars;
 530     return retVal;
 531 }
 532
 533
 534
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing tests
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//          Forwards to regex_err(), adding the line number of the
//          invocation.  Note that the expansion already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 545
 546 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 547                           UErrorCode expectedStatus, int32_t line) {
 548     UnicodeString       pattern(pat);
 549
 550     UErrorCode          status         = U_ZERO_ERROR;
 551     UParseError         pe;
 552     RegexPattern        *callerPattern = NULL;
 553
 554     //
 555     //  Compile the caller's pattern
 556     //
 557     UnicodeString patString(pat);
 558     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 559     if (status != expectedStatus) {
 560         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 561     } else {
 562         if (status != U_ZERO_ERROR) {
 563             if (pe.line != errLine || pe.offset != errCol) {
 564                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 565                     line, errLine, errCol, pe.line, pe.offset);
 566             }
 567         }
 568     }
 569
 570     delete callerPattern;
 571
 572     //
 573     //  Compile again, using a UTF-8-based UText
 574     //
 575     UText patternText = UTEXT_INITIALIZER;
 576     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 577     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 578     if (status != expectedStatus) {
 579         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 580     } else {
 581         if (status != U_ZERO_ERROR) {
 582             if (pe.line != errLine || pe.offset != errCol) {
 583                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 584                     line, errLine, errCol, pe.line, pe.offset);
 585             }
 586         }
 587     }
 588
 589     delete callerPattern;
 590     utext_close(&patternText);
 591 }
 592
 593
 594
//---------------------------------------------------------------------------
//
//      Basic      Check for basic functionality of regex pattern matching.
//                 Avoid the use of REGEX_FIND test macro, which has
//                 substantial dependencies on basic Regex functionality.
//
//                 Every case is a REGEX_TESTLM(pattern, input,
//                 expected lookingAt() result, expected matches() result).
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    // NOTE(review): the next two cases are identical.
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
 748
 749
 750 //---------------------------------------------------------------------------
 751 //
 752 //    UTextBasic   Check for quirks that are specific to the UText
 753 //                 implementation.
 754 //
 755 //---------------------------------------------------------------------------
 756 void RegexTest::UTextBasic() {
 757     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 758     UErrorCode status = U_ZERO_ERROR;
 759     UText pattern = UTEXT_INITIALIZER;
 760     utext_openUTF8(&pattern, str_abc, -1, &status);
 761     RegexMatcher matcher(&pattern, 0, status);
 762     REGEX_CHECK_STATUS;
 763
 764     UText input = UTEXT_INITIALIZER;
 765     utext_openUTF8(&input, str_abc, -1, &status);
 766     REGEX_CHECK_STATUS;
 767     matcher.reset(&input);
 768     REGEX_CHECK_STATUS;
 769     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 770
 771     matcher.reset(matcher.inputText());
 772     REGEX_CHECK_STATUS;
 773     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 774
 775     utext_close(&pattern);
 776     utext_close(&input);
 777 }
 778
 779
 780 //---------------------------------------------------------------------------
 781 //
 782 //      API_Match   Test that the API for class RegexMatcher
 783 //                  is present and nominally working, but excluding functions
 784 //                  implementing replace operations.
 785 //
 786 //---------------------------------------------------------------------------
 787 void RegexTest::API_Match() {
 788     UParseError         pe;
 789     UErrorCode          status=U_ZERO_ERROR;
 790     int32_t             flags = 0;
 791
 792     //
 793     // Debug - slide failing test cases early
 794     //
 795 #if 0
 796     {
 797     }
 798     return;
 799 #endif
 800
 801     //
 802     // Simple pattern compilation
 803     //
 804     {
 805         UnicodeString       re("abc");
 806         RegexPattern        *pat2;
 807         pat2 = RegexPattern::compile(re, flags, pe, status);
 808         REGEX_CHECK_STATUS;
 809
 810         UnicodeString inStr1 = "abcdef this is a test";
 811         UnicodeString instr2 = "not abc";
 812         UnicodeString empty  = "";
 813
 814
 815         //
 816         // Matcher creation and reset.
 817         //
 818         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 819         REGEX_CHECK_STATUS;
 820         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 821         REGEX_ASSERT(m1->input() == inStr1);
 822         m1->reset(instr2);
 823         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 824         REGEX_ASSERT(m1->input() == instr2);
 825         m1->reset(inStr1);
 826         REGEX_ASSERT(m1->input() == inStr1);
 827         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 828         m1->reset(empty);
 829         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 830         REGEX_ASSERT(m1->input() == empty);
 831         REGEX_ASSERT(&m1->pattern() == pat2);
 832
 833         //
 834         //  reset(pos, status)
 835         //
 836         m1->reset(inStr1);
 837         m1->reset(4, status);
 838         REGEX_CHECK_STATUS;
 839         REGEX_ASSERT(m1->input() == inStr1);
 840         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 841
 842         m1->reset(-1, status);
 843         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 844         status = U_ZERO_ERROR;
 845
 846         m1->reset(0, status);
 847         REGEX_CHECK_STATUS;
 848         status = U_ZERO_ERROR;
 849
 850         int32_t len = m1->input().length();
 851         m1->reset(len-1, status);
 852         REGEX_CHECK_STATUS;
 853         status = U_ZERO_ERROR;
 854
 855         m1->reset(len, status);
 856         REGEX_CHECK_STATUS;
 857         status = U_ZERO_ERROR;
 858
 859         m1->reset(len+1, status);
 860         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 861         status = U_ZERO_ERROR;
 862
 863         //
 864         // match(pos, status)
 865         //
 866         m1->reset(instr2);
 867         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 868         m1->reset();
 869         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 870         m1->reset();
 871         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 872         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 873         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 874         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 875
 876         // Match() at end of string should fail, but should not
 877         //  be an error.
 878         status = U_ZERO_ERROR;
 879         len = m1->input().length();
 880         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 881         REGEX_CHECK_STATUS;
 882
 883         // Match beyond end of string should fail with an error.
 884         status = U_ZERO_ERROR;
 885         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 886         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 887
 888         // Successful match at end of string.
 889         {
 890             status = U_ZERO_ERROR;
 891             RegexMatcher m("A?", 0, status);  // will match zero length string.
 892             REGEX_CHECK_STATUS;
 893             m.reset(inStr1);
 894             len = inStr1.length();
 895             REGEX_ASSERT(m.matches(len, status) == TRUE);
 896             REGEX_CHECK_STATUS;
 897             m.reset(empty);
 898             REGEX_ASSERT(m.matches(0, status) == TRUE);
 899             REGEX_CHECK_STATUS;
 900         }
 901
 902
 903         //
 904         // lookingAt(pos, status)
 905         //
 906         status = U_ZERO_ERROR;
 907         m1->reset(instr2);  // "not abc"
 908         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 909         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 910         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 911         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 912         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 913         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 914         status = U_ZERO_ERROR;
 915         len = m1->input().length();
 916         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 917         REGEX_CHECK_STATUS;
 918         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 919         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 920
 921         delete m1;
 922         delete pat2;
 923     }
 924
 925
 926     //
 927     // Capture Group.
 928     //     RegexMatcher::start();
 929     //     RegexMatcher::end();
 930     //     RegexMatcher::groupCount();
 931     //
 932     {
 933         int32_t             flags=0;
 934         UParseError         pe;
 935         UErrorCode          status=U_ZERO_ERROR;
 936
 937         UnicodeString       re("01(23(45)67)(.*)");
 938         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 939         REGEX_CHECK_STATUS;
 940         UnicodeString data = "0123456789";
 941
 942         RegexMatcher *matcher = pat->matcher(data, status);
 943         REGEX_CHECK_STATUS;
 944         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 945         static const int32_t matchStarts[] = {0,  2, 4, 8};
 946         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 947         int32_t i;
 948         for (i=0; i<4; i++) {
 949             int32_t actualStart = matcher->start(i, status);
 950             REGEX_CHECK_STATUS;
 951             if (actualStart != matchStarts[i]) {
 952                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 953                     __LINE__, i, matchStarts[i], actualStart);
 954             }
 955             int32_t actualEnd = matcher->end(i, status);
 956             REGEX_CHECK_STATUS;
 957             if (actualEnd != matchEnds[i]) {
 958                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 959                     __LINE__, i, matchEnds[i], actualEnd);
 960             }
 961         }
 962
 963         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 964         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 965
 966         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 967         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 968         matcher->reset();
 969         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 970
 971         matcher->lookingAt(status);
 972         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 973         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 974         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 975         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 976         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 977         REGEX_CHECK_STATUS;
 978         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 979         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 980         matcher->reset();
 981         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 982
 983         delete matcher;
 984         delete pat;
 985
 986     }
 987
 988     //
 989     //  find
 990     //
 991     {
 992         int32_t             flags=0;
 993         UParseError         pe;
 994         UErrorCode          status=U_ZERO_ERROR;
 995
 996         UnicodeString       re("abc");
 997         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 998         REGEX_CHECK_STATUS;
 999         UnicodeString data = ".abc..abc...abc..";
1000         //                    012345678901234567
1001
1002         RegexMatcher *matcher = pat->matcher(data, status);
1003         REGEX_CHECK_STATUS;
1004         REGEX_ASSERT(matcher->find());
1005         REGEX_ASSERT(matcher->start(status) == 1);
1006         REGEX_ASSERT(matcher->find());
1007         REGEX_ASSERT(matcher->start(status) == 6);
1008         REGEX_ASSERT(matcher->find());
1009         REGEX_ASSERT(matcher->start(status) == 12);
1010         REGEX_ASSERT(matcher->find() == FALSE);
1011         REGEX_ASSERT(matcher->find() == FALSE);
1012
1013         matcher->reset();
1014         REGEX_ASSERT(matcher->find());
1015         REGEX_ASSERT(matcher->start(status) == 1);
1016
1017         REGEX_ASSERT(matcher->find(0, status));
1018         REGEX_ASSERT(matcher->start(status) == 1);
1019         REGEX_ASSERT(matcher->find(1, status));
1020         REGEX_ASSERT(matcher->start(status) == 1);
1021         REGEX_ASSERT(matcher->find(2, status));
1022         REGEX_ASSERT(matcher->start(status) == 6);
1023         REGEX_ASSERT(matcher->find(12, status));
1024         REGEX_ASSERT(matcher->start(status) == 12);
1025         REGEX_ASSERT(matcher->find(13, status) == FALSE);
1026         REGEX_ASSERT(matcher->find(16, status) == FALSE);
1027         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1028         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1029
1030         status = U_ZERO_ERROR;
1031         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032         status = U_ZERO_ERROR;
1033         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1034
1035         REGEX_ASSERT(matcher->groupCount() == 0);
1036
1037         delete matcher;
1038         delete pat;
1039     }
1040
1041
1042     //
1043     //  find, with \G in pattern (true if at the end of a previous match).
1044     //
1045     {
1046         int32_t             flags=0;
1047         UParseError         pe;
1048         UErrorCode          status=U_ZERO_ERROR;
1049
1050         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1051         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1052         REGEX_CHECK_STATUS;
1053         UnicodeString data = ".abcabc.abc..";
1054         //                    012345678901234567
1055
1056         RegexMatcher *matcher = pat->matcher(data, status);
1057         REGEX_CHECK_STATUS;
1058         REGEX_ASSERT(matcher->find());
1059         REGEX_ASSERT(matcher->start(status) == 0);
1060         REGEX_ASSERT(matcher->start(1, status) == -1);
1061         REGEX_ASSERT(matcher->start(2, status) == 1);
1062
1063         REGEX_ASSERT(matcher->find());
1064         REGEX_ASSERT(matcher->start(status) == 4);
1065         REGEX_ASSERT(matcher->start(1, status) == 4);
1066         REGEX_ASSERT(matcher->start(2, status) == -1);
1067         REGEX_CHECK_STATUS;
1068
1069         delete matcher;
1070         delete pat;
1071     }
1072
1073     //
1074     //   find with zero length matches, match position should bump ahead
1075     //     to prevent loops.
1076     //
1077     {
1078         int32_t                 i;
1079         UErrorCode          status=U_ZERO_ERROR;
1080         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1081                                                       //   using an always-true look-ahead.
1082         REGEX_CHECK_STATUS;
1083         UnicodeString s("    ");
1084         m.reset(s);
1085         for (i=0; ; i++) {
1086             if (m.find() == FALSE) {
1087                 break;
1088             }
1089             REGEX_ASSERT(m.start(status) == i);
1090             REGEX_ASSERT(m.end(status) == i);
1091         }
1092         REGEX_ASSERT(i==5);
1093
1094         // Check that the bump goes over surrogate pairs OK
1095         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1096         s = s.unescape();
1097         m.reset(s);
1098         for (i=0; ; i+=2) {
1099             if (m.find() == FALSE) {
1100                 break;
1101             }
1102             REGEX_ASSERT(m.start(status) == i);
1103             REGEX_ASSERT(m.end(status) == i);
1104         }
1105         REGEX_ASSERT(i==10);
1106     }
1107     {
1108         // find() loop breaking test.
1109         //        with pattern of /.?/, should see a series of one char matches, then a single
1110         //        match of zero length at the end of the input string.
1111         int32_t                 i;
1112         UErrorCode          status=U_ZERO_ERROR;
1113         RegexMatcher        m(".?", 0, status);
1114         REGEX_CHECK_STATUS;
1115         UnicodeString s("    ");
1116         m.reset(s);
1117         for (i=0; ; i++) {
1118             if (m.find() == FALSE) {
1119                 break;
1120             }
1121             REGEX_ASSERT(m.start(status) == i);
1122             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1123         }
1124         REGEX_ASSERT(i==5);
1125     }
1126
1127
1128     //
1129     // Matchers with no input string behave as if they had an empty input string.
1130     //
1131
1132     {
1133         UErrorCode status = U_ZERO_ERROR;
1134         RegexMatcher  m(".?", 0, status);
1135         REGEX_CHECK_STATUS;
1136         REGEX_ASSERT(m.find());
1137         REGEX_ASSERT(m.start(status) == 0);
1138         REGEX_ASSERT(m.input() == "");
1139     }
1140     {
1141         UErrorCode status = U_ZERO_ERROR;
1142         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1143         RegexMatcher  *m = p->matcher(status);
1144         REGEX_CHECK_STATUS;
1145
1146         REGEX_ASSERT(m->find() == FALSE);
1147         REGEX_ASSERT(m->input() == "");
1148         delete m;
1149         delete p;
1150     }
1151
1152     //
1153     // Regions
1154     //
1155     {
1156         UErrorCode status = U_ZERO_ERROR;
1157         UnicodeString testString("This is test data");
1158         RegexMatcher m(".*", testString,  0, status);
1159         REGEX_CHECK_STATUS;
1160         REGEX_ASSERT(m.regionStart() == 0);
1161         REGEX_ASSERT(m.regionEnd() == testString.length());
1162         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1163         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1164
1165         m.region(2,4, status);
1166         REGEX_CHECK_STATUS;
1167         REGEX_ASSERT(m.matches(status));
1168         REGEX_ASSERT(m.start(status)==2);
1169         REGEX_ASSERT(m.end(status)==4);
1170         REGEX_CHECK_STATUS;
1171
1172         m.reset();
1173         REGEX_ASSERT(m.regionStart() == 0);
1174         REGEX_ASSERT(m.regionEnd() == testString.length());
1175
1176         UnicodeString shorterString("short");
1177         m.reset(shorterString);
1178         REGEX_ASSERT(m.regionStart() == 0);
1179         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1180
1181         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1182         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1183         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184         REGEX_ASSERT(&m == &m.reset());
1185         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1186
1187         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1188         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189         REGEX_ASSERT(&m == &m.reset());
1190         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1191
1192         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1193         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1194         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195         REGEX_ASSERT(&m == &m.reset());
1196         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1197
1198         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1199         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200         REGEX_ASSERT(&m == &m.reset());
1201         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1202
1203     }
1204
1205     //
1206     // hitEnd() and requireEnd()
1207     //
1208     {
1209         UErrorCode status = U_ZERO_ERROR;
1210         UnicodeString testString("aabb");
1211         RegexMatcher m1(".*", testString,  0, status);
1212         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1213         REGEX_ASSERT(m1.hitEnd() == TRUE);
1214         REGEX_ASSERT(m1.requireEnd() == FALSE);
1215         REGEX_CHECK_STATUS;
1216
1217         status = U_ZERO_ERROR;
1218         RegexMatcher m2("a*", testString, 0, status);
1219         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1220         REGEX_ASSERT(m2.hitEnd() == FALSE);
1221         REGEX_ASSERT(m2.requireEnd() == FALSE);
1222         REGEX_CHECK_STATUS;
1223
1224         status = U_ZERO_ERROR;
1225         RegexMatcher m3(".*$", testString, 0, status);
1226         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1227         REGEX_ASSERT(m3.hitEnd() == TRUE);
1228         REGEX_ASSERT(m3.requireEnd() == TRUE);
1229         REGEX_CHECK_STATUS;
1230     }
1231
1232
1233     //
1234     // Compilation error on reset with UChar *
1235     //   These were a hazard that people were stumbling over with runtime errors.
1236     //   Changed them to compiler errors by adding private methods that more closely
1237     //   matched the incorrect use of the functions.
1238     //
1239 #if 0
1240     {
1241         UErrorCode status = U_ZERO_ERROR;
1242         UChar ucharString[20];
1243         RegexMatcher m(".", 0, status);
1244         m.reset(ucharString);  // should not compile.
1245
1246         RegexPattern *p = RegexPattern::compile(".", 0, status);
1247         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1248
1249         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1250     }
1251 #endif
1252
1253     //
1254     //  Time Outs.
1255     //       Note:  These tests will need to be changed when the regexp engine is
1256     //              able to detect and cut short the exponential time behavior on
1257     //              this type of match.
1258     //
1259     {
1260         UErrorCode status = U_ZERO_ERROR;
1261         //    Enough 'a's in the string to cause the match to time out.
1262         //       (Each on additonal 'a' doubles the time)
1263         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1264         RegexMatcher matcher("(a+)+b", testString, 0, status);
1265         REGEX_CHECK_STATUS;
1266         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1267         matcher.setTimeLimit(100, status);
1268         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1269         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1270         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1271     }
1272     {
1273         UErrorCode status = U_ZERO_ERROR;
1274         //   Few enough 'a's to slip in under the time limit.
1275         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1276         RegexMatcher matcher("(a+)+b", testString, 0, status);
1277         REGEX_CHECK_STATUS;
1278         matcher.setTimeLimit(100, status);
1279         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1280         REGEX_CHECK_STATUS;
1281     }
1282
1283     //
1284     //  Stack Limits
1285     //
1286     {
1287         UErrorCode status = U_ZERO_ERROR;
1288         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1289
1290         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1291         //   of the '+', and makes the stack frames larger.
1292         RegexMatcher matcher("(A)+A$", testString, 0, status);
1293
1294         // With the default stack, this match should fail to run
1295         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1296         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1297
1298         // With unlimited stack, it should run
1299         status = U_ZERO_ERROR;
1300         matcher.setStackLimit(0, status);
1301         REGEX_CHECK_STATUS;
1302         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1303         REGEX_CHECK_STATUS;
1304         REGEX_ASSERT(matcher.getStackLimit() == 0);
1305
1306         // With a limited stack, it the match should fail
1307         status = U_ZERO_ERROR;
1308         matcher.setStackLimit(10000, status);
1309         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1310         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1311         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1312     }
1313
1314         // A pattern that doesn't save state should work with
1315         //   a minimal sized stack
1316     {
1317         UErrorCode status = U_ZERO_ERROR;
1318         UnicodeString testString = "abc";
1319         RegexMatcher matcher("abc", testString, 0, status);
1320         REGEX_CHECK_STATUS;
1321         matcher.setStackLimit(30, status);
1322         REGEX_CHECK_STATUS;
1323         REGEX_ASSERT(matcher.matches(status) == TRUE);
1324         REGEX_CHECK_STATUS;
1325         REGEX_ASSERT(matcher.getStackLimit() == 30);
1326
1327         // Negative stack sizes should fail
1328         status = U_ZERO_ERROR;
1329         matcher.setStackLimit(1000, status);
1330         REGEX_CHECK_STATUS;
1331         matcher.setStackLimit(-1, status);
1332         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1333         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1334     }
1335
1336
1337 }
1338
1339
1340
1341
1342
1343
1344 //---------------------------------------------------------------------------
1345 //
1346 //      API_Replace        API test for class RegexMatcher, testing the
1347 //                         Replace family of functions.
1348 //
1349 //---------------------------------------------------------------------------
1350 void RegexTest::API_Replace() {
1351     //
1352     //  Replace
1353     //
1354     int32_t             flags=0;
1355     UParseError         pe;
1356     UErrorCode          status=U_ZERO_ERROR;
1357
1358     UnicodeString       re("abc");
1359     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1360     REGEX_CHECK_STATUS;
1361     UnicodeString data = ".abc..abc...abc..";
1362     //                    012345678901234567
1363     RegexMatcher *matcher = pat->matcher(data, status);
1364
1365     //
1366     //  Plain vanilla matches.
1367     //
1368     UnicodeString  dest;
1369     dest = matcher->replaceFirst("yz", status);
1370     REGEX_CHECK_STATUS;
1371     REGEX_ASSERT(dest == ".yz..abc...abc..");
1372
1373     dest = matcher->replaceAll("yz", status);
1374     REGEX_CHECK_STATUS;
1375     REGEX_ASSERT(dest == ".yz..yz...yz..");
1376
1377     //
1378     //  Plain vanilla non-matches.
1379     //
1380     UnicodeString d2 = ".abx..abx...abx..";
1381     matcher->reset(d2);
1382     dest = matcher->replaceFirst("yz", status);
1383     REGEX_CHECK_STATUS;
1384     REGEX_ASSERT(dest == ".abx..abx...abx..");
1385
1386     dest = matcher->replaceAll("yz", status);
1387     REGEX_CHECK_STATUS;
1388     REGEX_ASSERT(dest == ".abx..abx...abx..");
1389
1390     //
1391     // Empty source string
1392     //
1393     UnicodeString d3 = "";
1394     matcher->reset(d3);
1395     dest = matcher->replaceFirst("yz", status);
1396     REGEX_CHECK_STATUS;
1397     REGEX_ASSERT(dest == "");
1398
1399     dest = matcher->replaceAll("yz", status);
1400     REGEX_CHECK_STATUS;
1401     REGEX_ASSERT(dest == "");
1402
1403     //
1404     // Empty substitution string
1405     //
1406     matcher->reset(data);              // ".abc..abc...abc.."
1407     dest = matcher->replaceFirst("", status);
1408     REGEX_CHECK_STATUS;
1409     REGEX_ASSERT(dest == "...abc...abc..");
1410
1411     dest = matcher->replaceAll("", status);
1412     REGEX_CHECK_STATUS;
1413     REGEX_ASSERT(dest == "........");
1414
1415     //
1416     // match whole string
1417     //
1418     UnicodeString d4 = "abc";
1419     matcher->reset(d4);
1420     dest = matcher->replaceFirst("xyz", status);
1421     REGEX_CHECK_STATUS;
1422     REGEX_ASSERT(dest == "xyz");
1423
1424     dest = matcher->replaceAll("xyz", status);
1425     REGEX_CHECK_STATUS;
1426     REGEX_ASSERT(dest == "xyz");
1427
1428     //
1429     // Capture Group, simple case
1430     //
1431     UnicodeString       re2("a(..)");
1432     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1433     REGEX_CHECK_STATUS;
1434     UnicodeString d5 = "abcdefg";
1435     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1436     REGEX_CHECK_STATUS;
1437     dest = matcher2->replaceFirst("$1$1", status);
1438     REGEX_CHECK_STATUS;
1439     REGEX_ASSERT(dest == "bcbcdefg");
1440
1441     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1442     REGEX_CHECK_STATUS;
1443     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1444
1445     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1446     REGEX_ASSERT(U_FAILURE(status));
1447     status = U_ZERO_ERROR;
1448
1449     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1450     replacement = replacement.unescape();
1451     dest = matcher2->replaceFirst(replacement, status);
1452     REGEX_CHECK_STATUS;
1453     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1454
1455     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1456
1457
1458     //
1459     // Replacement String with \u hex escapes
1460     //
1461     {
1462         UnicodeString  src = "abc 1 abc 2 abc 3";
1463         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1464         matcher->reset(src);
1465         UnicodeString  result = matcher->replaceAll(substitute, status);
1466         REGEX_CHECK_STATUS;
1467         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1468     }
1469     {
1470         UnicodeString  src = "abc !";
1471         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1472         matcher->reset(src);
1473         UnicodeString  result = matcher->replaceAll(substitute, status);
1474         REGEX_CHECK_STATUS;
1475         UnicodeString expected = UnicodeString("--");
1476         expected.append((UChar32)0x10000);
1477         expected.append("-- !");
1478         REGEX_ASSERT(result == expected);
1479     }
1480     // TODO:  need more through testing of capture substitutions.
1481
1482     // Bug 4057
1483     //
1484     {
1485         status = U_ZERO_ERROR;
1486         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1487         RegexMatcher m("ss(.*?)ee", 0, status);
1488         REGEX_CHECK_STATUS;
1489         UnicodeString result;
1490
1491         // Multiple finds do NOT bump up the previous appendReplacement postion.
1492         m.reset(s);
1493         m.find();
1494         m.find();
1495         m.appendReplacement(result, "ooh", status);
1496         REGEX_CHECK_STATUS;
1497         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1498
1499         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1500         status = U_ZERO_ERROR;
1501         result.truncate(0);
1502         m.reset(10, status);
1503         m.find();
1504         m.find();
1505         m.appendReplacement(result, "ooh", status);
1506         REGEX_CHECK_STATUS;
1507         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1508
1509         // find() at interior of string, appendReplacemnt still starts at beginning.
1510         status = U_ZERO_ERROR;
1511         result.truncate(0);
1512         m.reset();
1513         m.find(10, status);
1514         m.find();
1515         m.appendReplacement(result, "ooh", status);
1516         REGEX_CHECK_STATUS;
1517         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1518
1519         m.appendTail(result);
1520         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1521
1522     }
1523
1524     delete matcher2;
1525     delete pat2;
1526     delete matcher;
1527     delete pat;
1528 }
1529
1530
1531 //---------------------------------------------------------------------------
1532 //
1533 //      API_Pattern       Test that the API for class RegexPattern is
1534 //                        present and nominally working.
1535 //
1536 //---------------------------------------------------------------------------
1537 void RegexTest::API_Pattern() {
1538     RegexPattern        pata;    // Test default constructor to not crash.
1539     RegexPattern        patb;
1540
1541     REGEX_ASSERT(pata == patb);
1542     REGEX_ASSERT(pata == pata);
1543
1544     UnicodeString re1("abc[a-l][m-z]");
1545     UnicodeString re2("def");
1546     UErrorCode    status = U_ZERO_ERROR;
1547     UParseError   pe;
1548
1549     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1550     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1551     REGEX_CHECK_STATUS;
1552     REGEX_ASSERT(*pat1 == *pat1);
1553     REGEX_ASSERT(*pat1 != pata);
1554
1555     // Assign
1556     patb = *pat1;
1557     REGEX_ASSERT(patb == *pat1);
1558
1559     // Copy Construct
1560     RegexPattern patc(*pat1);
1561     REGEX_ASSERT(patc == *pat1);
1562     REGEX_ASSERT(patb == patc);
1563     REGEX_ASSERT(pat1 != pat2);
1564     patb = *pat2;
1565     REGEX_ASSERT(patb != patc);
1566     REGEX_ASSERT(patb == *pat2);
1567
1568     // Compile with no flags.
1569     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1570     REGEX_ASSERT(*pat1a == *pat1);
1571
1572     REGEX_ASSERT(pat1a->flags() == 0);
1573
1574     // Compile with different flags should be not equal
1575     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1576     REGEX_CHECK_STATUS;
1577
1578     REGEX_ASSERT(*pat1b != *pat1a);
1579     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1580     REGEX_ASSERT(pat1a->flags() == 0);
1581     delete pat1b;
1582
1583     // clone
1584     RegexPattern *pat1c = pat1->clone();
1585     REGEX_ASSERT(*pat1c == *pat1);
1586     REGEX_ASSERT(*pat1c != *pat2);
1587
1588     delete pat1c;
1589     delete pat1a;
1590     delete pat1;
1591     delete pat2;
1592
1593
1594     //
1595     //   Verify that a matcher created from a cloned pattern works.
1596     //     (Jitterbug 3423)
1597     //
1598     {
1599         UErrorCode     status     = U_ZERO_ERROR;
1600         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1601         RegexPattern  *pClone     = pSource->clone();
1602         delete         pSource;
1603         RegexMatcher  *mFromClone = pClone->matcher(status);
1604         REGEX_CHECK_STATUS;
1605         UnicodeString s = "Hello World";
1606         mFromClone->reset(s);
1607         REGEX_ASSERT(mFromClone->find() == TRUE);
1608         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1609         REGEX_ASSERT(mFromClone->find() == TRUE);
1610         REGEX_ASSERT(mFromClone->group(status) == "World");
1611         REGEX_ASSERT(mFromClone->find() == FALSE);
1612         delete mFromClone;
1613         delete pClone;
1614     }
1615
1616     //
1617     //   matches convenience API
1618     //
1619     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1620     REGEX_CHECK_STATUS;
1621     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1622     REGEX_CHECK_STATUS;
1623     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1624     REGEX_CHECK_STATUS;
1625     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1626     REGEX_CHECK_STATUS;
1627     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1628     REGEX_CHECK_STATUS;
1629     status = U_INDEX_OUTOFBOUNDS_ERROR;
1630     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1631     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1632
1633
1634     //
1635     // Split()
1636     //
1637     status = U_ZERO_ERROR;
1638     pat1 = RegexPattern::compile(" +",  pe, status);
1639     REGEX_CHECK_STATUS;
1640     UnicodeString  fields[10];
1641
1642     int32_t n;
1643     n = pat1->split("Now is the time", fields, 10, status);
1644     REGEX_CHECK_STATUS;
1645     REGEX_ASSERT(n==4);
1646     REGEX_ASSERT(fields[0]=="Now");
1647     REGEX_ASSERT(fields[1]=="is");
1648     REGEX_ASSERT(fields[2]=="the");
1649     REGEX_ASSERT(fields[3]=="time");
1650     REGEX_ASSERT(fields[4]=="");
1651
1652     n = pat1->split("Now is the time", fields, 2, status);
1653     REGEX_CHECK_STATUS;
1654     REGEX_ASSERT(n==2);
1655     REGEX_ASSERT(fields[0]=="Now");
1656     REGEX_ASSERT(fields[1]=="is the time");
1657     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1658
1659     fields[1] = "*";
1660     status = U_ZERO_ERROR;
1661     n = pat1->split("Now is the time", fields, 1, status);
1662     REGEX_CHECK_STATUS;
1663     REGEX_ASSERT(n==1);
1664     REGEX_ASSERT(fields[0]=="Now is the time");
1665     REGEX_ASSERT(fields[1]=="*");
1666     status = U_ZERO_ERROR;
1667
1668     n = pat1->split("    Now       is the time   ", fields, 10, status);
1669     REGEX_CHECK_STATUS;
1670     REGEX_ASSERT(n==6);
1671     REGEX_ASSERT(fields[0]=="");
1672     REGEX_ASSERT(fields[1]=="Now");
1673     REGEX_ASSERT(fields[2]=="is");
1674     REGEX_ASSERT(fields[3]=="the");
1675     REGEX_ASSERT(fields[4]=="time");
1676     REGEX_ASSERT(fields[5]=="");
1677
1678     n = pat1->split("     ", fields, 10, status);
1679     REGEX_CHECK_STATUS;
1680     REGEX_ASSERT(n==2);
1681     REGEX_ASSERT(fields[0]=="");
1682     REGEX_ASSERT(fields[1]=="");
1683
1684     fields[0] = "foo";
1685     n = pat1->split("", fields, 10, status);
1686     REGEX_CHECK_STATUS;
1687     REGEX_ASSERT(n==0);
1688     REGEX_ASSERT(fields[0]=="foo");
1689
1690     delete pat1;
1691
1692     //  split, with a pattern with (capture)
1693     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1694     REGEX_CHECK_STATUS;
1695
1696     status = U_ZERO_ERROR;
1697     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1698     REGEX_CHECK_STATUS;
1699     REGEX_ASSERT(n==7);
1700     REGEX_ASSERT(fields[0]=="");
1701     REGEX_ASSERT(fields[1]=="a");
1702     REGEX_ASSERT(fields[2]=="Now is ");
1703     REGEX_ASSERT(fields[3]=="b");
1704     REGEX_ASSERT(fields[4]=="the time");
1705     REGEX_ASSERT(fields[5]=="c");
1706     REGEX_ASSERT(fields[6]=="");
1707     REGEX_ASSERT(status==U_ZERO_ERROR);
1708
1709     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1710     REGEX_CHECK_STATUS;
1711     REGEX_ASSERT(n==7);
1712     REGEX_ASSERT(fields[0]=="  ");
1713     REGEX_ASSERT(fields[1]=="a");
1714     REGEX_ASSERT(fields[2]=="Now is ");
1715     REGEX_ASSERT(fields[3]=="b");
1716     REGEX_ASSERT(fields[4]=="the time");
1717     REGEX_ASSERT(fields[5]=="c");
1718     REGEX_ASSERT(fields[6]=="");
1719
1720     status = U_ZERO_ERROR;
1721     fields[6] = "foo";
1722     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1723     REGEX_CHECK_STATUS;
1724     REGEX_ASSERT(n==6);
1725     REGEX_ASSERT(fields[0]=="  ");
1726     REGEX_ASSERT(fields[1]=="a");
1727     REGEX_ASSERT(fields[2]=="Now is ");
1728     REGEX_ASSERT(fields[3]=="b");
1729     REGEX_ASSERT(fields[4]=="the time");
1730     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1731     REGEX_ASSERT(fields[6]=="foo");
1732
1733     status = U_ZERO_ERROR;
1734     fields[5] = "foo";
1735     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1736     REGEX_CHECK_STATUS;
1737     REGEX_ASSERT(n==5);
1738     REGEX_ASSERT(fields[0]=="  ");
1739     REGEX_ASSERT(fields[1]=="a");
1740     REGEX_ASSERT(fields[2]=="Now is ");
1741     REGEX_ASSERT(fields[3]=="b");
1742     REGEX_ASSERT(fields[4]=="the time<c>");
1743     REGEX_ASSERT(fields[5]=="foo");
1744
1745     status = U_ZERO_ERROR;
1746     fields[5] = "foo";
1747     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1748     REGEX_CHECK_STATUS;
1749     REGEX_ASSERT(n==5);
1750     REGEX_ASSERT(fields[0]=="  ");
1751     REGEX_ASSERT(fields[1]=="a");
1752     REGEX_ASSERT(fields[2]=="Now is ");
1753     REGEX_ASSERT(fields[3]=="b");
1754     REGEX_ASSERT(fields[4]=="the time");
1755     REGEX_ASSERT(fields[5]=="foo");
1756
1757     status = U_ZERO_ERROR;
1758     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1759     REGEX_CHECK_STATUS;
1760     REGEX_ASSERT(n==4);
1761     REGEX_ASSERT(fields[0]=="  ");
1762     REGEX_ASSERT(fields[1]=="a");
1763     REGEX_ASSERT(fields[2]=="Now is ");
1764     REGEX_ASSERT(fields[3]=="the time<c>");
1765     status = U_ZERO_ERROR;
1766     delete pat1;
1767
1768     pat1 = RegexPattern::compile("([-,])",  pe, status);
1769     REGEX_CHECK_STATUS;
1770     n = pat1->split("1-10,20", fields, 10, status);
1771     REGEX_CHECK_STATUS;
1772     REGEX_ASSERT(n==5);
1773     REGEX_ASSERT(fields[0]=="1");
1774     REGEX_ASSERT(fields[1]=="-");
1775     REGEX_ASSERT(fields[2]=="10");
1776     REGEX_ASSERT(fields[3]==",");
1777     REGEX_ASSERT(fields[4]=="20");
1778     delete pat1;
1779
1780     // Test split of string with empty trailing fields
1781     pat1 = RegexPattern::compile(",", pe, status);
1782     REGEX_CHECK_STATUS;
1783     n = pat1->split("a,b,c,", fields, 10, status);
1784     REGEX_CHECK_STATUS;
1785     REGEX_ASSERT(n==4);
1786     REGEX_ASSERT(fields[0]=="a");
1787     REGEX_ASSERT(fields[1]=="b");
1788     REGEX_ASSERT(fields[2]=="c");
1789     REGEX_ASSERT(fields[3]=="");
1790
1791     n = pat1->split("a,,,", fields, 10, status);
1792     REGEX_CHECK_STATUS;
1793     REGEX_ASSERT(n==4);
1794     REGEX_ASSERT(fields[0]=="a");
1795     REGEX_ASSERT(fields[1]=="");
1796     REGEX_ASSERT(fields[2]=="");
1797     REGEX_ASSERT(fields[3]=="");
1798     delete pat1;
1799
1800     // Split Separator with zero length match.
1801     pat1 = RegexPattern::compile(":?", pe, status);
1802     REGEX_CHECK_STATUS;
1803     n = pat1->split("abc", fields, 10, status);
1804     REGEX_CHECK_STATUS;
1805     REGEX_ASSERT(n==5);
1806     REGEX_ASSERT(fields[0]=="");
1807     REGEX_ASSERT(fields[1]=="a");
1808     REGEX_ASSERT(fields[2]=="b");
1809     REGEX_ASSERT(fields[3]=="c");
1810     REGEX_ASSERT(fields[4]=="");
1811
1812     delete pat1;
1813
1814     //
1815     // RegexPattern::pattern()
1816     //
1817     pat1 = new RegexPattern();
1818     REGEX_ASSERT(pat1->pattern() == "");
1819     delete pat1;
1820
1821     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1822     REGEX_CHECK_STATUS;
1823     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1824     delete pat1;
1825
1826
1827     //
1828     // classID functions
1829     //
1830     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1831     REGEX_CHECK_STATUS;
1832     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1833     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1834     UnicodeString Hello("Hello, world.");
1835     RegexMatcher *m = pat1->matcher(Hello, status);
1836     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1837     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1838     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1839     delete m;
1840     delete pat1;
1841
1842 }
1843
1844 //---------------------------------------------------------------------------
1845 //
1846 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1847 //                       is present and working, but excluding functions
1848 //                       implementing replace operations.
1849 //
1850 //---------------------------------------------------------------------------
1851 void RegexTest::API_Match_UTF8() {   // Exercises RegexPattern/RegexMatcher through UText input; replace functions are not covered here.
1852     UParseError         pe;
1853     UErrorCode          status=U_ZERO_ERROR;
1854     int32_t             flags = 0;
1855
1856     //
1857     // Debug - move failing test cases here (and change the #if) to run them first; the rest is then skipped.
1858     //
1859 #if 0
1860     {
1861     }
1862     return;
1863 #endif
1864
1865     //
1866     // Simple pattern compilation
1867     //
1868     {
1869         UText               re = UTEXT_INITIALIZER;
1870         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1871         REGEX_VERBOSE_TEXT(&re);
1872         RegexPattern        *pat2;
1873         pat2 = RegexPattern::compile(&re, flags, pe, status);
1874         REGEX_CHECK_STATUS;
1875
1876         UText input1 = UTEXT_INITIALIZER;
1877         UText input2 = UTEXT_INITIALIZER;
1878         UText empty  = UTEXT_INITIALIZER;
1879         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1880         REGEX_VERBOSE_TEXT(&input1);
1881         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1882         REGEX_VERBOSE_TEXT(&input2);
1883         utext_openUChars(&empty, NULL, 0, &status);   // zero-length text (NULL buffer, length 0)
1884
1885         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1886         int32_t input2Len = strlen("not abc");
1887
1888
1889         //
1890         // Matcher creation and reset.
1891         //
1892         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1893         REGEX_CHECK_STATUS;
1894         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1895         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1896         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1897         m1->reset(&input2);
1898         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1899         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1900         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1901         m1->reset(&input1);
1902         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1903         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1904         m1->reset(&empty);
1905         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1906         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1907
1908         //
1909         //  reset(pos, status)
1910         //
1911         m1->reset(&input1);
1912         m1->reset(4, status);
1913         REGEX_CHECK_STATUS;
1914         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1915         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1916
1917         m1->reset(-1, status);   // negative position must be rejected
1918         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1919         status = U_ZERO_ERROR;
1920
1921         m1->reset(0, status);
1922         REGEX_CHECK_STATUS;
1923         status = U_ZERO_ERROR;
1924
1925         m1->reset(input1Len-1, status);
1926         REGEX_CHECK_STATUS;
1927         status = U_ZERO_ERROR;
1928
1929         m1->reset(input1Len, status);   // position == length (end of input) is still accepted
1930         REGEX_CHECK_STATUS;
1931         status = U_ZERO_ERROR;
1932
1933         m1->reset(input1Len+1, status);   // one past the end must be rejected
1934         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1935         status = U_ZERO_ERROR;
1936
1937         //
1938         // matches(pos, status)
1939         //
1940         m1->reset(&input2);   // "not abc": the "abc" begins at index 4
1941         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1942         m1->reset();
1943         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1944         m1->reset();
1945         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1946         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1947         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1948         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1949
1950         // Match() at end of string should fail, but should not
1951         //  be an error.
1952         status = U_ZERO_ERROR;
1953         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1954         REGEX_CHECK_STATUS;
1955
1956         // Match beyond end of string should fail with an error.
1957         status = U_ZERO_ERROR;
1958         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1959         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1960
1961         // Successful match at end of string.
1962         {
1963             status = U_ZERO_ERROR;
1964             RegexMatcher m("A?", 0, status);  // will match zero length string.
1965             REGEX_CHECK_STATUS;
1966             m.reset(&input1);
1967             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1968             REGEX_CHECK_STATUS;
1969             m.reset(&empty);
1970             REGEX_ASSERT(m.matches(0, status) == TRUE);
1971             REGEX_CHECK_STATUS;
1972         }
1973
1974
1975         //
1976         // lookingAt(pos, status)
1977         //
1978         status = U_ZERO_ERROR;
1979         m1->reset(&input2);  // "not abc"
1980         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1981         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1982         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1983         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1984         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1985         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1986         status = U_ZERO_ERROR;
1987         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1988         REGEX_CHECK_STATUS;
1989         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1990         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1991
1992         delete m1;
1993         delete pat2;
1994
1995         utext_close(&re);
1996         utext_close(&input1);
1997         utext_close(&input2);
1998         utext_close(&empty);
1999     }
2000
2001
2002     //
2003     // Capture Group.
2004     //     RegexMatcher::start();
2005     //     RegexMatcher::end();
2006     //     RegexMatcher::groupCount();
2007     //
2008     {
2009         int32_t             flags=0;
2010         UParseError         pe;
2011         UErrorCode          status=U_ZERO_ERROR;
2012         UText               re=UTEXT_INITIALIZER;
2013         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2014         utext_openUTF8(&re, str_01234567_pat, -1, &status);
2015
2016         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2017         REGEX_CHECK_STATUS;
2018
2019         UText input = UTEXT_INITIALIZER;
2020         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2021         utext_openUTF8(&input, str_0123456789, -1, &status);
2022
2023         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2024         REGEX_CHECK_STATUS;
2025         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2026         static const int32_t matchStarts[] = {0,  2, 4, 8};    // expected start(i) for groups 0..3
2027         static const int32_t matchEnds[]   = {10, 8, 6, 10};   // expected end(i) for groups 0..3
2028         int32_t i;
2029         for (i=0; i<4; i++) {
2030             int32_t actualStart = matcher->start(i, status);
2031             REGEX_CHECK_STATUS;
2032             if (actualStart != matchStarts[i]) {
2033                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2034                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2035             }
2036             int32_t actualEnd = matcher->end(i, status);
2037             REGEX_CHECK_STATUS;
2038             if (actualEnd != matchEnds[i]) {
2039                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2040                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2041             }
2042         }
2043
2044         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2045         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2046
2047         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2048         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2049         matcher->reset();   // drops the current match, so start() below is expected to fail
2050         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2051
2052         matcher->lookingAt(status);   // re-establish a match for the group() checks that follow
2053
2054         UnicodeString dest;
2055         UText destText = UTEXT_INITIALIZER;
2056         utext_openUnicodeString(&destText, &dest, &status);
2057         UText *result;
2058         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2059         //  Test shallow-clone API
2060         int64_t   group_len;
2061         result = matcher->group((UText *)NULL, group_len, status);
2062         REGEX_CHECK_STATUS;
2063         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2064         utext_close(result);
2065         result = matcher->group(0, &destText, group_len, status);
2066         REGEX_CHECK_STATUS;
2067         REGEX_ASSERT(result == &destText);
2068         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2069         //  destText is now immutable, reopen it
2070         utext_close(&destText);
2071         utext_openUnicodeString(&destText, &dest, &status);
2072
2073         int64_t length;
2074         result = matcher->group(0, NULL, length, status);
2075         REGEX_CHECK_STATUS;
2076         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2077         utext_close(result);
2078         result = matcher->group(0, &destText, length, status);
2079         REGEX_CHECK_STATUS;
2080         REGEX_ASSERT(result == &destText);
2081         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2082         REGEX_ASSERT(length == 10);
2083         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2084
2085         // Capture Group 1 == "234567"
2086         result = matcher->group(1, NULL, length, status);
2087         REGEX_CHECK_STATUS;
2088         REGEX_ASSERT(utext_getNativeIndex(result) == 2);   // returned text is the whole input, positioned at the group start
2089         REGEX_ASSERT(length == 6);
2090         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2091         utext_close(result);
2092
2093         result = matcher->group(1, &destText, length, status);
2094         REGEX_CHECK_STATUS;
2095         REGEX_ASSERT(result == &destText);
2096         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2097         REGEX_ASSERT(length == 6);
2098         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2099         utext_close(result);
2100
2101         // Capture Group 2 == "45"
2102         result = matcher->group(2, NULL, length, status);
2103         REGEX_CHECK_STATUS;
2104         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2105         REGEX_ASSERT(length == 2);
2106         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2107         utext_close(result);
2108
2109         result = matcher->group(2, &destText, length, status);
2110         REGEX_CHECK_STATUS;
2111         REGEX_ASSERT(result == &destText);
2112         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2113         REGEX_ASSERT(length == 2);
2114         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2115         utext_close(result);
2116
2117         // Capture Group 3 == "89"
2118         result = matcher->group(3, NULL, length, status);
2119         REGEX_CHECK_STATUS;
2120         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2121         REGEX_ASSERT(length == 2);
2122         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2123         utext_close(result);
2124
2125         result = matcher->group(3, &destText, length, status);
2126         REGEX_CHECK_STATUS;
2127         REGEX_ASSERT(result == &destText);
2128         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2129         REGEX_ASSERT(length == 2);
2130         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2131         utext_close(result);
2132
2133         // Capture Group number out of range.
2134         status = U_ZERO_ERROR;
2135         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136         status = U_ZERO_ERROR;
2137         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138         status = U_ZERO_ERROR;
2139         matcher->reset();   // no current match after reset, so group() below is expected to fail
2140         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2141
2142         delete matcher;
2143         delete pat;
2144
2145         utext_close(&destText);
2146         utext_close(&input);
2147         utext_close(&re);
2148     }
2149
2150     //
2151     //  find
2152     //
2153     {
2154         int32_t             flags=0;
2155         UParseError         pe;
2156         UErrorCode          status=U_ZERO_ERROR;
2157         UText               re=UTEXT_INITIALIZER;
2158         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2159         utext_openUTF8(&re, str_abc, -1, &status);
2160
2161         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2162         REGEX_CHECK_STATUS;
2163         UText input = UTEXT_INITIALIZER;
2164         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2165         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2166         //                      012345678901234567
2167
2168         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2169         REGEX_CHECK_STATUS;
2170         REGEX_ASSERT(matcher->find());
2171         REGEX_ASSERT(matcher->start(status) == 1);
2172         REGEX_ASSERT(matcher->find());
2173         REGEX_ASSERT(matcher->start(status) == 6);
2174         REGEX_ASSERT(matcher->find());
2175         REGEX_ASSERT(matcher->start(status) == 12);
2176         REGEX_ASSERT(matcher->find() == FALSE);
2177         REGEX_ASSERT(matcher->find() == FALSE);   // find() keeps failing once the input is exhausted
2178
2179         matcher->reset();
2180         REGEX_ASSERT(matcher->find());
2181         REGEX_ASSERT(matcher->start(status) == 1);
2182
2183         REGEX_ASSERT(matcher->find(0, status));
2184         REGEX_ASSERT(matcher->start(status) == 1);
2185         REGEX_ASSERT(matcher->find(1, status));
2186         REGEX_ASSERT(matcher->start(status) == 1);
2187         REGEX_ASSERT(matcher->find(2, status));
2188         REGEX_ASSERT(matcher->start(status) == 6);
2189         REGEX_ASSERT(matcher->find(12, status));
2190         REGEX_ASSERT(matcher->start(status) == 12);
2191         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2192         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2193         REGEX_ASSERT(matcher->find(17, status) == FALSE);   // 17 == input length: no match, but not an error
2194         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2195
2196         status = U_ZERO_ERROR;
2197         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198         status = U_ZERO_ERROR;
2199         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2200
2201         REGEX_ASSERT(matcher->groupCount() == 0);
2202
2203         delete matcher;
2204         delete pat;
2205
2206         utext_close(&input);
2207         utext_close(&re);
2208     }
2209
2210
2211     //
2212     //  find, with \G in pattern (true if at the end of a previous match).
2213     //
2214     {
2215         int32_t             flags=0;
2216         UParseError         pe;
2217         UErrorCode          status=U_ZERO_ERROR;
2218         UText               re=UTEXT_INITIALIZER;
2219         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) */
2220         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2221
2222         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2223
2224         REGEX_CHECK_STATUS;
2225         UText input = UTEXT_INITIALIZER;
2226         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2227         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2228         //                      0123456789012   (index ruler for the 13-character text above)
2229
2230         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2231         REGEX_CHECK_STATUS;
2232         REGEX_ASSERT(matcher->find());
2233         REGEX_ASSERT(matcher->start(status) == 0);
2234         REGEX_ASSERT(matcher->start(1, status) == -1);   // -1: group 1, the \G alternative, is not part of this match
2235         REGEX_ASSERT(matcher->start(2, status) == 1);
2236
2237         REGEX_ASSERT(matcher->find());
2238         REGEX_ASSERT(matcher->start(status) == 4);
2239         REGEX_ASSERT(matcher->start(1, status) == 4);    // second "abc" starts where the previous match ended, so \G applies
2240         REGEX_ASSERT(matcher->start(2, status) == -1);
2241         REGEX_CHECK_STATUS;
2242
2243         delete matcher;
2244         delete pat;
2245
2246         utext_close(&input);
2247         utext_close(&re);
2248     }
2249
2250     //
2251     //   find with zero length matches, match position should bump ahead
2252     //     to prevent loops.
2253     //
2254     {
2255         int32_t                 i;
2256         UErrorCode          status=U_ZERO_ERROR;
2257         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2258                                                       //   using an always-true look-ahead.
2259         REGEX_CHECK_STATUS;
2260         UText s = UTEXT_INITIALIZER;
2261         utext_openUTF8(&s, "    ", -1, &status);
2262         m.reset(&s);
2263         for (i=0; ; i++) {
2264             if (m.find() == FALSE) {
2265                 break;
2266             }
2267             REGEX_ASSERT(m.start(status) == i);
2268             REGEX_ASSERT(m.end(status) == i);
2269         }
2270         REGEX_ASSERT(i==5);   // five empty matches: before each of the 4 spaces and at end of input
2271
2272         // Check that the bump goes over characters outside the BMP OK
2273         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2274         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2275         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2276         m.reset(&s);
2277         for (i=0; ; i+=4) {   // each supplementary character occupies 4 bytes in the UTF-8 text
2278             if (m.find() == FALSE) {
2279                 break;
2280             }
2281             REGEX_ASSERT(m.start(status) == i);
2282             REGEX_ASSERT(m.end(status) == i);
2283         }
2284         REGEX_ASSERT(i==20);   // five empty matches, at offsets 0, 4, 8, 12 and 16
2285
2286         utext_close(&s);
2287     }
2288     {
2289         // find() loop breaking test.
2290         //        with pattern of /.?/, should see a series of one char matches, then a single
2291         //        match of zero length at the end of the input string.
2292         int32_t                 i;
2293         UErrorCode          status=U_ZERO_ERROR;
2294         RegexMatcher        m(".?", 0, status);
2295         REGEX_CHECK_STATUS;
2296         UText s = UTEXT_INITIALIZER;
2297         utext_openUTF8(&s, "    ", -1, &status);
2298         m.reset(&s);
2299         for (i=0; ; i++) {
2300             if (m.find() == FALSE) {
2301                 break;
2302             }
2303             REGEX_ASSERT(m.start(status) == i);
2304             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // one-char matches for i<4, then the empty match at the end
2305         }
2306         REGEX_ASSERT(i==5);
2307
2308         utext_close(&s);
2309     }
2310
2311
2312     //
2313     // Matchers with no input string behave as if they had an empty input string.
2314     //
2315
2316     {
2317         UErrorCode status = U_ZERO_ERROR;
2318         RegexMatcher  m(".?", 0, status);
2319         REGEX_CHECK_STATUS;
2320         REGEX_ASSERT(m.find());
2321         REGEX_ASSERT(m.start(status) == 0);
2322         REGEX_ASSERT(m.input() == "");
2323     }
2324     {
2325         UErrorCode status = U_ZERO_ERROR;
2326         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2327         RegexMatcher  *m = p->matcher(status);
2328         REGEX_CHECK_STATUS;
2329
2330         REGEX_ASSERT(m->find() == FALSE);
2331         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2332         delete m;
2333         delete p;
2334     }
2335
2336     //
2337     // Regions
2338     //
2339     {
2340         UErrorCode status = U_ZERO_ERROR;
2341         UText testPattern = UTEXT_INITIALIZER;
2342         UText testText    = UTEXT_INITIALIZER;
2343         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2344         REGEX_VERBOSE_TEXT(&testPattern);
2345         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2346         REGEX_VERBOSE_TEXT(&testText);
2347
2348         RegexMatcher m(&testPattern, &testText, 0, status);
2349         REGEX_CHECK_STATUS;
2350         REGEX_ASSERT(m.regionStart() == 0);
2351         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2352         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2353         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2354
2355         m.region(2,4, status);   // limit matching to indexes 2..3, the "is" of "This"
2356         REGEX_CHECK_STATUS;
2357         REGEX_ASSERT(m.matches(status));
2358         REGEX_ASSERT(m.start(status)==2);
2359         REGEX_ASSERT(m.end(status)==4);
2360         REGEX_CHECK_STATUS;
2361
2362         m.reset();   // the asserts below expect reset() to restore the region to the whole input
2363         REGEX_ASSERT(m.regionStart() == 0);
2364         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2365
2366         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2367         REGEX_VERBOSE_TEXT(&testText);
2368         m.reset(&testText);
2369         REGEX_ASSERT(m.regionStart() == 0);
2370         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2371
2372         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2373         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));   // setter returns the matcher itself
2374         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375         REGEX_ASSERT(&m == &m.reset());
2376         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);      // the setting survives reset()
2377
2378         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2379         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380         REGEX_ASSERT(&m == &m.reset());
2381         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2382
2383         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2384         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2385         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386         REGEX_ASSERT(&m == &m.reset());
2387         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2388
2389         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2390         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391         REGEX_ASSERT(&m == &m.reset());
2392         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2393
2394         utext_close(&testText);
2395         utext_close(&testPattern);
2396     }
2397
2398     //
2399     // hitEnd() and requireEnd()
2400     //
2401     {
2402         UErrorCode status = U_ZERO_ERROR;
2403         UText testPattern = UTEXT_INITIALIZER;
2404         UText testText    = UTEXT_INITIALIZER;
2405         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2406         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2407         utext_openUTF8(&testPattern, str_, -1, &status);
2408         utext_openUTF8(&testText, str_aabb, -1, &status);
2409
2410         RegexMatcher m1(&testPattern, &testText,  0, status);
2411         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2412         REGEX_ASSERT(m1.hitEnd() == TRUE);
2413         REGEX_ASSERT(m1.requireEnd() == FALSE);
2414         REGEX_CHECK_STATUS;
2415
2416         status = U_ZERO_ERROR;
2417         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2418         utext_openUTF8(&testPattern, str_a, -1, &status);
2419         RegexMatcher m2(&testPattern, &testText, 0, status);
2420         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2421         REGEX_ASSERT(m2.hitEnd() == FALSE);   // "a*" stops at the first 'b' of "aabb", short of the end
2422         REGEX_ASSERT(m2.requireEnd() == FALSE);
2423         REGEX_CHECK_STATUS;
2424
2425         status = U_ZERO_ERROR;
2426         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2427         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2428         RegexMatcher m3(&testPattern, &testText, 0, status);
2429         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2430         REGEX_ASSERT(m3.hitEnd() == TRUE);
2431         REGEX_ASSERT(m3.requireEnd() == TRUE);   // the trailing '$' makes this match depend on where the input ends
2432         REGEX_CHECK_STATUS;
2433
2434         utext_close(&testText);
2435         utext_close(&testPattern);
2436     }
2437 }
2438
2439
2440 //---------------------------------------------------------------------------
2441 //
2442 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2443 //                         Replace family of functions.
2444 //
2445 //---------------------------------------------------------------------------
2446 void RegexTest::API_Replace_UTF8() {
2447     //
2448     //  Replace: drives RegexMatcher::replaceFirst() and replaceAll() through their UText overloads,
2449     //  mostly in pairs: first with a NULL destination, then with destText as the destination.
2450     int32_t             flags=0;  // pattern flags handed to RegexPattern::compile(); none set
2451     UParseError         pe;
2452     UErrorCode          status=U_ZERO_ERROR;
2453
2454     UText               re=UTEXT_INITIALIZER;
2455     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2456     REGEX_VERBOSE_TEXT(&re);
2457     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);  // deleted at the end of this function
2458     REGEX_CHECK_STATUS;
2459
2460     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2461     //             012345678901234567   (index ruler for the text spelled out by data[])
2462     UText dataText = UTEXT_INITIALIZER;
2463     utext_openUTF8(&dataText, data, -1, &status);
2464     REGEX_CHECK_STATUS;
2465     REGEX_VERBOSE_TEXT(&dataText);
2466     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);  // deleted at the end of this function
2467
2468     //
2469     //  Plain vanilla matches.
2470     //
2471     UnicodeString  dest;  // string that destText is opened on
2472     UText destText = UTEXT_INITIALIZER;
2473     utext_openUnicodeString(&destText, &dest, &status);
2474     UText *result;  // return value of each replace call below
2475
2476     UText replText = UTEXT_INITIALIZER;  // replacement text; re-opened on a new string for each scenario
2477
2478     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2479     utext_openUTF8(&replText, str_yz, -1, &status);
2480     REGEX_VERBOSE_TEXT(&replText);
2481     result = matcher->replaceFirst(&replText, NULL, status);  // NULL destination; the returned UText is closed below
2482     REGEX_CHECK_STATUS;
2483     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2484     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2485     utext_close(result);
2486     result = matcher->replaceFirst(&replText, &destText, status);  // explicit destination; must be handed back as the result
2487     REGEX_CHECK_STATUS;
2488     REGEX_ASSERT(result == &destText);
2489     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2490
2491     result = matcher->replaceAll(&replText, NULL, status);
2492     REGEX_CHECK_STATUS;
2493     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2494     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2495     utext_close(result);
2496
2497     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2498     result = matcher->replaceAll(&replText, &destText, status);
2499     REGEX_CHECK_STATUS;
2500     REGEX_ASSERT(result == &destText);
2501     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2502
2503     //
2504     //  Plain vanilla non-matches.  The expected output is the input text, unchanged.
2505     //
2506     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2507     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2508     matcher->reset(&dataText);
2509
2510     result = matcher->replaceFirst(&replText, NULL, status);
2511     REGEX_CHECK_STATUS;
2512     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2513     utext_close(result);
2514     result = matcher->replaceFirst(&replText, &destText, status);
2515     REGEX_CHECK_STATUS;
2516     REGEX_ASSERT(result == &destText);
2517     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2518
2519     result = matcher->replaceAll(&replText, NULL, status);
2520     REGEX_CHECK_STATUS;
2521     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2522     utext_close(result);
2523     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2524     result = matcher->replaceAll(&replText, &destText, status);
2525     REGEX_CHECK_STATUS;
2526     REGEX_ASSERT(result == &destText);
2527     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2528
2529     //
2530     // Empty source string
2531     //
2532     utext_openUTF8(&dataText, NULL, 0, &status);  // NULL pointer with length 0 as the input text
2533     matcher->reset(&dataText);
2534
2535     result = matcher->replaceFirst(&replText, NULL, status);
2536     REGEX_CHECK_STATUS;
2537     REGEX_ASSERT_UTEXT_UTF8("", result);
2538     utext_close(result);
2539     result = matcher->replaceFirst(&replText, &destText, status);
2540     REGEX_CHECK_STATUS;
2541     REGEX_ASSERT(result == &destText);
2542     REGEX_ASSERT_UTEXT_UTF8("", result);
2543
2544     result = matcher->replaceAll(&replText, NULL, status);
2545     REGEX_CHECK_STATUS;
2546     REGEX_ASSERT_UTEXT_UTF8("", result);
2547     utext_close(result);
2548     result = matcher->replaceAll(&replText, &destText, status);
2549     REGEX_CHECK_STATUS;
2550     REGEX_ASSERT(result == &destText);
2551     REGEX_ASSERT_UTEXT_UTF8("", result);
2552
2553     //
2554     // Empty substitution string
2555     //
2556     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2557     matcher->reset(&dataText);
2558
2559     utext_openUTF8(&replText, NULL, 0, &status);  // NULL pointer with length 0 as the replacement text
2560     result = matcher->replaceFirst(&replText, NULL, status);
2561     REGEX_CHECK_STATUS;
2562     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2563     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2564     utext_close(result);
2565     result = matcher->replaceFirst(&replText, &destText, status);
2566     REGEX_CHECK_STATUS;
2567     REGEX_ASSERT(result == &destText);
2568     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2569
2570     result = matcher->replaceAll(&replText, NULL, status);
2571     REGEX_CHECK_STATUS;
2572     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2573     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2574     utext_close(result);
2575     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2576     result = matcher->replaceAll(&replText, &destText, status);
2577     REGEX_CHECK_STATUS;
2578     REGEX_ASSERT(result == &destText);
2579     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2580
2581     //
2582     // match whole string
2583     //
2584     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2585     utext_openUTF8(&dataText, str_abc, -1, &status);
2586     matcher->reset(&dataText);
2587
2588     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2589     utext_openUTF8(&replText, str_xyz, -1, &status);
2590     result = matcher->replaceFirst(&replText, NULL, status);
2591     REGEX_CHECK_STATUS;
2592     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2593     utext_close(result);
2594     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2595     result = matcher->replaceFirst(&replText, &destText, status);
2596     REGEX_CHECK_STATUS;
2597     REGEX_ASSERT(result == &destText);
2598     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2599
2600     result = matcher->replaceAll(&replText, NULL, status);
2601     REGEX_CHECK_STATUS;
2602     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2603     utext_close(result);
2604     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2605     result = matcher->replaceAll(&replText, &destText, status);
2606     REGEX_CHECK_STATUS;
2607     REGEX_ASSERT(result == &destText);
2608     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2609
2610     //
2611     // Capture Group, simple case.  Uses a second pattern/matcher pair (pat2, matcher2).
2612     //
2613     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2614     utext_openUTF8(&re, str_add, -1, &status);
2615     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);  // deleted at the end of this function
2616     REGEX_CHECK_STATUS;
2617
2618     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2619     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2620     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);  // deleted at the end of this function
2621     REGEX_CHECK_STATUS;
2622
2623     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2624     utext_openUTF8(&replText, str_11, -1, &status);
2625     result = matcher2->replaceFirst(&replText, NULL, status);
2626     REGEX_CHECK_STATUS;
2627     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2628     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2629     utext_close(result);
2630     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2631     result = matcher2->replaceFirst(&replText, &destText, status);
2632     REGEX_CHECK_STATUS;
2633     REGEX_ASSERT(result == &destText);
2634     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2635
2636     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2637     utext_openUTF8(&replText, str_v, -1, &status);
2638     REGEX_VERBOSE_TEXT(&replText);
2639     result = matcher2->replaceFirst(&replText, NULL, status);
2640     REGEX_CHECK_STATUS;
2641     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2642     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2643     utext_close(result);
2644     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2645     result = matcher2->replaceFirst(&replText, &destText, status);
2646     REGEX_CHECK_STATUS;
2647     REGEX_ASSERT(result == &destText);
2648     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2649
2650     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2651                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2652                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2653     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2654     result = matcher2->replaceFirst(&replText, NULL, status);
2655     REGEX_CHECK_STATUS;
2656     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2657     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2658     utext_close(result);
2659     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2660     result = matcher2->replaceFirst(&replText, &destText, status);
2661     REGEX_CHECK_STATUS;
2662     REGEX_ASSERT(result == &destText);
2663     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2664
2665     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2666     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2667     //                                 012345678901234567890123456
2668     supplDigitChars[22] = 0xF0;  // bytes 22..25 (the xxxx placeholder) become F0 9D 9F 8F,
2669     supplDigitChars[23] = 0x9D;  // the UTF-8 encoding of U+1D7CF, so the replacement text
2670     supplDigitChars[24] = 0x9F;  // holds '$' followed by that supplementary digit; the expected
2671     supplDigitChars[25] = 0x8F;  // result below has the text of group 1 ("bc") in its place
2672     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2673
2674     result = matcher2->replaceFirst(&replText, NULL, status);
2675     REGEX_CHECK_STATUS;
2676     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2677     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2678     utext_close(result);
2679     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2680     result = matcher2->replaceFirst(&replText, &destText, status);
2681     REGEX_CHECK_STATUS;
2682     REGEX_ASSERT(result == &destText);
2683     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2684     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2685     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2686     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);  // a(..) has only one group, so $5 is out of range
2687 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2688     utext_close(result);
2689     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2690     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2691     REGEX_ASSERT(result == &destText);
2692 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2693
2694     //
2695     // Replacement String with \u hex escapes.  Back to the first matcher (pattern "abc").
2696     //
2697     {
2698       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2699       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2700         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2701         utext_openUTF8(&replText, str_u0043, -1, &status);
2702         matcher->reset(&dataText);
2703
2704         result = matcher->replaceAll(&replText, NULL, status);
2705         REGEX_CHECK_STATUS;
2706         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2707         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2708         utext_close(result);
2709         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2710         result = matcher->replaceAll(&replText, &destText, status);
2711         REGEX_CHECK_STATUS;
2712         REGEX_ASSERT(result == &destText);
2713         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2714     }
2715     {
2716       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */  // shadows the outer str_abc inside this block
2717         utext_openUTF8(&dataText, str_abc, -1, &status);
2718         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2719         utext_openUTF8(&replText, str_U00010000, -1, &status);
2720         matcher->reset(&dataText);
2721
2722         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2723         //                          0123456789
2724         expected[2] = 0xF0;  // bytes 2..5 (the xxxx placeholder) become F0 90 80 80,
2725         expected[3] = 0x90;  // the UTF-8 encoding of U+10000, which is the character
2726         expected[4] = 0x80;  // named by the \U00010000 escape in the replacement text
2727         expected[5] = 0x80;
2728
2729         result = matcher->replaceAll(&replText, NULL, status);
2730         REGEX_CHECK_STATUS;
2731         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2732         utext_close(result);
2733         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);  // empty destText before it is reused
2734         result = matcher->replaceAll(&replText, &destText, status);
2735         REGEX_CHECK_STATUS;
2736         REGEX_ASSERT(result == &destText);
2737         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2738     }
2739     // TODO:  need more thorough testing of capture substitutions.
2740
2741     // Bug 4057
2742     //   appendReplacement() output must begin at the start of the input, however the current match was reached.
2743     {
2744         status = U_ZERO_ERROR;
2745 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2746 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2747 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2748         utext_openUTF8(&re, str_ssee, -1, &status);
2749         utext_openUTF8(&dataText, str_blah, -1, &status);
2750         utext_openUTF8(&replText, str_ooh, -1, &status);
2751
2752         RegexMatcher m(&re, 0, status);
2753         REGEX_CHECK_STATUS;
2754
2755         UnicodeString result;  // shadows the outer UText *result inside this block
2756         UText resultText = UTEXT_INITIALIZER;
2757         utext_openUnicodeString(&resultText, &result, &status);
2758
2759         // Multiple finds do NOT bump up the previous appendReplacement position.
2760         m.reset(&dataText);
2761         m.find();
2762         m.find();
2763         m.appendReplacement(&resultText, &replText, status);
2764         REGEX_CHECK_STATUS;
2765         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2766         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2767
2768         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2769         status = U_ZERO_ERROR;
2770         result.truncate(0);  // discard the previous output, then re-open resultText on the emptied string
2771         utext_openUnicodeString(&resultText, &result, &status);
2772         m.reset(10, status);
2773         m.find();
2774         m.find();
2775         m.appendReplacement(&resultText, &replText, status);
2776         REGEX_CHECK_STATUS;
2777         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2778         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2779
2780         // find() at interior of string, appendReplacement still starts at beginning.
2781         status = U_ZERO_ERROR;
2782         result.truncate(0);  // discard the previous output, then re-open resultText on the emptied string
2783         utext_openUnicodeString(&resultText, &result, &status);
2784         m.reset();
2785         m.find(10, status);
2786         m.find();
2787         m.appendReplacement(&resultText, &replText, status);
2788         REGEX_CHECK_STATUS;
2789         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2790         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2791
2792         m.appendTail(&resultText, status);  // NOTE(review): status is not checked after this call
2793         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2794         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2795
2796         utext_close(&resultText);
2797     }
2798
2799     delete matcher2;  // each matcher is deleted before the pattern it was created from
2800     delete pat2;
2801     delete matcher;
2802     delete pat;
2803
2804     utext_close(&dataText);  // close the UTexts that were opened on the stack in this function
2805     utext_close(&replText);
2806     utext_close(&destText);
2807     utext_close(&re);
2808 }
2809
2810
2811 //---------------------------------------------------------------------------
2812 //
2813 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2814 //                        present and nominally working.
2815 //
2816 //---------------------------------------------------------------------------
2817 void RegexTest::API_Pattern_UTF8() {
2818     RegexPattern        pata;    // Test default constructor to not crash.
2819     RegexPattern        patb;
2820
2821     REGEX_ASSERT(pata == patb);
2822     REGEX_ASSERT(pata == pata);
2823
2824     UText         re1 = UTEXT_INITIALIZER;
2825     UText         re2 = UTEXT_INITIALIZER;
2826     UErrorCode    status = U_ZERO_ERROR;
2827     UParseError   pe;
2828
2829     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2830     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2831     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2832     utext_openUTF8(&re2, str_def, -1, &status);
2833
2834     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2835     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2836     REGEX_CHECK_STATUS;
2837     REGEX_ASSERT(*pat1 == *pat1);
2838     REGEX_ASSERT(*pat1 != pata);
2839
2840     // Assign
2841     patb = *pat1;
2842     REGEX_ASSERT(patb == *pat1);
2843
2844     // Copy Construct
2845     RegexPattern patc(*pat1);
2846     REGEX_ASSERT(patc == *pat1);
2847     REGEX_ASSERT(patb == patc);
2848     REGEX_ASSERT(pat1 != pat2);
2849     patb = *pat2;
2850     REGEX_ASSERT(patb != patc);
2851     REGEX_ASSERT(patb == *pat2);
2852
2853     // Compile with no flags.
2854     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2855     REGEX_ASSERT(*pat1a == *pat1);
2856
2857     REGEX_ASSERT(pat1a->flags() == 0);
2858
2859     // Compile with different flags should be not equal
2860     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2861     REGEX_CHECK_STATUS;
2862
2863     REGEX_ASSERT(*pat1b != *pat1a);
2864     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2865     REGEX_ASSERT(pat1a->flags() == 0);
2866     delete pat1b;
2867
2868     // clone
2869     RegexPattern *pat1c = pat1->clone();
2870     REGEX_ASSERT(*pat1c == *pat1);
2871     REGEX_ASSERT(*pat1c != *pat2);
2872
2873     delete pat1c;
2874     delete pat1a;
2875     delete pat1;
2876     delete pat2;
2877
2878     utext_close(&re1);
2879     utext_close(&re2);
2880
2881
2882     //
2883     //   Verify that a matcher created from a cloned pattern works.
2884     //     (Jitterbug 3423)
2885     //
2886     {
2887         UErrorCode     status     = U_ZERO_ERROR;
2888         UText          pattern    = UTEXT_INITIALIZER;
2889         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2890         utext_openUTF8(&pattern, str_pL, -1, &status);
2891
2892         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2893         RegexPattern  *pClone     = pSource->clone();
2894         delete         pSource;
2895         RegexMatcher  *mFromClone = pClone->matcher(status);
2896         REGEX_CHECK_STATUS;
2897
2898         UText          input      = UTEXT_INITIALIZER;
2899         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2900         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2901         mFromClone->reset(&input);
2902         REGEX_ASSERT(mFromClone->find() == TRUE);
2903         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2904         REGEX_ASSERT(mFromClone->find() == TRUE);
2905         REGEX_ASSERT(mFromClone->group(status) == "World");
2906         REGEX_ASSERT(mFromClone->find() == FALSE);
2907         delete mFromClone;
2908         delete pClone;
2909
2910         utext_close(&input);
2911         utext_close(&pattern);
2912     }
2913
2914     //
2915     //   matches convenience API
2916     //
2917     {
2918         UErrorCode status  = U_ZERO_ERROR;
2919         UText      pattern = UTEXT_INITIALIZER;
2920         UText      input   = UTEXT_INITIALIZER;
2921
2922         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2923         utext_openUTF8(&input, str_randominput, -1, &status);
2924
2925         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2926         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2927         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2928         REGEX_CHECK_STATUS;
2929
2930         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2931         utext_openUTF8(&pattern, str_abc, -1, &status);
2932         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2933         REGEX_CHECK_STATUS;
2934
2935         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2936         utext_openUTF8(&pattern, str_nput, -1, &status);
2937         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2938         REGEX_CHECK_STATUS;
2939
2940         utext_openUTF8(&pattern, str_randominput, -1, &status);
2941         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2942         REGEX_CHECK_STATUS;
2943
2944         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2945         utext_openUTF8(&pattern, str_u, -1, &status);
2946         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2947         REGEX_CHECK_STATUS;
2948
2949         utext_openUTF8(&input, str_abc, -1, &status);
2950         utext_openUTF8(&pattern, str_abc, -1, &status);
2951         status = U_INDEX_OUTOFBOUNDS_ERROR;
2952         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2953         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2954
2955         utext_close(&input);
2956         utext_close(&pattern);
2957     }
2958
2959
2960     //
2961     // Split()
2962     //
2963     status = U_ZERO_ERROR;
2964     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2965     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2966     pat1 = RegexPattern::compile(&re1, pe, status);
2967     REGEX_CHECK_STATUS;
2968     UnicodeString  fields[10];
2969
2970     int32_t n;
2971     n = pat1->split("Now is the time", fields, 10, status);
2972     REGEX_CHECK_STATUS;
2973     REGEX_ASSERT(n==4);
2974     REGEX_ASSERT(fields[0]=="Now");
2975     REGEX_ASSERT(fields[1]=="is");
2976     REGEX_ASSERT(fields[2]=="the");
2977     REGEX_ASSERT(fields[3]=="time");
2978     REGEX_ASSERT(fields[4]=="");
2979
2980     n = pat1->split("Now is the time", fields, 2, status);
2981     REGEX_CHECK_STATUS;
2982     REGEX_ASSERT(n==2);
2983     REGEX_ASSERT(fields[0]=="Now");
2984     REGEX_ASSERT(fields[1]=="is the time");
2985     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2986
2987     fields[1] = "*";
2988     status = U_ZERO_ERROR;
2989     n = pat1->split("Now is the time", fields, 1, status);
2990     REGEX_CHECK_STATUS;
2991     REGEX_ASSERT(n==1);
2992     REGEX_ASSERT(fields[0]=="Now is the time");
2993     REGEX_ASSERT(fields[1]=="*");
2994     status = U_ZERO_ERROR;
2995
2996     n = pat1->split("    Now       is the time   ", fields, 10, status);
2997     REGEX_CHECK_STATUS;
2998     REGEX_ASSERT(n==6);
2999     REGEX_ASSERT(fields[0]=="");
3000     REGEX_ASSERT(fields[1]=="Now");
3001     REGEX_ASSERT(fields[2]=="is");
3002     REGEX_ASSERT(fields[3]=="the");
3003     REGEX_ASSERT(fields[4]=="time");
3004     REGEX_ASSERT(fields[5]=="");
3005     REGEX_ASSERT(fields[6]=="");
3006
3007     fields[2] = "*";
3008     n = pat1->split("     ", fields, 10, status);
3009     REGEX_CHECK_STATUS;
3010     REGEX_ASSERT(n==2);
3011     REGEX_ASSERT(fields[0]=="");
3012     REGEX_ASSERT(fields[1]=="");
3013     REGEX_ASSERT(fields[2]=="*");
3014
3015     fields[0] = "foo";
3016     n = pat1->split("", fields, 10, status);
3017     REGEX_CHECK_STATUS;
3018     REGEX_ASSERT(n==0);
3019     REGEX_ASSERT(fields[0]=="foo");
3020
3021     delete pat1;
3022
3023     //  split, with a pattern with (capture)
3024     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3025     pat1 = RegexPattern::compile(&re1,  pe, status);
3026     REGEX_CHECK_STATUS;
3027
3028     status = U_ZERO_ERROR;
3029     fields[6] = fields[7] = "*";
3030     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3031     REGEX_CHECK_STATUS;
3032     REGEX_ASSERT(n==7);
3033     REGEX_ASSERT(fields[0]=="");
3034     REGEX_ASSERT(fields[1]=="a");
3035     REGEX_ASSERT(fields[2]=="Now is ");
3036     REGEX_ASSERT(fields[3]=="b");
3037     REGEX_ASSERT(fields[4]=="the time");
3038     REGEX_ASSERT(fields[5]=="c");
3039     REGEX_ASSERT(fields[6]=="");
3040     REGEX_ASSERT(fields[7]=="*");
3041     REGEX_ASSERT(status==U_ZERO_ERROR);
3042
3043     fields[6] = fields[7] = "*";
3044     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3045     REGEX_CHECK_STATUS;
3046     REGEX_ASSERT(n==7);
3047     REGEX_ASSERT(fields[0]=="  ");
3048     REGEX_ASSERT(fields[1]=="a");
3049     REGEX_ASSERT(fields[2]=="Now is ");
3050     REGEX_ASSERT(fields[3]=="b");
3051     REGEX_ASSERT(fields[4]=="the time");
3052     REGEX_ASSERT(fields[5]=="c");
3053     REGEX_ASSERT(fields[6]=="");
3054     REGEX_ASSERT(fields[7]=="*");
3055
3056     status = U_ZERO_ERROR;
3057     fields[6] = "foo";
3058     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3059     REGEX_CHECK_STATUS;
3060     REGEX_ASSERT(n==6);
3061     REGEX_ASSERT(fields[0]=="  ");
3062     REGEX_ASSERT(fields[1]=="a");
3063     REGEX_ASSERT(fields[2]=="Now is ");
3064     REGEX_ASSERT(fields[3]=="b");
3065     REGEX_ASSERT(fields[4]=="the time");
3066     REGEX_ASSERT(fields[5]==" ");
3067     REGEX_ASSERT(fields[6]=="foo");
3068
3069     status = U_ZERO_ERROR;
3070     fields[5] = "foo";
3071     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3072     REGEX_CHECK_STATUS;
3073     REGEX_ASSERT(n==5);
3074     REGEX_ASSERT(fields[0]=="  ");
3075     REGEX_ASSERT(fields[1]=="a");
3076     REGEX_ASSERT(fields[2]=="Now is ");
3077     REGEX_ASSERT(fields[3]=="b");
3078     REGEX_ASSERT(fields[4]=="the time<c>");
3079     REGEX_ASSERT(fields[5]=="foo");
3080
3081     status = U_ZERO_ERROR;
3082     fields[5] = "foo";
3083     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3084     REGEX_CHECK_STATUS;
3085     REGEX_ASSERT(n==5);
3086     REGEX_ASSERT(fields[0]=="  ");
3087     REGEX_ASSERT(fields[1]=="a");
3088     REGEX_ASSERT(fields[2]=="Now is ");
3089     REGEX_ASSERT(fields[3]=="b");
3090     REGEX_ASSERT(fields[4]=="the time");
3091     REGEX_ASSERT(fields[5]=="foo");
3092
3093     status = U_ZERO_ERROR;
3094     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3095     REGEX_CHECK_STATUS;
3096     REGEX_ASSERT(n==4);
3097     REGEX_ASSERT(fields[0]=="  ");
3098     REGEX_ASSERT(fields[1]=="a");
3099     REGEX_ASSERT(fields[2]=="Now is ");
3100     REGEX_ASSERT(fields[3]=="the time<c>");
3101     status = U_ZERO_ERROR;
3102     delete pat1;
3103
3104     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3105     pat1 = RegexPattern::compile(&re1, pe, status);
3106     REGEX_CHECK_STATUS;
3107     n = pat1->split("1-10,20", fields, 10, status);
3108     REGEX_CHECK_STATUS;
3109     REGEX_ASSERT(n==5);
3110     REGEX_ASSERT(fields[0]=="1");
3111     REGEX_ASSERT(fields[1]=="-");
3112     REGEX_ASSERT(fields[2]=="10");
3113     REGEX_ASSERT(fields[3]==",");
3114     REGEX_ASSERT(fields[4]=="20");
3115     delete pat1;
3116
3117
3118     //
3119     // split of a UText based string, with library allocating output UTexts.
3120     //
3121     {
3122         status = U_ZERO_ERROR;
3123         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3124         UnicodeString stringToSplit("first:second:third");
3125         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3126         REGEX_CHECK_STATUS;
3127
3128         UText *splits[10] = {NULL};
3129         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3130         REGEX_CHECK_STATUS;
3131         REGEX_ASSERT(numFields == 5);
3132         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3133         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3134         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3135         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3136         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3137         REGEX_ASSERT(splits[5] == NULL);
3138
3139         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3140             if (splits[i]) {
3141                 utext_close(splits[i]);
3142                 splits[i] = NULL;
3143             }
3144         }
3145         utext_close(textToSplit);
3146     }
3147
3148
3149     //
3150     // RegexPattern::pattern() and patternText()
3151     //
3152     pat1 = new RegexPattern();
3153     REGEX_ASSERT(pat1->pattern() == "");
3154     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3155     delete pat1;
3156     const char *helloWorldInvariant = "(Hello, world)*";
3157     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3158     pat1 = RegexPattern::compile(&re1, pe, status);
3159     REGEX_CHECK_STATUS;
3160     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3161     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3162     delete pat1;
3163
3164     utext_close(&re1);
3165 }
3166
3167
3168 //---------------------------------------------------------------------------
3169 //
3170 //      Extended       A more thorough check for features of regex patterns
3171 //                     The test cases are in a separate data file,
3172 //                       source/test/testdata/regextst.txt
3173 //                     A description of the test data format is included in that file.
3174 //
3175 //---------------------------------------------------------------------------
3176
3177 const char *
3178 RegexTest::getPath(char buffer[2048], const char *filename) {
         // Builds "<source test data directory><filename>" in `buffer`.
         //
         //   buffer    caller-supplied storage of 2048 bytes that receives the path.
         //   filename  name appended as-is to the directory string; no path
         //             separator is inserted here.
         //
         // Returns `buffer` on success.  Returns NULL, after reporting through
         // errln(), if the test data directory cannot be obtained or if the
         // combined path would not fit in the buffer.
3179     UErrorCode status=U_ZERO_ERROR;
3180     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3181     if (U_FAILURE(status)) {
3182         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3183         return NULL;
3184     }
3185
         // strcpy()/strcat() do no bounds checking, so refuse anything whose
         // length plus the terminating NUL would overflow the 2048-byte buffer.
         if (strlen(testDataDirectory) + strlen(filename) >= 2048) {
             errln("ERROR: test data path is too long - %s%s", testDataDirectory, filename);
             return NULL;
         }
3186     strcpy(buffer, testDataDirectory);
3187     strcat(buffer, filename);
3188     return buffer;
3189 }
3190
3191 void RegexTest::Extended() {
         // Data-driven test driver.  Reads regextst.txt, walks it one line at a
         // time, and for every line that is not blank or comment-only peels off
         // three fields in order - a quoted pattern, a run of flag characters and
         // a quoted match string - then hands them to regex_find().  Malformed
         // lines are reported with errln() and skipped.
3192     char tdd[2048];
3193     const char *srcPath;
3194     UErrorCode  status  = U_ZERO_ERROR;
3195     int32_t     lineNum = 0;
3196
3197     //
3198     //  Open and read the test data file.
3199     //
3200     srcPath=getPath(tdd, "regextst.txt");
3201     if(srcPath==NULL) {
3202         return; /* something went wrong, error already output */
3203     }
3204
3205     int32_t    len;
3206     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3207     if (U_FAILURE(status)) {
3208         return; /* something went wrong, error already output */
3209     }
3210
3211     //
3212     //  Put the test data into a UnicodeString
3213     //
         //  NOTE(review): this looks like UnicodeString's read-only aliasing
         //  constructor, i.e. testString refers to testData rather than copying
         //  it - confirm against the UnicodeString API.  testData is released
         //  with delete [] at the end of this function.
3214     UnicodeString testString(FALSE, testData, len);
3215
         // Helper matchers used to take a test line apart:
         //   quotedStuffMat  optional white space, an opening delimiter (one of
         //                   ' " /), the shortest run of text up to the same
         //                   delimiter again.  Group 2 is the text in between.
         //   commentMat      optional white space, an optional '#' comment, then
         //                   end of input.
         //   flagsMat        optional white space, then group 1 = a run of the
         //                   recognized flag characters.  Group 2 is presumably
         //                   meant to catch letters that are not valid flags;
         //                   a non-empty group 2 is treated as an error below.
3216     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3217     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3218     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3219
         // lineMat iterates over the whole file.  Group 1 is the line content
         // without its terminator; the pattern requires a "\n" (optionally
         // preceded by "\r") after each line.
3220     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3221     UnicodeString   testPattern;   // The pattern for test from the test file.
3222     UnicodeString   testFlags;     // the flags   for a test.
3223     UnicodeString   matchString;   // The marked up string to be used as input
3224
         // A single check covers the construction of all four matchers above,
         // since they share `status`.
3225     if (U_FAILURE(status)){
3226         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3227         delete [] testData;
3228         return;
3229     }
3230
3231     //
3232     //  Loop over the test data file, once per line.
3233     //
3234     while (lineMat.find()) {
3235         lineNum++;
             // `status` is only examined here, at the top of the next iteration,
             // so this reports any ICU error left over from parsing the previous
             // line.  The srcPath:lineNum printed is that of the current line.
3236         if (U_FAILURE(status)) {
3237           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3238         }
3239
3240         status = U_ZERO_ERROR;
3241         UnicodeString testLine = lineMat.group(1, status);
3242         if (testLine.length() == 0) {
3243             continue;
3244         }
3245
3246         //
3247         // Parse the test line.  Skip blank and comment only lines.
3248         // Separate out the three main fields - pattern, flags, target.
3249         //
3250
3251         commentMat.reset(testLine);
3252         if (commentMat.lookingAt(status)) {
3253             // This line is a comment, or blank.
3254             continue;
3255         }
3256
3257         //
3258         //  Pull out the pattern field, remove it from the test file line.
3259         //
3260         quotedStuffMat.reset(testLine);
3261         if (quotedStuffMat.lookingAt(status)) {
3262             testPattern = quotedStuffMat.group(2, status);
                 // Drop everything through the closing delimiter so that the
                 // next field starts at the front of testLine.
3263             testLine.remove(0, quotedStuffMat.end(0, status));
3264         } else {
3265             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3266             continue;
3267         }
3268
3269
3270         //
3271         //  Pull out the flags from the test file line.
3272         //
3273         flagsMat.reset(testLine);
3274         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3275         testFlags = flagsMat.group(1, status);
             // Anything captured by group 2 is not a recognized flag: report the
             // first offending character and skip the line.
3276         if (flagsMat.group(2, status).length() > 0) {
3277             errln("Bad Match flag at line %d. Scanning %c\n",
3278                 lineNum, flagsMat.group(2, status).charAt(0));
3279             continue;
3280         }
3281         testLine.remove(0, flagsMat.end(0, status));
3282
3283         //
3284         //  Pull out the match string, as a whole.
3285         //    We'll process the <tags> later.
3286         //
3287         quotedStuffMat.reset(testLine);
3288         if (quotedStuffMat.lookingAt(status)) {
3289             matchString = quotedStuffMat.group(2, status);
3290             testLine.remove(0, quotedStuffMat.end(0, status));
3291         } else {
3292             errln("Bad match string at test file line %d", lineNum);
3293             continue;
3294         }
3295
3296         //
3297         //  The only thing left from the input line should be an optional trailing comment.
3298         //
3299         commentMat.reset(testLine);
3300         if (commentMat.lookingAt(status) == FALSE) {
3301             errln("Line %d: unexpected characters at end of test line.", lineNum);
3302             continue;
3303         }
3304
3305         //
3306         //  Run the test
3307         //
3308         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3309     }
3310
3311     delete [] testData;
3312
3313 }
3314
3315
3316
3317 //---------------------------------------------------------------------------
3318 //
3319 //    regex_find(pattern, flags, inputString, srcPath, lineNumber)
3320 //
3321 //         Function to run a single test from the Extended (data driven) tests.
3322 //         See file test/testdata/regextst.txt for a description of the
3323 //         pattern and inputString fields, and the allowed flags.
3324 //         srcPath / lineNumber: the test data file and the line in it holding the test.
3325 //
3326 //---------------------------------------------------------------------------
3327
3328
3329 //  Set a value into a UVector at position specified by a decimal number in
3330 //   a UnicodeString.   This is a utility function needed by the actual test function,
3331 //   which follows.
3332 static void set(UVector &vec, int32_t val, UnicodeString index) {
3333     UErrorCode  status=U_ZERO_ERROR;
3334     int32_t  idx = 0;
3335     for (int32_t i=0; i<index.length(); i++) {
3336         int32_t d=u_charDigitValue(index.charAt(i));
3337         if (d<0) {return;}
3338         idx = idx*10 + d;
3339     }
3340     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3341     vec.setElementAt(val, idx);
3342 }
3343
3344 static void setInt(UVector &vec, int32_t val, int32_t idx) {
         // Same as set(), but the slot is given directly as an integer.  The vector
         // is padded with -1 entries until slot `idx` exists, then `val` is stored.
3345     UErrorCode  ec = U_ZERO_ERROR;
3346     for (const int32_t wanted = idx + 1; vec.size() < wanted; ) { vec.addElement(-1, ec); }
3347     vec.setElementAt(val, idx);
3348 }
3349
3350 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3351 {
3352     UBool couldFind = TRUE;
3353     UTEXT_SETNATIVEINDEX(utext, 0);
3354     int32_t i = 0;
3355     while (i < unistrOffset) {
3356         UChar32 c = UTEXT_NEXT32(utext);
3357         if (c != U_SENTINEL) {
3358             i += U16_LENGTH(c);
3359         } else {
3360             couldFind = FALSE;
3361             break;
3362         }
3363     }
3364     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3365     return couldFind;
3366 }
3367
3368
3369 void RegexTest::regex_find(const UnicodeString &pattern,
3370                            const UnicodeString &flags,
3371                            const UnicodeString &inputString,
3372                            const char *srcPath,
3373                            int32_t line) {
3374     UnicodeString       unEscapedInput;
3375     UnicodeString       deTaggedInput;
3376
3377     int32_t             patternUTF8Length,      inputUTF8Length;
3378     char                *patternChars  = NULL, *inputChars = NULL;
3379     UText               patternText    = UTEXT_INITIALIZER;
3380     UText               inputText      = UTEXT_INITIALIZER;
3381     UConverter          *UTF8Converter = NULL;
3382
3383     UErrorCode          status         = U_ZERO_ERROR;
3384     UParseError         pe;
3385     RegexPattern        *parsePat      = NULL;
3386     RegexMatcher        *parseMatcher  = NULL;
3387     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3388     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3389     UVector             groupStarts(status);
3390     UVector             groupEnds(status);
3391     UVector             groupStartsUTF8(status);
3392     UVector             groupEndsUTF8(status);
3393     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3394     UBool               failed         = FALSE;
3395     int32_t             numFinds;
3396     int32_t             i;
3397     UBool               useMatchesFunc   = FALSE;
3398     UBool               useLookingAtFunc = FALSE;
3399     int32_t             regionStart      = -1;
3400     int32_t             regionEnd        = -1;
3401     int32_t             regionStartUTF8  = -1;
3402     int32_t             regionEndUTF8    = -1;
3403
3404
3405     //
3406     //  Compile the caller's pattern
3407     //
3408     uint32_t bflags = 0;
3409     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3410         bflags |= UREGEX_CASE_INSENSITIVE;
3411     }
3412     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3413         bflags |= UREGEX_COMMENTS;
3414     }
3415     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3416         bflags |= UREGEX_DOTALL;
3417     }
3418     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3419         bflags |= UREGEX_MULTILINE;
3420     }
3421
3422     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3423         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3424     }
3425     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3426         bflags |= UREGEX_UNIX_LINES;
3427     }
3428     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3429         bflags |= UREGEX_LITERAL;
3430     }
3431
3432
3433     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3434     if (status != U_ZERO_ERROR) {
3435         #if UCONFIG_NO_BREAK_ITERATION==1
3436         // 'v' test flag means that the test pattern should not compile if ICU was configured
3437         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3438         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3439             goto cleanupAndReturn;
3440         }
3441         #endif
3442         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3443             // Expected pattern compilation error.
3444             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3445                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3446             }
3447             goto cleanupAndReturn;
3448         } else {
3449             // Unexpected pattern compilation error.
3450             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3451             goto cleanupAndReturn;
3452         }
3453     }
3454
3455     UTF8Converter = ucnv_open("UTF8", &status);
3456     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3457
3458     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3459     status = U_ZERO_ERROR; // buffer overflow
3460     patternChars = new char[patternUTF8Length+1];
3461     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3462     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3463
3464     if (status == U_ZERO_ERROR) {
3465         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3466
3467         if (status != U_ZERO_ERROR) {
3468 #if UCONFIG_NO_BREAK_ITERATION==1
3469             // 'v' test flag means that the test pattern should not compile if ICU was configured
3470             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3471             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3472                 goto cleanupAndReturn;
3473             }
3474 #endif
3475             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3476                 // Expected pattern compilation error.
3477                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3478                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3479                 }
3480                 goto cleanupAndReturn;
3481             } else {
3482                 // Unexpected pattern compilation error.
3483                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3484                 goto cleanupAndReturn;
3485             }
3486         }
3487     }
3488
3489     if (UTF8Pattern == NULL) {
3490         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3491         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3492         status = U_ZERO_ERROR;
3493     }
3494
3495     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3496         callerPattern->dumpPattern();
3497     }
3498
3499     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3500         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3501         goto cleanupAndReturn;
3502     }
3503
3504
3505     //
3506     // Number of times find() should be called on the test string, default to 1
3507     //
3508     numFinds = 1;
3509     for (i=2; i<=9; i++) {
3510         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3511             if (numFinds != 1) {
3512                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3513                 goto cleanupAndReturn;
3514             }
3515             numFinds = i;
3516         }
3517     }
3518
3519     // 'M' flag.  Use matches() instead of find()
3520     if (flags.indexOf((UChar)0x4d) >= 0) {
3521         useMatchesFunc = TRUE;
3522     }
3523     if (flags.indexOf((UChar)0x4c) >= 0) {
3524         useLookingAtFunc = TRUE;
3525     }
3526
3527     //
3528     //  Find the tags in the input data, remove them, and record the group boundary
3529     //    positions.
3530     //
3531     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3532     REGEX_CHECK_STATUS_L(line);
3533
3534     unEscapedInput = inputString.unescape();
3535     parseMatcher = parsePat->matcher(unEscapedInput, status);
3536     REGEX_CHECK_STATUS_L(line);
3537     while(parseMatcher->find()) {
3538         parseMatcher->appendReplacement(deTaggedInput, "", status);
3539         REGEX_CHECK_STATUS;
3540         UnicodeString groupNum = parseMatcher->group(2, status);
3541         if (groupNum == "r") {
3542             // <r> or </r>, a region specification within the string
3543             if (parseMatcher->group(1, status) == "/") {
3544                 regionEnd = deTaggedInput.length();
3545             } else {
3546                 regionStart = deTaggedInput.length();
3547             }
3548         } else {
3549             // <digits> or </digits>, a group match boundary tag.
3550             if (parseMatcher->group(1, status) == "/") {
3551                 set(groupEnds, deTaggedInput.length(), groupNum);
3552             } else {
3553                 set(groupStarts, deTaggedInput.length(), groupNum);
3554             }
3555         }
3556     }
3557     parseMatcher->appendTail(deTaggedInput);
3558     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3559     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3560       errln("mismatched <r> tags");
3561       failed = TRUE;
3562       goto cleanupAndReturn;
3563     }
3564
3565     //
3566     //  Configure the matcher according to the flags specified with this test.
3567     //
3568     matcher = callerPattern->matcher(deTaggedInput, status);
3569     REGEX_CHECK_STATUS_L(line);
3570     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3571         matcher->setTrace(TRUE);
3572     }
3573
3574     if (UTF8Pattern != NULL) {
3575         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3576         status = U_ZERO_ERROR; // buffer overflow
3577         inputChars = new char[inputUTF8Length+1];
3578         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3579         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3580
3581         if (status == U_ZERO_ERROR) {
3582             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3583             REGEX_CHECK_STATUS_L(line);
3584         }
3585
3586         if (UTF8Matcher == NULL) {
3587             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3588             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3589             status = U_ZERO_ERROR;
3590         }
3591     }
3592
3593     //
3594     //  Generate native indices for UTF8 versions of region and capture group info
3595     //
3596     if (UTF8Matcher != NULL) {
3597         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3598             UTF8Matcher->setTrace(TRUE);
3599         }
3600         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3601         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3602
3603         //  Fill out the native index UVector info.
3604         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3605         for (i=0; i<groupStarts.size(); i++) {
3606             int32_t  start = groupStarts.elementAti(i);
3607             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3608             if (start >= 0) {
3609                 int32_t  startUTF8;
3610                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3611                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3612                     failed = TRUE;
3613                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3614                 }
3615                 setInt(groupStartsUTF8, startUTF8, i);
3616             }
3617
3618             int32_t  end = groupEnds.elementAti(i);
3619             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3620             if (end >= 0) {
3621                 int32_t  endUTF8;
3622                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3623                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3624                     failed = TRUE;
3625                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3626                 }
3627                 setInt(groupEndsUTF8, endUTF8, i);
3628             }
3629         }
3630     }
3631
3632     if (regionStart>=0) {
3633        matcher->region(regionStart, regionEnd, status);
3634        REGEX_CHECK_STATUS_L(line);
3635        if (UTF8Matcher != NULL) {
3636            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3637            REGEX_CHECK_STATUS_L(line);
3638        }
3639     }
3640     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3641         matcher->useAnchoringBounds(FALSE);
3642         if (UTF8Matcher != NULL) {
3643             UTF8Matcher->useAnchoringBounds(FALSE);
3644         }
3645     }
3646     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3647         matcher->useTransparentBounds(TRUE);
3648         if (UTF8Matcher != NULL) {
3649             UTF8Matcher->useTransparentBounds(TRUE);
3650         }
3651     }
3652
3653
3654
3655     //
3656     // Do a find on the de-tagged input using the caller's pattern
3657     //     TODO: error on count>1 and not find().
3658     //           error on both matches() and lookingAt().
3659     //
3660     for (i=0; i<numFinds; i++) {
3661         if (useMatchesFunc) {
3662             isMatch = matcher->matches(status);
3663             if (UTF8Matcher != NULL) {
3664                isUTF8Match = UTF8Matcher->matches(status);
3665             }
3666         } else  if (useLookingAtFunc) {
3667             isMatch = matcher->lookingAt(status);
3668             if (UTF8Matcher != NULL) {
3669                 isUTF8Match = UTF8Matcher->lookingAt(status);
3670             }
3671         } else {
3672             isMatch = matcher->find();
3673             if (UTF8Matcher != NULL) {
3674                 isUTF8Match = UTF8Matcher->find();
3675             }
3676         }
3677     }
3678     matcher->setTrace(FALSE);
3679     if (UTF8Matcher) {
3680         UTF8Matcher->setTrace(FALSE);
3681     }
3682     if (U_FAILURE(status)) {
3683         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3684     }
3685
3686     //
3687     // Match up the groups from the find() with the groups from the tags
3688     //
3689
3690     // number of tags should match number of groups from find operation.
3691     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3692     //   G option in test means that capture group data is not available in the
3693     //     expected results, so the check needs to be suppressed.
3694     if (isMatch == FALSE && groupStarts.size() != 0) {
3695         dataerrln("Error at line %d:  Match expected, but none found.", line);
3696         failed = TRUE;
3697         goto cleanupAndReturn;
3698     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3699         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3700         failed = TRUE;
3701         goto cleanupAndReturn;
3702     }
3703     if (isMatch && groupStarts.size() == 0) {
3704         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3705         failed = TRUE;
3706     }
3707     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3708         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3709         failed = TRUE;
3710     }
3711
3712     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3713         // Only check for match / no match.  Don't check capture groups.
3714         goto cleanupAndReturn;
3715     }
3716
3717     REGEX_CHECK_STATUS_L(line);
3718     for (i=0; i<=matcher->groupCount(); i++) {
3719         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3720         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3721         if (matcher->start(i, status) != expectedStart) {
3722             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3723                 line, i, expectedStart, matcher->start(i, status));
3724             failed = TRUE;
3725             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3726         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3727             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3728                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3729             failed = TRUE;
3730             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3731         }
3732
3733         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3734         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3735         if (matcher->end(i, status) != expectedEnd) {
3736             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3737                 line, i, expectedEnd, matcher->end(i, status));
3738             failed = TRUE;
3739             // Error on end position;  keep going; real error is probably yet to come as group
3740             //   end positions work from end of the input data towards the front.
3741         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3742             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3743                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3744             failed = TRUE;
3745             // Error on end position;  keep going; real error is probably yet to come as group
3746             //   end positions work from end of the input data towards the front.
3747         }
3748     }
3749     if ( matcher->groupCount()+1 < groupStarts.size()) {
3750         errln("Error at line %d: Expected %d capture groups, found %d.",
3751             line, groupStarts.size()-1, matcher->groupCount());
3752         failed = TRUE;
3753         }
3754     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3755         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3756               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3757         failed = TRUE;
3758     }
3759
3760     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3761         matcher->requireEnd() == TRUE) {
3762         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3763         failed = TRUE;
3764     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3765         UTF8Matcher->requireEnd() == TRUE) {
3766         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3767         failed = TRUE;
3768     }
3769
3770     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3771         matcher->requireEnd() == FALSE) {
3772         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3773         failed = TRUE;
3774     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3775         UTF8Matcher->requireEnd() == FALSE) {
3776         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3777         failed = TRUE;
3778     }
3779
3780     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3781         matcher->hitEnd() == TRUE) {
3782         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3783         failed = TRUE;
3784     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3785                UTF8Matcher->hitEnd() == TRUE) {
3786         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3787         failed = TRUE;
3788     }
3789
3790     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3791         matcher->hitEnd() == FALSE) {
3792         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3793         failed = TRUE;
3794     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3795                UTF8Matcher->hitEnd() == FALSE) {
3796         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3797         failed = TRUE;
3798     }
3799
3800
3801 cleanupAndReturn:
3802     if (failed) {
3803         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3804             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3805         // callerPattern->dump();
3806     }
3807     delete parseMatcher;
3808     delete parsePat;
3809     delete UTF8Matcher;
3810     delete UTF8Pattern;
3811     delete matcher;
3812     delete callerPattern;
3813
3814     utext_close(&inputText);
3815     delete[] inputChars;
3816     utext_close(&patternText);
3817     delete[] patternChars;
3818     ucnv_close(UTF8Converter);
3819 }
3820
3821
3822
3823
3824 //---------------------------------------------------------------------------
3825 //
3826 //      Errors     Check for error handling in patterns.
3827 //
3828 //---------------------------------------------------------------------------
3829 void RegexTest::Errors() {
3830     // \escape sequences that aren't implemented yet.
3831     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3832
3833     // Missing close parentheses
3834     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3835     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3836     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3837
3838     // Extra close paren
3839     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3840     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3841     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3842
3843     // Look-ahead, Look-behind
3844     //  TODO:  add tests for unbounded length look-behinds.
3845     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3846
3847     // Attempt to use non-default flags
3848     {
3849         UParseError   pe;
3850         UErrorCode    status = U_ZERO_ERROR;
3851         int32_t       flags  = UREGEX_CANON_EQ |
3852                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3853                                UREGEX_MULTILINE;
3854         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3855         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3856         delete pat1;
3857     }
3858
3859
3860     // Quantifiers are allowed only after something that can be quantified.
3861     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3862     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3863     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3864
3865     // Mal-formed {min,max} quantifiers
3866     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3867     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3868     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3869     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3870     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3871     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3872     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3873     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3874     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3875
3876     // Ticket 5389
3877     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3878
3879     // Invalid Back Reference \0
3880     //    For ICU 3.8 and earlier
3881     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3882     //
3883     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3884
3885 }
3886
3887
3888 //-------------------------------------------------------------------------------
3889 //
3890 //  Read a text data file, convert it to UChars, and return the data
3891 //    in one big UChar * buffer, which the caller must delete.
3892 //
3893 //--------------------------------------------------------------------------------
3894 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3895                                      const char *defEncoding, UErrorCode &status) {
3896     UChar       *retPtr  = NULL;
3897     char        *fileBuf = NULL;
3898     UConverter* conv     = NULL;
3899     FILE        *f       = NULL;
3900
3901     ulen = 0;
3902     if (U_FAILURE(status)) {
3903         return retPtr;
3904     }
3905
3906     //
3907     //  Open the file.
3908     //
3909     f = fopen(fileName, "rb");
3910     if (f == 0) {
3911         dataerrln("Error opening test data file %s\n", fileName);
3912         status = U_FILE_ACCESS_ERROR;
3913         return NULL;
3914     }
3915     //
3916     //  Read it in
3917     //
3918     int32_t            fileSize;
3919     int32_t            amt_read;
3920
3921     fseek( f, 0, SEEK_END);
3922     fileSize = ftell(f);
3923     fileBuf = new char[fileSize];
3924     fseek(f, 0, SEEK_SET);
3925     amt_read = fread(fileBuf, 1, fileSize, f);
3926     if (amt_read != fileSize || fileSize <= 0) {
3927         errln("Error reading test data file.");
3928         goto cleanUpAndReturn;
3929     }
3930
3931     //
3932     // Look for a Unicode Signature (BOM) on the data just read
3933     //
3934     int32_t        signatureLength;
3935     const char *   fileBufC;
3936     const char*    encoding;
3937
3938     fileBufC = fileBuf;
3939     encoding = ucnv_detectUnicodeSignature(
3940         fileBuf, fileSize, &signatureLength, &status);
3941     if(encoding!=NULL ){
3942         fileBufC  += signatureLength;
3943         fileSize  -= signatureLength;
3944     } else {
3945         encoding = defEncoding;
3946         if (strcmp(encoding, "utf-8") == 0) {
3947             errln("file %s is missing its BOM", fileName);
3948         }
3949     }
3950
3951     //
3952     // Open a converter to take the rule file to UTF-16
3953     //
3954     conv = ucnv_open(encoding, &status);
3955     if (U_FAILURE(status)) {
3956         goto cleanUpAndReturn;
3957     }
3958
3959     //
3960     // Convert the rules to UChar.
3961     //  Preflight first to determine required buffer size.
3962     //
3963     ulen = ucnv_toUChars(conv,
3964         NULL,           //  dest,
3965         0,              //  destCapacity,
3966         fileBufC,
3967         fileSize,
3968         &status);
3969     if (status == U_BUFFER_OVERFLOW_ERROR) {
3970         // Buffer Overflow is expected from the preflight operation.
3971         status = U_ZERO_ERROR;
3972
3973         retPtr = new UChar[ulen+1];
3974         ucnv_toUChars(conv,
3975             retPtr,       //  dest,
3976             ulen+1,
3977             fileBufC,
3978             fileSize,
3979             &status);
3980     }
3981
3982 cleanUpAndReturn:
3983     fclose(f);
3984     delete[] fileBuf;
3985     ucnv_close(conv);
3986     if (U_FAILURE(status)) {
3987         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3988         delete []retPtr;
3989         retPtr = 0;
3990         ulen   = 0;
3991     };
3992     return retPtr;
3993 }
3994
3995
3996 //-------------------------------------------------------------------------------
3997 //
3998 //   PerlTests  - Run Perl's regular expression tests
3999 //                The input file for this test is re_tests, the standard regular
4000 //                expression test data distributed with the Perl source code.
4001 //
4002 //                Here is Perl's description of the test data file:
4003 //
4004 //        # The tests are in a separate file 't/op/re_tests'.
4005 //        # Each line in that file is a separate test.
4006 //        # There are five columns, separated by tabs.
4007 //        #
4008 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
4009 //        # Modifiers can be put after the closing C<'>.
4010 //        #
4011 //        # Column 2 contains the string to be matched.
4012 //        #
4013 //        # Column 3 contains the expected result:
4014 //        #     y   expect a match
4015 //        #     n   expect no match
4016 //        #     c   expect an error
4017 //        # B   test exposes a known bug in Perl, should be skipped
4018 //        # b   test exposes a known bug in Perl, should be skipped if noamp
4019 //        #
4020 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4021 //        #
4022 //        # Column 4 contains a string, usually C<$&>.
4023 //        #
4024 //        # Column 5 contains the expected result of double-quote
4025 //        # interpolating that string after the match, or start of error message.
4026 //        #
4027 //        # Column 6, if present, contains a reason why the test is skipped.
4028 //        # This is printed with "skipped", for harness to pick up.
4029 //        #
4030 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4031 //        #
4032 //        # If you want to add a regular expression test that can't be expressed
4033 //        # in this format, don't add it here: put it in op/pat.t instead.
4034 //
4035 //        For ICU, if field 3 contains an 'i', the test will be skipped.
4036 //        The test exposes some known incompatibility between ICU and Perl regexps.
4037 //        (The i is in addition to whatever was there before.)
4038 //
4039 //-------------------------------------------------------------------------------
4040 void RegexTest::PerlTests() {
4041     char tdd[2048];
4042     const char *srcPath;
4043     UErrorCode  status = U_ZERO_ERROR;
4044     UParseError pe;
4045
4046     //
4047     //  Open and read the test data file.
4048     //
4049     srcPath=getPath(tdd, "re_tests.txt");
4050     if(srcPath==NULL) {
4051         return; /* something went wrong, error already output */
4052     }
4053
4054     int32_t    len;
4055     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4056     if (U_FAILURE(status)) {
4057         return; /* something went wrong, error already output */
4058     }
4059
4060     //
4061     //  Put the test data into a UnicodeString
4062     //
4063     UnicodeString testDataString(FALSE, testData, len);
4064
4065     //
4066     //  Regex to break the input file into lines, and strip the new lines.
4067     //     One line per match, capture group one is the desired data.
4068     //
4069     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4070     if (U_FAILURE(status)) {
4071         dataerrln("RegexPattern::compile() error");
4072         return;
4073     }
4074     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4075
4076     //
4077     //  Regex to split a test file line into fields.
4078     //    There are six fields, separated by tabs.
4079     //
4080     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4081
4082     //
4083     //  Regex to identify test patterns with flag settings, and to separate them.
4084     //    Test patterns with flags look like 'pattern'i
4085     //    Test patterns without flags are not quoted:   pattern
4086     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4087     //
4088     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4089     RegexMatcher* flagMat = flagPat->matcher(status);
4090
4091     //
4092     // The Perl tests reference several perl-isms, which are evaluated/substituted
4093     //   in the test data.  Not being perl, this must be done explicitly.  Here
4094     //   are string constants and REs for these constructs.
4095     //
4096     UnicodeString nulnulSrc("${nulnul}");
4097     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4098     nulnul = nulnul.unescape();
4099
4100     UnicodeString ffffSrc("${ffff}");
4101     UnicodeString ffff("\\uffff", -1, US_INV);
4102     ffff = ffff.unescape();
4103
4104     //  regexp for $-[0], $+[2], etc.
4105     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4106     RegexMatcher *groupsMat = groupsPat->matcher(status);
4107
4108     //  regexp for $0, $1, $2, etc.
4109     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4110     RegexMatcher *cgMat = cgPat->matcher(status);
4111
4112
4113     //
4114     // Main Loop for the Perl Tests, runs once per line from the
4115     //   test data file.
4116     //
4117     int32_t  lineNum = 0;
4118     int32_t  skippedUnimplementedCount = 0;
4119     while (lineMat->find()) {
4120         lineNum++;
4121
4122         //
4123         //  Get a line, break it into its fields, do the Perl
4124         //    variable substitutions.
4125         //
4126         UnicodeString line = lineMat->group(1, status);
4127         UnicodeString fields[7];
4128         fieldPat->split(line, fields, 7, status);
4129
4130         flagMat->reset(fields[0]);
4131         flagMat->matches(status);
4132         UnicodeString pattern  = flagMat->group(2, status);
4133         pattern.findAndReplace("${bang}", "!");
4134         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4135         pattern.findAndReplace(ffffSrc, ffff);
4136
4137         //
4138         //  Identify patterns that include match flag settings,
4139         //    split off the flags, remove the extra quotes.
4140         //
4141         UnicodeString flagStr = flagMat->group(3, status);
4142         if (U_FAILURE(status)) {
4143             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4144             return;
4145         }
4146         int32_t flags = 0;
4147         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4148         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4149         const UChar UChar_m = 0x6d;
4150         const UChar UChar_x = 0x78;
4151         const UChar UChar_y = 0x79;
4152         if (flagStr.indexOf(UChar_i) != -1) {
4153             flags |= UREGEX_CASE_INSENSITIVE;
4154         }
4155         if (flagStr.indexOf(UChar_m) != -1) {
4156             flags |= UREGEX_MULTILINE;
4157         }
4158         if (flagStr.indexOf(UChar_x) != -1) {
4159             flags |= UREGEX_COMMENTS;
4160         }
4161
4162         //
4163         // Compile the test pattern.
4164         //
4165         status = U_ZERO_ERROR;
4166         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4167         if (status == U_REGEX_UNIMPLEMENTED) {
4168             //
4169             // Test of a feature that is planned for ICU, but not yet implemented.
4170             //   skip the test.
4171             skippedUnimplementedCount++;
4172             delete testPat;
4173             status = U_ZERO_ERROR;
4174             continue;
4175         }
4176
4177         if (U_FAILURE(status)) {
4178             // Some tests are supposed to generate errors.
4179             //   Only report an error for tests that are supposed to succeed.
4180             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4181                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4182             {
4183                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4184             }
4185             status = U_ZERO_ERROR;
4186             delete testPat;
4187             continue;
4188         }
4189
4190         if (fields[2].indexOf(UChar_i) >= 0) {
4191             // ICU should skip this test.
4192             delete testPat;
4193             continue;
4194         }
4195
4196         if (fields[2].indexOf(UChar_c) >= 0) {
4197             // This pattern should have caused a compilation error, but didn't/
4198             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4199             delete testPat;
4200             continue;
4201         }
4202
4203         //
4204         // replace the Perl variables that appear in some of the
4205         //   match data strings.
4206         //
4207         UnicodeString matchString = fields[1];
4208         matchString.findAndReplace(nulnulSrc, nulnul);
4209         matchString.findAndReplace(ffffSrc,   ffff);
4210
4211         // Replace any \n in the match string with an actual new-line char.
4212         //  Don't do full unescape, as this unescapes more than Perl does, which
4213         //  causes other spurious failures in the tests.
4214         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4215
4216
4217
4218         //
4219         // Run the test, check for expected match/don't match result.
4220         //
4221         RegexMatcher *testMat = testPat->matcher(matchString, status);
4222         UBool found = testMat->find();
4223         UBool expected = FALSE;
4224         if (fields[2].indexOf(UChar_y) >=0) {
4225             expected = TRUE;
4226         }
4227         if (expected != found) {
4228             errln("line %d: Expected %smatch, got %smatch",
4229                 lineNum, expected?"":"no ", found?"":"no " );
4230             continue;
4231         }
4232
4233         // Don't try to check expected results if there is no match.
4234         //   (Some have stuff in the expected fields)
4235         if (!found) {
4236             delete testMat;
4237             delete testPat;
4238             continue;
4239         }
4240
4241         //
4242         // Interpret the Perl expression from the fourth field of the data file,
4243         // building up an ICU string from the results of the ICU match.
4244         //   The Perl expression will contain references to the results of
4245         //     a regex match, including the matched string, capture group strings,
4246         //     group starting and ending indicies, etc.
4247         //
4248         UnicodeString resultString;
4249         UnicodeString perlExpr = fields[3];
4250 #if SUPPORT_MUTATING_INPUT_STRING
4251         groupsMat->reset(perlExpr);
4252         cgMat->reset(perlExpr);
4253 #endif
4254
4255         while (perlExpr.length() > 0) {
4256 #if !SUPPORT_MUTATING_INPUT_STRING
4257             //  Perferred usage.  Reset after any modification to input string.
4258             groupsMat->reset(perlExpr);
4259             cgMat->reset(perlExpr);
4260 #endif
4261
4262             if (perlExpr.startsWith("$&")) {
4263                 resultString.append(testMat->group(status));
4264                 perlExpr.remove(0, 2);
4265             }
4266
4267             else if (groupsMat->lookingAt(status)) {
4268                 // $-[0]   $+[2]  etc.
4269                 UnicodeString digitString = groupsMat->group(2, status);
4270                 int32_t t = 0;
4271                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4272                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4273                 int32_t matchPosition;
4274                 if (plusOrMinus.compare("+") == 0) {
4275                     matchPosition = testMat->end(groupNum, status);
4276                 } else {
4277                     matchPosition = testMat->start(groupNum, status);
4278                 }
4279                 if (matchPosition != -1) {
4280                     ICU_Utility::appendNumber(resultString, matchPosition);
4281                 }
4282                 perlExpr.remove(0, groupsMat->end(status));
4283             }
4284
4285             else if (cgMat->lookingAt(status)) {
4286                 // $1, $2, $3, etc.
4287                 UnicodeString digitString = cgMat->group(1, status);
4288                 int32_t t = 0;
4289                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4290                 if (U_SUCCESS(status)) {
4291                     resultString.append(testMat->group(groupNum, status));
4292                     status = U_ZERO_ERROR;
4293                 }
4294                 perlExpr.remove(0, cgMat->end(status));
4295             }
4296
4297             else if (perlExpr.startsWith("@-")) {
4298                 int32_t i;
4299                 for (i=0; i<=testMat->groupCount(); i++) {
4300                     if (i>0) {
4301                         resultString.append(" ");
4302                     }
4303                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4304                 }
4305                 perlExpr.remove(0, 2);
4306             }
4307
4308             else if (perlExpr.startsWith("@+")) {
4309                 int32_t i;
4310                 for (i=0; i<=testMat->groupCount(); i++) {
4311                     if (i>0) {
4312                         resultString.append(" ");
4313                     }
4314                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4315                 }
4316                 perlExpr.remove(0, 2);
4317             }
4318
4319             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4320                                                      //           or as an escaped sequence (e.g. \n)
4321                 if (perlExpr.length() > 1) {
4322                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4323                 }
4324                 UChar c = perlExpr.charAt(0);
4325                 switch (c) {
4326                 case 'n':   c = '\n'; break;
4327                 // add any other escape sequences that show up in the test expected results.
4328                 }
4329                 resultString.append(c);
4330                 perlExpr.remove(0, 1);
4331             }
4332
4333             else  {
4334                 // Any characters from the perl expression that we don't explicitly
4335                 //  recognize before here are assumed to be literals and copied
4336                 //  as-is to the expected results.
4337                 resultString.append(perlExpr.charAt(0));
4338                 perlExpr.remove(0, 1);
4339             }
4340
4341             if (U_FAILURE(status)) {
4342                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4343                 break;
4344             }
4345         }
4346
4347         //
4348         // Expected Results Compare
4349         //
4350         UnicodeString expectedS(fields[4]);
4351         expectedS.findAndReplace(nulnulSrc, nulnul);
4352         expectedS.findAndReplace(ffffSrc,   ffff);
4353         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4354
4355
4356         if (expectedS.compare(resultString) != 0) {
4357             err("Line %d: Incorrect perl expression results.", lineNum);
4358             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4359         }
4360
4361         delete testMat;
4362         delete testPat;
4363     }
4364
4365     //
4366     // All done.  Clean up allocated stuff.
4367     //
4368     delete cgMat;
4369     delete cgPat;
4370
4371     delete groupsMat;
4372     delete groupsPat;
4373
4374     delete flagMat;
4375     delete flagPat;
4376
4377     delete lineMat;
4378     delete linePat;
4379
4380     delete fieldPat;
4381     delete [] testData;
4382
4383
4384     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4385
4386 }
4387
4388
4389 //-------------------------------------------------------------------------------
4390 //
4391 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4392 //                  (instead of using UnicodeStrings) to test the alternate engine.
4393 //                  The input file for this test is re_tests, the standard regular
4394 //                  expression test data distributed with the Perl source code.
4395 //                  See PerlTests() for more information.
4396 //
4397 //-------------------------------------------------------------------------------
4398 void RegexTest::PerlTestsUTF8() {
4399     char tdd[2048];
4400     const char *srcPath;
4401     UErrorCode  status = U_ZERO_ERROR;
4402     UParseError pe;
4403     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4404     UText       patternText = UTEXT_INITIALIZER;
4405     char       *patternChars = NULL;
4406     int32_t     patternLength;
4407     int32_t     patternCapacity = 0;
4408     UText       inputText = UTEXT_INITIALIZER;
4409     char       *inputChars = NULL;
4410     int32_t     inputLength;
4411     int32_t     inputCapacity = 0;
4412
4413     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4414
4415     //
4416     //  Open and read the test data file.
4417     //
4418     srcPath=getPath(tdd, "re_tests.txt");
4419     if(srcPath==NULL) {
4420         return; /* something went wrong, error already output */
4421     }
4422
4423     int32_t    len;
4424     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4425     if (U_FAILURE(status)) {
4426         return; /* something went wrong, error already output */
4427     }
4428
4429     //
4430     //  Put the test data into a UnicodeString
4431     //
4432     UnicodeString testDataString(FALSE, testData, len);
4433
4434     //
4435     //  Regex to break the input file into lines, and strip the new lines.
4436     //     One line per match, capture group one is the desired data.
4437     //
4438     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4439     if (U_FAILURE(status)) {
4440         dataerrln("RegexPattern::compile() error");
4441         return;
4442     }
4443     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4444
4445     //
4446     //  Regex to split a test file line into fields.
4447     //    There are six fields, separated by tabs.
4448     //
4449     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4450
4451     //
4452     //  Regex to identify test patterns with flag settings, and to separate them.
4453     //    Test patterns with flags look like 'pattern'i
4454     //    Test patterns without flags are not quoted:   pattern
4455     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4456     //
4457     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4458     RegexMatcher* flagMat = flagPat->matcher(status);
4459
4460     //
4461     // The Perl tests reference several perl-isms, which are evaluated/substituted
4462     //   in the test data.  Not being perl, this must be done explicitly.  Here
4463     //   are string constants and REs for these constructs.
4464     //
4465     UnicodeString nulnulSrc("${nulnul}");
4466     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4467     nulnul = nulnul.unescape();
4468
4469     UnicodeString ffffSrc("${ffff}");
4470     UnicodeString ffff("\\uffff", -1, US_INV);
4471     ffff = ffff.unescape();
4472
4473     //  regexp for $-[0], $+[2], etc.
4474     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4475     RegexMatcher *groupsMat = groupsPat->matcher(status);
4476
4477     //  regexp for $0, $1, $2, etc.
4478     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4479     RegexMatcher *cgMat = cgPat->matcher(status);
4480
4481
4482     //
4483     // Main Loop for the Perl Tests, runs once per line from the
4484     //   test data file.
4485     //
4486     int32_t  lineNum = 0;
4487     int32_t  skippedUnimplementedCount = 0;
4488     while (lineMat->find()) {
4489         lineNum++;
4490
4491         //
4492         //  Get a line, break it into its fields, do the Perl
4493         //    variable substitutions.
4494         //
4495         UnicodeString line = lineMat->group(1, status);
4496         UnicodeString fields[7];
4497         fieldPat->split(line, fields, 7, status);
4498
4499         flagMat->reset(fields[0]);
4500         flagMat->matches(status);
4501         UnicodeString pattern  = flagMat->group(2, status);
4502         pattern.findAndReplace("${bang}", "!");
4503         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4504         pattern.findAndReplace(ffffSrc, ffff);
4505
4506         //
4507         //  Identify patterns that include match flag settings,
4508         //    split off the flags, remove the extra quotes.
4509         //
4510         UnicodeString flagStr = flagMat->group(3, status);
4511         if (U_FAILURE(status)) {
4512             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4513             return;
4514         }
4515         int32_t flags = 0;
4516         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4517         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4518         const UChar UChar_m = 0x6d;
4519         const UChar UChar_x = 0x78;
4520         const UChar UChar_y = 0x79;
4521         if (flagStr.indexOf(UChar_i) != -1) {
4522             flags |= UREGEX_CASE_INSENSITIVE;
4523         }
4524         if (flagStr.indexOf(UChar_m) != -1) {
4525             flags |= UREGEX_MULTILINE;
4526         }
4527         if (flagStr.indexOf(UChar_x) != -1) {
4528             flags |= UREGEX_COMMENTS;
4529         }
4530
4531         //
4532         // Put the pattern in a UTF-8 UText
4533         //
4534         status = U_ZERO_ERROR;
4535         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4536         if (status == U_BUFFER_OVERFLOW_ERROR) {
4537             status = U_ZERO_ERROR;
4538             delete[] patternChars;
4539             patternCapacity = patternLength + 1;
4540             patternChars = new char[patternCapacity];
4541             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4542         }
4543         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4544
4545         //
4546         // Compile the test pattern.
4547         //
4548         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4549         if (status == U_REGEX_UNIMPLEMENTED) {
4550             //
4551             // Test of a feature that is planned for ICU, but not yet implemented.
4552             //   skip the test.
4553             skippedUnimplementedCount++;
4554             delete testPat;
4555             status = U_ZERO_ERROR;
4556             continue;
4557         }
4558
4559         if (U_FAILURE(status)) {
4560             // Some tests are supposed to generate errors.
4561             //   Only report an error for tests that are supposed to succeed.
4562             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4563                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4564             {
4565                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4566             }
4567             status = U_ZERO_ERROR;
4568             delete testPat;
4569             continue;
4570         }
4571
4572         if (fields[2].indexOf(UChar_i) >= 0) {
4573             // ICU should skip this test.
4574             delete testPat;
4575             continue;
4576         }
4577
4578         if (fields[2].indexOf(UChar_c) >= 0) {
4579             // This pattern should have caused a compilation error, but didn't/
4580             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4581             delete testPat;
4582             continue;
4583         }
4584
4585
4586         //
4587         // replace the Perl variables that appear in some of the
4588         //   match data strings.
4589         //
4590         UnicodeString matchString = fields[1];
4591         matchString.findAndReplace(nulnulSrc, nulnul);
4592         matchString.findAndReplace(ffffSrc,   ffff);
4593
4594         // Replace any \n in the match string with an actual new-line char.
4595         //  Don't do full unescape, as this unescapes more than Perl does, which
4596         //  causes other spurious failures in the tests.
4597         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4598
4599         //
4600         // Put the input in a UTF-8 UText
4601         //
4602         status = U_ZERO_ERROR;
4603         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4604         if (status == U_BUFFER_OVERFLOW_ERROR) {
4605             status = U_ZERO_ERROR;
4606             delete[] inputChars;
4607             inputCapacity = inputLength + 1;
4608             inputChars = new char[inputCapacity];
4609             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4610         }
4611         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4612
4613         //
4614         // Run the test, check for expected match/don't match result.
4615         //
4616         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4617         UBool found = testMat->find();
4618         UBool expected = FALSE;
4619         if (fields[2].indexOf(UChar_y) >=0) {
4620             expected = TRUE;
4621         }
4622         if (expected != found) {
4623             errln("line %d: Expected %smatch, got %smatch",
4624                 lineNum, expected?"":"no ", found?"":"no " );
4625             continue;
4626         }
4627
4628         // Don't try to check expected results if there is no match.
4629         //   (Some have stuff in the expected fields)
4630         if (!found) {
4631             delete testMat;
4632             delete testPat;
4633             continue;
4634         }
4635
4636         //
4637         // Interpret the Perl expression from the fourth field of the data file,
4638         // building up an ICU string from the results of the ICU match.
4639         //   The Perl expression will contain references to the results of
4640         //     a regex match, including the matched string, capture group strings,
4641         //     group starting and ending indicies, etc.
4642         //
4643         UnicodeString resultString;
4644         UnicodeString perlExpr = fields[3];
4645
4646         while (perlExpr.length() > 0) {
4647             groupsMat->reset(perlExpr);
4648             cgMat->reset(perlExpr);
4649
4650             if (perlExpr.startsWith("$&")) {
4651                 resultString.append(testMat->group(status));
4652                 perlExpr.remove(0, 2);
4653             }
4654
4655             else if (groupsMat->lookingAt(status)) {
4656                 // $-[0]   $+[2]  etc.
4657                 UnicodeString digitString = groupsMat->group(2, status);
4658                 int32_t t = 0;
4659                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4660                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4661                 int32_t matchPosition;
4662                 if (plusOrMinus.compare("+") == 0) {
4663                     matchPosition = testMat->end(groupNum, status);
4664                 } else {
4665                     matchPosition = testMat->start(groupNum, status);
4666                 }
4667                 if (matchPosition != -1) {
4668                     ICU_Utility::appendNumber(resultString, matchPosition);
4669                 }
4670                 perlExpr.remove(0, groupsMat->end(status));
4671             }
4672
4673             else if (cgMat->lookingAt(status)) {
4674                 // $1, $2, $3, etc.
4675                 UnicodeString digitString = cgMat->group(1, status);
4676                 int32_t t = 0;
4677                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4678                 if (U_SUCCESS(status)) {
4679                     resultString.append(testMat->group(groupNum, status));
4680                     status = U_ZERO_ERROR;
4681                 }
4682                 perlExpr.remove(0, cgMat->end(status));
4683             }
4684
4685             else if (perlExpr.startsWith("@-")) {
4686                 int32_t i;
4687                 for (i=0; i<=testMat->groupCount(); i++) {
4688                     if (i>0) {
4689                         resultString.append(" ");
4690                     }
4691                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4692                 }
4693                 perlExpr.remove(0, 2);
4694             }
4695
4696             else if (perlExpr.startsWith("@+")) {
4697                 int32_t i;
4698                 for (i=0; i<=testMat->groupCount(); i++) {
4699                     if (i>0) {
4700                         resultString.append(" ");
4701                     }
4702                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4703                 }
4704                 perlExpr.remove(0, 2);
4705             }
4706
4707             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4708                                                      //           or as an escaped sequence (e.g. \n)
4709                 if (perlExpr.length() > 1) {
4710                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4711                 }
4712                 UChar c = perlExpr.charAt(0);
4713                 switch (c) {
4714                 case 'n':   c = '\n'; break;
4715                 // add any other escape sequences that show up in the test expected results.
4716                 }
4717                 resultString.append(c);
4718                 perlExpr.remove(0, 1);
4719             }
4720
4721             else  {
4722                 // Any characters from the perl expression that we don't explicitly
4723                 //  recognize before here are assumed to be literals and copied
4724                 //  as-is to the expected results.
4725                 resultString.append(perlExpr.charAt(0));
4726                 perlExpr.remove(0, 1);
4727             }
4728
4729             if (U_FAILURE(status)) {
4730                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4731                 break;
4732             }
4733         }
4734
4735         //
4736         // Expected Results Compare
4737         //
4738         UnicodeString expectedS(fields[4]);
4739         expectedS.findAndReplace(nulnulSrc, nulnul);
4740         expectedS.findAndReplace(ffffSrc,   ffff);
4741         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4742
4743
4744         if (expectedS.compare(resultString) != 0) {
4745             err("Line %d: Incorrect perl expression results.", lineNum);
4746             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4747         }
4748
4749         delete testMat;
4750         delete testPat;
4751     }
4752
4753     //
4754     // All done.  Clean up allocated stuff.
4755     //
4756     delete cgMat;
4757     delete cgPat;
4758
4759     delete groupsMat;
4760     delete groupsPat;
4761
4762     delete flagMat;
4763     delete flagPat;
4764
4765     delete lineMat;
4766     delete linePat;
4767
4768     delete fieldPat;
4769     delete [] testData;
4770
4771     utext_close(&patternText);
4772     utext_close(&inputText);
4773
4774     delete [] patternChars;
4775     delete [] inputChars;
4776
4777
4778     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4779
4780 }
4781
4782
4783 //--------------------------------------------------------------
4784 //
4785 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4786 //             Use this pattern,
4787 //                 "(a?){1,8000000}"
4788 //             Note: the upper bound was originally unbounded, but that case now has loop-breaking enabled.
4789 //                   This test is likely to be fragile, as further optimizations stop
4790 //                   more cases of pointless looping in the match engine.
4791 //
4792 //---------------------------------------------------------------
4793 void RegexTest::Bug6149() {
4794     UnicodeString pattern("(a?){1,8000000}");
4795     UnicodeString s("xyz");
4796     uint32_t flags = 0;
4797     UErrorCode status = U_ZERO_ERROR;
4798
4799     RegexMatcher  matcher(pattern, s, flags, status);
4800     UBool result = false;
4801     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4802     REGEX_ASSERT(result == FALSE);
4803  }
4804
4805
4806 //
4807 //   Callbacks()    Test the callback function.
4808 //                  When set, callbacks occur periodically during matching operations,
4809 //                  giving the application code the ability to abort the operation
4810 //                  before its normal completion.
4811 //
4812
4813 struct callBackContext {
4814     RegexTest        *test;
4815     int32_t          maxCalls;
4816     int32_t          numCalls;
4817     int32_t          lastSteps;
4818     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4819 };
4820
4821 U_CDECL_BEGIN
4822 static UBool U_CALLCONV
4823 testCallBackFn(const void *context, int32_t steps) {
4824     callBackContext  *info = (callBackContext *)context;
4825     if (info->lastSteps+1 != steps) {
4826         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4827     }
4828     info->lastSteps = steps;
4829     info->numCalls++;
4830     return (info->numCalls < info->maxCalls);
4831 }
4832 U_CDECL_END
4833
4834 void RegexTest::Callbacks() {
4835    {
4836         // Getter returns NULLs if no callback has been set
4837
4838         //   The variables that the getter will fill in.
4839         //   Init to non-null values so that the action of the getter can be seen.
4840         const void          *returnedContext = &returnedContext;
4841         URegexMatchCallback *returnedFn = &testCallBackFn;
4842
4843         UErrorCode status = U_ZERO_ERROR;
4844         RegexMatcher matcher("x", 0, status);
4845         REGEX_CHECK_STATUS;
4846         matcher.getMatchCallback(returnedFn, returnedContext, status);
4847         REGEX_CHECK_STATUS;
4848         REGEX_ASSERT(returnedFn == NULL);
4849         REGEX_ASSERT(returnedContext == NULL);
4850     }
4851
4852    {
4853         // Set and Get work
4854         callBackContext cbInfo = {this, 0, 0, 0};
4855         const void          *returnedContext;
4856         URegexMatchCallback *returnedFn;
4857         UErrorCode status = U_ZERO_ERROR;
4858         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4859         REGEX_CHECK_STATUS;
4860         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4861         REGEX_CHECK_STATUS;
4862         matcher.getMatchCallback(returnedFn, returnedContext, status);
4863         REGEX_CHECK_STATUS;
4864         REGEX_ASSERT(returnedFn == testCallBackFn);
4865         REGEX_ASSERT(returnedContext == &cbInfo);
4866
4867         // A short-running match shouldn't invoke the callback
4868         status = U_ZERO_ERROR;
4869         cbInfo.reset(1);
4870         UnicodeString s = "xxx";
4871         matcher.reset(s);
4872         REGEX_ASSERT(matcher.matches(status));
4873         REGEX_CHECK_STATUS;
4874         REGEX_ASSERT(cbInfo.numCalls == 0);
4875
4876         // A medium-length match that runs long enough to invoke the
4877         //   callback, but not so long that the callback aborts it.
4878         status = U_ZERO_ERROR;
4879         cbInfo.reset(4);
4880         s = "aaaaaaaaaaaaaaaaaaab";
4881         matcher.reset(s);
4882         REGEX_ASSERT(matcher.matches(status)==FALSE);
4883         REGEX_CHECK_STATUS;
4884         REGEX_ASSERT(cbInfo.numCalls > 0);
4885
4886         // A longer running match that the callback function will abort.
4887         status = U_ZERO_ERROR;
4888         cbInfo.reset(4);
4889         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4890         matcher.reset(s);
4891         REGEX_ASSERT(matcher.matches(status)==FALSE);
4892         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4893         REGEX_ASSERT(cbInfo.numCalls == 4);
4894
4895         // A longer running find that the callback function will abort.
4896         status = U_ZERO_ERROR;
4897         cbInfo.reset(4);
4898         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4899         matcher.reset(s);
4900         REGEX_ASSERT(matcher.find(status)==FALSE);
4901         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4902         REGEX_ASSERT(cbInfo.numCalls == 4);
4903     }
4904
4905
4906 }
4907
4908
4909 //
4910 //   FindProgressCallbacks()    Test the find "progress" callback function.
4911 //                  When set, the find progress callback will be invoked during a find operation
4912 //                  after each return from a match attempt, giving the application the opportunity
4913 //                  to terminate a long-running find operation before its normal completion.
4914 //
4915
4916 struct progressCallBackContext {
4917     RegexTest        *test;
4918     int64_t          lastIndex;
4919     int32_t          maxCalls;
4920     int32_t          numCalls;
4921     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4922 };
4923
4924 // call-back function for find().
4925 // Return TRUE to continue the find().
4926 // Return FALSE to stop the find().
4927 U_CDECL_BEGIN
4928 static UBool U_CALLCONV
4929 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4930     progressCallBackContext  *info = (progressCallBackContext *)context;
4931     info->numCalls++;
4932     info->lastIndex = matchIndex;
4933 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4934     return (info->numCalls < info->maxCalls);
4935 }
4936 U_CDECL_END
4937
4938 void RegexTest::FindProgressCallbacks() {
4939    {
4940         // Getter returns NULLs if no callback has been set
4941
4942         //   The variables that the getter will fill in.
4943         //   Init to non-null values so that the action of the getter can be seen.
4944         const void                  *returnedContext = &returnedContext;
4945         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4946
4947         UErrorCode status = U_ZERO_ERROR;
4948         RegexMatcher matcher("x", 0, status);
4949         REGEX_CHECK_STATUS;
4950         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4951         REGEX_CHECK_STATUS;
4952         REGEX_ASSERT(returnedFn == NULL);
4953         REGEX_ASSERT(returnedContext == NULL);
4954     }
4955
4956    {
4957         // Set and Get work
4958         progressCallBackContext cbInfo = {this, 0, 0, 0};
4959         const void                  *returnedContext;
4960         URegexFindProgressCallback  *returnedFn;
4961         UErrorCode status = U_ZERO_ERROR;
4962         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4963         REGEX_CHECK_STATUS;
4964         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4965         REGEX_CHECK_STATUS;
4966         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4967         REGEX_CHECK_STATUS;
4968         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4969         REGEX_ASSERT(returnedContext == &cbInfo);
4970
4971         // A find that matches on the initial position does NOT invoke the callback.
4972         status = U_ZERO_ERROR;
4973         cbInfo.reset(100);
4974         UnicodeString s = "aaxxx";
4975         matcher.reset(s);
4976 #if 0
4977         matcher.setTrace(TRUE);
4978 #endif
4979         REGEX_ASSERT(matcher.find(0, status));
4980         REGEX_CHECK_STATUS;
4981         REGEX_ASSERT(cbInfo.numCalls == 0);
4982
4983         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4984         //   but not so many times that we interrupt the operation.
4985         status = U_ZERO_ERROR;
4986         s = "aaaaaaaaaaaaaaaaaaab";
4987         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4988         matcher.reset(s);
4989         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4990         REGEX_CHECK_STATUS;
4991         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4992
4993         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4994         status = U_ZERO_ERROR;
4995         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4996         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4997         matcher.reset(s1);
4998         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4999         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5000         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
5001
5002         // Now a match that will succeed, but after an interruption
5003         status = U_ZERO_ERROR;
5004         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
5005         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
5006         matcher.reset(s2);
5007         REGEX_ASSERT(matcher.find(0, status)==FALSE);
5008         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5009         // Now retry the match from where left off
5010         cbInfo.maxCalls = 100; //  No callback limit
5011         status = U_ZERO_ERROR;
5012         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5013         REGEX_CHECK_STATUS;
5014     }
5015
5016
5017 }
5018
5019
5020 //---------------------------------------------------------------------------
5021 //
5022 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5023 //                             UTexts. The pure-C implementation of UText
5024 //                             has no mutable backing stores, but we can
5025 //                             use UnicodeString here to test the functionality.
5026 //
5027 //---------------------------------------------------------------------------
5028 void RegexTest::PreAllocatedUTextCAPI () {
5029     UErrorCode           status = U_ZERO_ERROR;
5030     URegularExpression  *re;
5031     UText                patternText = UTEXT_INITIALIZER;
5032     UnicodeString        buffer;
5033     UText                bufferText = UTEXT_INITIALIZER;
5034
5035     utext_openUnicodeString(&bufferText, &buffer, &status);
5036
5037     /*
5038      *  getText() and getUText()
5039      */
5040     {
5041         UText  text1 = UTEXT_INITIALIZER;
5042         UText  text2 = UTEXT_INITIALIZER;
5043         UChar  text2Chars[20];
5044         UText  *resultText;
5045
5046         status = U_ZERO_ERROR;
5047         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5048         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5049         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5050         utext_openUChars(&text2, text2Chars, -1, &status);
5051
5052         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5053         re = uregex_openUText(&patternText, 0, NULL, &status);
5054
5055         /* First set a UText */
5056         uregex_setUText(re, &text1, &status);
5057         resultText = uregex_getUText(re, &bufferText, &status);
5058         REGEX_CHECK_STATUS;
5059         REGEX_ASSERT(resultText == &bufferText);
5060         utext_setNativeIndex(resultText, 0);
5061         utext_setNativeIndex(&text1, 0);
5062         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5063
5064         resultText = uregex_getUText(re, &bufferText, &status);
5065         REGEX_CHECK_STATUS;
5066         REGEX_ASSERT(resultText == &bufferText);
5067         utext_setNativeIndex(resultText, 0);
5068         utext_setNativeIndex(&text1, 0);
5069         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5070
5071         /* Then set a UChar * */
5072         uregex_setText(re, text2Chars, 7, &status);
5073         resultText = uregex_getUText(re, &bufferText, &status);
5074         REGEX_CHECK_STATUS;
5075         REGEX_ASSERT(resultText == &bufferText);
5076         utext_setNativeIndex(resultText, 0);
5077         utext_setNativeIndex(&text2, 0);
5078         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5079
5080         uregex_close(re);
5081         utext_close(&text1);
5082         utext_close(&text2);
5083     }
5084
5085     /*
5086      *  group()
5087      */
5088     {
5089         UChar    text1[80];
5090         UText   *actual;
5091         UBool    result;
5092         int64_t  length = 0;
5093
5094         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5095         //                  012345678901234567890123456789012345678901234567
5096         //                  0         1         2         3         4
5097
5098         status = U_ZERO_ERROR;
5099         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5100         REGEX_CHECK_STATUS;
5101
5102         uregex_setText(re, text1, -1, &status);
5103         result = uregex_find(re, 0, &status);
5104         REGEX_ASSERT(result==TRUE);
5105
5106         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5107         status = U_ZERO_ERROR;
5108         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5109         REGEX_CHECK_STATUS;
5110         REGEX_ASSERT(actual == &bufferText);
5111         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5112         REGEX_ASSERT(length == 16);
5113         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5114
5115         /*  Capture group #1.  Should succeed, matching " interior ". */
5116         status = U_ZERO_ERROR;
5117         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5118         REGEX_CHECK_STATUS;
5119         REGEX_ASSERT(actual == &bufferText);
5120         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5121         REGEX_ASSERT(length == 10);
5122         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5123
5124         /*  Capture group out of range.  Error. */
5125         status = U_ZERO_ERROR;
5126         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5127         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5128         REGEX_ASSERT(actual == &bufferText);
5129         uregex_close(re);
5130
5131     }
5132
5133     /*
5134      *  replaceFirst()
5135      */
5136     {
5137         UChar    text1[80];
5138         UChar    text2[80];
5139         UText    replText = UTEXT_INITIALIZER;
5140         UText   *result;
5141         status = U_ZERO_ERROR;
5142         utext_openUnicodeString(&bufferText, &buffer, &status);
5143
5144         status = U_ZERO_ERROR;
5145         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5146         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5147         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5148
5149         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5150         REGEX_CHECK_STATUS;
5151
5152         /*  Normal case, with match */
5153         uregex_setText(re, text1, -1, &status);
5154         REGEX_CHECK_STATUS;
5155         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5156         REGEX_CHECK_STATUS;
5157         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5158         REGEX_CHECK_STATUS;
5159         REGEX_ASSERT(result == &bufferText);
5160         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5161
5162         /* No match.  Text should copy to output with no changes.  */
5163         uregex_setText(re, text2, -1, &status);
5164         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5165         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5166         REGEX_CHECK_STATUS;
5167         REGEX_ASSERT(result == &bufferText);
5168         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5169
5170         /* Unicode escapes */
5171         uregex_setText(re, text1, -1, &status);
5172         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5173         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5174         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5175         REGEX_CHECK_STATUS;
5176         REGEX_ASSERT(result == &bufferText);
5177         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5178
5179         uregex_close(re);
5180         utext_close(&replText);
5181     }
5182
5183
5184     /*
5185      *  replaceAll()
5186      */
5187     {
5188         UChar    text1[80];
5189         UChar    text2[80];
5190         UText    replText = UTEXT_INITIALIZER;
5191         UText   *result;
5192
5193         status = U_ZERO_ERROR;
5194         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5195         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5196         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5197
5198         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5199         REGEX_CHECK_STATUS;
5200
5201         /*  Normal case, with match */
5202         uregex_setText(re, text1, -1, &status);
5203         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5204         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5205         REGEX_CHECK_STATUS;
5206         REGEX_ASSERT(result == &bufferText);
5207         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5208
5209         /* No match.  Text should copy to output with no changes.  */
5210         uregex_setText(re, text2, -1, &status);
5211         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5212         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5213         REGEX_CHECK_STATUS;
5214         REGEX_ASSERT(result == &bufferText);
5215         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5216
5217         uregex_close(re);
5218         utext_close(&replText);
5219     }
5220
5221
5222     /*
5223      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5224      *   so we don't need to test it here.
5225      */
5226
5227     utext_close(&bufferText);
5228     utext_close(&patternText);
5229 }
5230
5231
5232 //--------------------------------------------------------------
5233 //
5234 //  NamedCapture   Check basic named capture group functionality
5235 //
5236 //--------------------------------------------------------------
5237 void RegexTest::NamedCapture() {
5238     UErrorCode status = U_ZERO_ERROR;
5239     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5240             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5241     REGEX_CHECK_STATUS;
5242     int32_t group = pat->groupNumberFromName("five", -1, status);
5243     REGEX_CHECK_STATUS;
5244     REGEX_ASSERT(5 == group);
5245     group = pat->groupNumberFromName("three", -1, status);
5246     REGEX_CHECK_STATUS;
5247     REGEX_ASSERT(3 == group);
5248
5249     status = U_ZERO_ERROR;
5250     group = pat->groupNumberFromName(UnicodeString("six"), status);
5251     REGEX_CHECK_STATUS;
5252     REGEX_ASSERT(6 == group);
5253
5254     status = U_ZERO_ERROR;
5255     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5256     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5257
5258     status = U_ZERO_ERROR;
5259
5260     // After copying a pattern, named capture should still work in the copy.
5261     RegexPattern *copiedPat = new RegexPattern(*pat);
5262     REGEX_ASSERT(*copiedPat == *pat);
5263     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5264
5265     group = copiedPat->groupNumberFromName("five", -1, status);
5266     REGEX_CHECK_STATUS;
5267     REGEX_ASSERT(5 == group);
5268     group = copiedPat->groupNumberFromName("three", -1, status);
5269     REGEX_CHECK_STATUS;
5270     REGEX_ASSERT(3 == group);
5271     delete copiedPat;
5272
5273     // ReplaceAll with named capture group.
5274     status = U_ZERO_ERROR;
5275     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5276     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5277     REGEX_CHECK_STATUS;
5278     // m.pattern().dumpPattern();
5279     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5280     REGEX_CHECK_STATUS;
5281     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5282     delete m;
5283
5284     // ReplaceAll, allowed capture group numbers.
5285     text = UnicodeString("abcmxyz");
5286     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5287     REGEX_CHECK_STATUS;
5288
5289     status = U_ZERO_ERROR;
5290     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5291     REGEX_CHECK_STATUS;
5292     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5293
5294     status = U_ZERO_ERROR;
5295     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5296     REGEX_CHECK_STATUS;
5297     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5298
5299     status = U_ZERO_ERROR;
5300     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5301     REGEX_CHECK_STATUS;
5302     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5303
5304     status = U_ZERO_ERROR;
5305     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5306     REGEX_CHECK_STATUS;
5307     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5308
5309     status = U_ZERO_ERROR;
5310     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5311     REGEX_CHECK_STATUS;
5312     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5313
5314     status = U_ZERO_ERROR;
5315     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5316     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5317
5318     status = U_ZERO_ERROR;
5319     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5320     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5321     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5322
5323     status = U_ZERO_ERROR;
5324     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5325     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5326     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5327
5328     status = U_ZERO_ERROR;
5329     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5330     REGEX_CHECK_STATUS;
5331     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5332
5333     status = U_ZERO_ERROR;
5334     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5335     REGEX_CHECK_STATUS;
5336     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5337
5338     status = U_ZERO_ERROR;
5339     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5340     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5341
5342     status = U_ZERO_ERROR;
5343     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5344     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5345
5346     status = U_ZERO_ERROR;
5347     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5348     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5349
5350     status = U_ZERO_ERROR;
5351     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5352     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5353
5354     delete m;
5355
5356     // Repeat the above replaceAll() tests using the plain C API, which
5357     //  has a separate implementation internally.
5358     //  TODO: factor out the test data.
5359
5360     status = U_ZERO_ERROR;
5361     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5362     REGEX_CHECK_STATUS;
5363     text = UnicodeString("abcmxyz");
5364     uregex_setText(re, text.getBuffer(), text.length(), &status);
5365     REGEX_CHECK_STATUS;
5366
5367     UChar resultBuf[100];
5368     int32_t resultLength;
5369     UnicodeString repl;
5370
5371     status = U_ZERO_ERROR;
5372     repl = UnicodeString("<$0>");
5373     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374     REGEX_CHECK_STATUS;
5375     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5376
5377     status = U_ZERO_ERROR;
5378     repl = UnicodeString("<$1>");
5379     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380     REGEX_CHECK_STATUS;
5381     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5382
5383     status = U_ZERO_ERROR;
5384     repl = UnicodeString("<${one}>");
5385     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386     REGEX_CHECK_STATUS;
5387     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5388
5389     status = U_ZERO_ERROR;
5390     repl = UnicodeString("<$2>");
5391     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5392     REGEX_CHECK_STATUS;
5393     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5394
5395     status = U_ZERO_ERROR;
5396     repl = UnicodeString("<$3>");
5397     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5398     REGEX_CHECK_STATUS;
5399     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5400
5401     status = U_ZERO_ERROR;
5402     repl = UnicodeString("<$4>");
5403     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5404     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5405
5406     status = U_ZERO_ERROR;
5407     repl = UnicodeString("<$04>");
5408     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409     REGEX_CHECK_STATUS;
5410     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5411
5412     status = U_ZERO_ERROR;
5413     repl = UnicodeString("<$000016>");
5414     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5415     REGEX_CHECK_STATUS;
5416     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5417
5418     status = U_ZERO_ERROR;
5419     repl = UnicodeString("<$3$2$1${one}>");
5420     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5421     REGEX_CHECK_STATUS;
5422     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5423
5424     status = U_ZERO_ERROR;
5425     repl = UnicodeString("$3$2$1${one}");
5426     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5427     REGEX_CHECK_STATUS;
5428     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5429
5430     status = U_ZERO_ERROR;
5431     repl = UnicodeString("<${noSuchName}>");
5432     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5433     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5434
5435     status = U_ZERO_ERROR;
5436     repl = UnicodeString("<${invalid-name}>");
5437     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5438     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5439
5440     status = U_ZERO_ERROR;
5441     repl = UnicodeString("<${one");
5442     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5443     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5444
5445     status = U_ZERO_ERROR;
5446     repl = UnicodeString("$not a capture group");
5447     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5448     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5449
5450     uregex_close(re);
5451 }
5452
5453 //--------------------------------------------------------------
5454 //
5455 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5456 //                       The point is not so much what the exact limit is,
5457 //                       but that a largish number doesn't hit bad non-linear performance,
5458 //                       and that exceeding the limit fails cleanly.
5459 //
5460 //--------------------------------------------------------------
5461 void RegexTest::NamedCaptureLimits() {
5462     if (quick) {
5463         logln("Skipping test. Runs in exhuastive mode only.");
5464         return;
5465     }
5466     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5467     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5468     char nnbuf[100];
5469     UnicodeString pattern;
5470     int32_t nn;
5471
5472     for (nn=1; nn<goodLimit; nn++) {
5473         sprintf(nnbuf, "(?<nn%d>)", nn);
5474         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5475     }
5476     UErrorCode status = U_ZERO_ERROR;
5477     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5478     REGEX_CHECK_STATUS;
5479     for (nn=1; nn<goodLimit; nn++) {
5480         sprintf(nnbuf, "nn%d", nn);
5481         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5482         REGEX_ASSERT(nn == groupNum);
5483         if (nn != groupNum) {
5484             break;
5485         }
5486     }
5487     delete pat;
5488
5489     pattern.remove();
5490     for (nn=1; nn<failLimit; nn++) {
5491         sprintf(nnbuf, "(?<nn%d>)", nn);
5492         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5493     }
5494     status = U_ZERO_ERROR;
5495     pat = RegexPattern::compile(pattern, 0, status);
5496     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5497     delete pat;
5498 }
5499
5500
5501 //--------------------------------------------------------------
5502 //
5503 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5504 //
5505 //---------------------------------------------------------------
5506 void RegexTest::Bug7651() {
5507     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5508     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5509     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5510     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5511     UnicodeString s("#ff @abcd This is test");
5512     RegexPattern  *REPattern = NULL;
5513     RegexMatcher  *REMatcher = NULL;
5514     UErrorCode status = U_ZERO_ERROR;
5515     UParseError pe;
5516
5517     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5518     REGEX_CHECK_STATUS;
5519     REMatcher = REPattern->matcher(s, status);
5520     REGEX_CHECK_STATUS;
5521     REGEX_ASSERT(REMatcher->find());
5522     REGEX_ASSERT(REMatcher->start(status) == 0);
5523     delete REPattern;
5524     delete REMatcher;
5525     status = U_ZERO_ERROR;
5526
5527     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5528     REGEX_CHECK_STATUS;
5529     REMatcher = REPattern->matcher(s, status);
5530     REGEX_CHECK_STATUS;
5531     REGEX_ASSERT(REMatcher->find());
5532     REGEX_ASSERT(REMatcher->start(status) == 0);
5533     delete REPattern;
5534     delete REMatcher;
5535     status = U_ZERO_ERROR;
5536  }
5537
5538 void RegexTest::Bug7740() {
5539     UErrorCode status = U_ZERO_ERROR;
5540     UnicodeString pattern = "(a)";
5541     UnicodeString text = "abcdef";
5542     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5543     REGEX_CHECK_STATUS;
5544     REGEX_ASSERT(m->lookingAt(status));
5545     REGEX_CHECK_STATUS;
5546     status = U_ILLEGAL_ARGUMENT_ERROR;
5547     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5548     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5549     REGEX_ASSERT(s == "");
5550     delete m;
5551 }
5552
5553 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5554
5555 void RegexTest::Bug8479() {
5556     UErrorCode status = U_ZERO_ERROR;
5557
5558     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5559     REGEX_CHECK_STATUS;
5560     if (U_SUCCESS(status))
5561     {
5562         UnicodeString str;
5563         str.setToBogus();
5564         pMatcher->reset(str);
5565         status = U_ZERO_ERROR;
5566         pMatcher->matches(status);
5567         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5568         delete pMatcher;
5569     }
5570 }
5571
5572
5573 // Bug 7029
5574 void RegexTest::Bug7029() {
5575     UErrorCode status = U_ZERO_ERROR;
5576
5577     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5578     UnicodeString text = "abc.def";
5579     UnicodeString splits[10];
5580     REGEX_CHECK_STATUS;
5581     int32_t numFields = pMatcher->split(text, splits, 10, status);
5582     REGEX_CHECK_STATUS;
5583     REGEX_ASSERT(numFields == 8);
5584     delete pMatcher;
5585 }
5586
5587 // Bug 9283
5588 //   This test is checking for the existance of any supplemental characters that case-fold
5589 //   to a bmp character.
5590 //
5591 //   At the time of this writing there are none. If any should appear in a subsequent release
5592 //   of Unicode, the code in regular expressions compilation that determines the longest
5593 //   posssible match for a literal string  will need to be enhanced.
5594 //
5595 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5596 //   for details on what to do in case of a failure of this test.
5597 //
5598 void RegexTest::Bug9283() {
5599 #if !UCONFIG_NO_NORMALIZATION
5600     UErrorCode status = U_ZERO_ERROR;
5601     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5602     REGEX_CHECK_STATUS;
5603     int32_t index;
5604     UChar32 c;
5605     for (index=0; ; index++) {
5606         c = supplementalsWithCaseFolding.charAt(index);
5607         if (c == -1) {
5608             break;
5609         }
5610         UnicodeString cf = UnicodeString(c).foldCase();
5611         REGEX_ASSERT(cf.length() >= 2);
5612     }
5613 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5614 }
5615
5616
5617 void RegexTest::CheckInvBufSize() {
5618   if(inv_next>=INV_BUFSIZ) {
5619     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5620           __FILE__, INV_BUFSIZ, inv_next);
5621   } else {
5622     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5623   }
5624 }
5625
5626
5627 void RegexTest::Bug10459() {
5628     UErrorCode status = U_ZERO_ERROR;
5629     UnicodeString patternString("(txt)");
5630     UnicodeString txtString("txt");
5631
5632     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5633     REGEX_CHECK_STATUS;
5634     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5635     REGEX_CHECK_STATUS;
5636
5637     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5638     REGEX_CHECK_STATUS;
5639
5640     uregex_setUText(icu_re, utext_txt, &status);
5641     REGEX_CHECK_STATUS;
5642
5643     // The bug was that calling uregex_group() before doing a matching operation
5644     //   was causing a segfault. Only for Regular Expressions created from UText.
5645     //   It should set an U_REGEX_INVALID_STATE.
5646
5647     UChar buf[100];
5648     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5649     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5650     REGEX_ASSERT(len == 0);
5651
5652     uregex_close(icu_re);
5653     utext_close(utext_pat);
5654     utext_close(utext_txt);
5655 }
5656
5657 void RegexTest::TestCaseInsensitiveStarters() {
5658     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5659     //  become stale because of new Unicode characters.
5660     // If it is stale, rerun the generation tool
5661     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5662     // and replace the embedded data in i18n/regexcmp.cpp
5663
5664     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5665         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5666             continue;
5667         }
5668         UnicodeSet s(cp, cp);
5669         s.closeOver(USET_CASE_INSENSITIVE);
5670         UnicodeSetIterator setIter(s);
5671         while (setIter.next()) {
5672             if (!setIter.isString()) {
5673                 continue;
5674             }
5675             const UnicodeString &str = setIter.getString();
5676             UChar32 firstChar = str.char32At(0);
5677             UnicodeSet starters;
5678             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5679             if (!starters.contains(cp)) {
5680                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5681                 return;
5682             }
5683         }
5684     }
5685 }
5686
5687
5688 void RegexTest::TestBug11049() {
5689     // Original bug report: pattern with match start consisting of one of several individual characters,
5690     //  and the text being matched ending with a supplementary character. find() would read past the
5691     //  end of the input text when searching for potential match starting points.
5692
5693     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5694     // detect the bad read.
5695
5696     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5697     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5698
5699     // Test again with a pattern starting with a single character,
5700     // which takes a different code path than starting with an OR expression,
5701     // but with similar logic.
5702     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5703     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5704 }
5705
5706 // Run a single test case from TestBug11049(). Internal function.
5707 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5708     UErrorCode status = U_ZERO_ERROR;
5709     UnicodeString patternString = UnicodeString(pattern).unescape();
5710     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5711
5712     UnicodeString dataString = UnicodeString(data).unescape();
5713     UChar *exactBuffer = new UChar[dataString.length()];
5714     dataString.extract(exactBuffer, dataString.length(), status);
5715     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5716
5717     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5718     REGEX_CHECK_STATUS;
5719     matcher->reset(ut);
5720     UBool result = matcher->find();
5721     if (result != expectMatch) {
5722         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5723               __FILE__, lineNumber, expectMatch, result, pattern, data);
5724     }
5725
5726     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5727     //   off-by-one on find() with match at the last code point.
5728     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5729     //   because string.unescape() will only shrink it.
5730     char * utf8Buffer = new char[uprv_strlen(data)+1];
5731     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5732     REGEX_CHECK_STATUS;
5733     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5734     REGEX_CHECK_STATUS;
5735     matcher->reset(ut);
5736     result = matcher->find();
5737     if (result != expectMatch) {
5738         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5739               __FILE__, lineNumber, expectMatch, result, pattern, data);
5740     }
5741     delete [] utf8Buffer;
5742
5743     utext_close(ut);
5744     delete [] exactBuffer;
5745 }
5746
5747
5748 void RegexTest::TestBug11371() {
5749     if (quick) {
5750         logln("Skipping test. Runs in exhuastive mode only.");
5751         return;
5752     }
5753     UErrorCode status = U_ZERO_ERROR;
5754     UnicodeString patternString;
5755
5756     for (int i=0; i<8000000; i++) {
5757         patternString.append(UnicodeString("()"));
5758     }
5759     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5760     if (status != U_REGEX_PATTERN_TOO_BIG) {
5761         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762               __FILE__, __LINE__, u_errorName(status));
5763     }
5764
5765     status = U_ZERO_ERROR;
5766     patternString = "(";
5767     for (int i=0; i<20000000; i++) {
5768         patternString.append(UnicodeString("A++"));
5769     }
5770     patternString.append(UnicodeString("){0}B++"));
5771     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5772     if (status != U_REGEX_PATTERN_TOO_BIG) {
5773         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5774               __FILE__, __LINE__, u_errorName(status));
5775     }
5776
5777     // Pattern with too much string data, such that string indexes overflow operand data field size
5778     // in compiled instruction.
5779     status = U_ZERO_ERROR;
5780     patternString = "";
5781     while (patternString.length() < 0x00ffffff) {
5782         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5783     }
5784     patternString.append(UnicodeString("X? trailing string"));
5785     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5786     if (status != U_REGEX_PATTERN_TOO_BIG) {
5787         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5788               __FILE__, __LINE__, u_errorName(status));
5789     }
5790 }
5791
5792 void RegexTest::TestBug11480() {
5793     // C API, get capture group of a group that does not participate in the match.
5794     //        (Returns a zero length string, with nul termination,
5795     //         indistinguishable from a group with a zero length match.)
5796
5797     UErrorCode status = U_ZERO_ERROR;
5798     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5799     REGEX_CHECK_STATUS;
5800     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5801     uregex_setText(re, text.getBuffer(), text.length(), &status);
5802     REGEX_CHECK_STATUS;
5803     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5804     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5805     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5806     REGEX_ASSERT(length == 0);
5807     REGEX_ASSERT(buf[0] == 13);
5808     REGEX_ASSERT(buf[1] == 0);
5809     REGEX_ASSERT(buf[2] == 13);
5810     uregex_close(re);
5811
5812     // UText C++ API, length of match is 0 for non-participating matches.
5813     UText ut = UTEXT_INITIALIZER;
5814     utext_openUnicodeString(&ut, &text, &status);
5815     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5816     REGEX_CHECK_STATUS;
5817     matcher.reset(&ut);
5818     REGEX_ASSERT(matcher.lookingAt(0, status));
5819
5820     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5821     int64_t groupLen = -666;
5822     UText group = UTEXT_INITIALIZER;
5823     matcher.group(1, &group, groupLen, status);
5824     REGEX_CHECK_STATUS;
5825     REGEX_ASSERT(groupLen == 1);
5826     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5827
5828     // Capture group 2, the (B), does not participate in the match.
5829     matcher.group(2, &group, groupLen, status);
5830     REGEX_CHECK_STATUS;
5831     REGEX_ASSERT(groupLen == 0);
5832     REGEX_ASSERT(matcher.start(2, status) == -1);
5833     REGEX_CHECK_STATUS;
5834 }
5835
5836
5837 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */