source/test/intltest/rbbitst.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1999-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /************************************************************************
   9 *   Date        Name        Description
  10 *   12/15/99    Madhu        Creation.
  11 *   01/12/2000  Madhu        Updated for changed API and added new tests
  12 ************************************************************************/
  13
  14 #include "unicode/utypes.h"
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20
  21 #include "unicode/brkiter.h"
  22 #include "unicode/localpointer.h"
  23 #include "unicode/numfmt.h"
  24 #include "unicode/rbbi.h"
  25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  26 #include "unicode/regex.h"
  27 #endif
  28 #include "unicode/schriter.h"
  29 #include "unicode/uchar.h"
  30 #include "unicode/utf16.h"
  31 #include "unicode/ucnv.h"
  32 #include "unicode/uniset.h"
  33 #include "unicode/uscript.h"
  34 #include "unicode/ustring.h"
  35 #include "unicode/utext.h"
  36
  37 #include "charstr.h"
  38 #include "cmemory.h"
  39 #include "intltest.h"
  40 #include "rbbitst.h"
  41 #include "utypeinfo.h"  // for 'typeid' to work
  42 #include "uvector.h"
  43 #include "uvectr32.h"
  44
  45 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  46 #include "unicode/filteredbrk.h"
  47 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  48
// TEST_ASSERT:  Report a test failure, tagged with the source file and line of the
//               macro invocation, when the condition x evaluates to false.
//               The macro does not return; execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS:  Report a failure, tagged with source file, line and the name of
//               the error code, when errcode is a failing UErrorCode.
//               The macro does not return; execution continues after the report.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  54
  55
  56 //---------------------------------------------
  57 // runIndexedTest
  58 //---------------------------------------------
  59
  60
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
  66
// Test-suite entry point.
//   Logs the suite banner when exec is set, stores the params string in fTestParams,
//   and then hands the listed test functions to the TESTCASE_AUTO macros (defined outside
//   this file; presumably they select the test matching index and set name).
//   The order of the TESTCASE_AUTO lines matters to the macros, so it is kept as is.
//   Tests that depend on file I/O or regular expressions are compiled out when the
//   corresponding UCONFIG_NO_* switch is set.
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
    fTestParams = params;     // kept for tests that read options from the command line params

    TESTCASE_AUTO_BEGIN;
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug4153072);
#endif
    TESTCASE_AUTO(TestStatusReturn);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestUnicodeFiles);
    TESTCASE_AUTO(TestEmptyString);
#endif
    TESTCASE_AUTO(TestGetAvailableLocales);
    TESTCASE_AUTO(TestGetDisplayName);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestEndBehaviour);
    TESTCASE_AUTO(TestWordBreaks);
    TESTCASE_AUTO(TestWordBoundary);
    TESTCASE_AUTO(TestLineBreaks);
    TESTCASE_AUTO(TestSentBreaks);
    TESTCASE_AUTO(TestExtended);
#endif
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    // TestMonkey needs both regular expressions and file I/O.
    TESTCASE_AUTO(TestMonkey);
#endif
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug3818);
#endif
    TESTCASE_AUTO(TestDebug);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug5775);
#endif
    TESTCASE_AUTO(TestBug9983);
    TESTCASE_AUTO(TestDictRules);
    TESTCASE_AUTO(TestBug5532);
    TESTCASE_AUTO(TestBug7547);
    TESTCASE_AUTO(TestBug12797);
    TESTCASE_AUTO_END;
}
 108
 109
 110 //---------------------------------------------------------------------------
 111 //
 112 //   class BITestData   Holds a set of Break iterator test data and results
 113 //                      Includes
 114 //                         - the string data to be broken
 115 //                         - a vector of the expected break positions.
 116 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
 118 //                         - The expected break tag values.
 119 //                         - Vectors of actual break positions and tag values.
 120 //                         - Functions for comparing actual with expected and
 121 //                            reporting errors.
 122 //
 123 //----------------------------------------------------------------------------
 124 class BITestData {
 125 public:
 126     UnicodeString    fDataToBreak;
 127     UVector          fExpectedBreakPositions;
 128     UVector          fExpectedTags;
 129     UVector          fLineNum;
 130     UVector          fActualBreakPositions;   // Test Results.
 131     UVector          fActualTags;
 132
 133     BITestData(UErrorCode &status);
 134     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 135     void             checkResults(const char *heading, RBBITest *test);
 136     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 137     void             clearResults();
 138 };
 139
 140 //
 141 // Constructor.
 142 //
 143 BITestData::BITestData(UErrorCode &status)
 144 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 145   fActualTags(status)
 146 {
 147 }
 148
 149 //
 150 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 151 //                 The macro form collects the line number, which is helpful
 152 //                 when tracking down failures.
 153 //
 154 //                 A null data item is inserted at the start of each test's data
 155 //                  to put the starting zero into the data list.  The position saved for
 156 //                  each non-null item is its ending position.
 157 //
 158 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 159 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 160     if (U_FAILURE(status)) {return;}
 161     if (data != NULL) {
 162         fDataToBreak.append(CharsToUnicodeString(data));
 163     }
 164     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 165     fExpectedTags.addElement(tag, status);
 166     fLineNum.addElement(lineNum, status);
 167 }
 168
 169
 170 //
 171 //  checkResults.   Compare the actual and expected break positions, report any differences.
 172 //
 173 void BITestData::checkResults(const char *heading, RBBITest *test) {
 174     int32_t   expectedIndex = 0;
 175     int32_t   actualIndex = 0;
 176
 177     for (;;) {
 178         // If we've run through both the expected and actual results vectors, we're done.
 179         //   break out of the loop.
 180         if (expectedIndex >= fExpectedBreakPositions.size() &&
 181             actualIndex   >= fActualBreakPositions.size()) {
 182             break;
 183         }
 184
 185
 186         if (expectedIndex >= fExpectedBreakPositions.size()) {
 187             err(heading, test, expectedIndex-1, actualIndex);
 188             actualIndex++;
 189             continue;
 190         }
 191
 192         if (actualIndex >= fActualBreakPositions.size()) {
 193             err(heading, test, expectedIndex, actualIndex-1);
 194             expectedIndex++;
 195             continue;
 196         }
 197
 198         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 199             err(heading, test, expectedIndex, actualIndex);
 200             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 201             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 202                 actualIndex++;
 203             } else {
 204                 expectedIndex++;
 205             }
 206             continue;
 207         }
 208
 209         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 210             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 211                 heading, fLineNum.elementAt(expectedIndex),
 212                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 213         }
 214
 215         actualIndex++;
 216         expectedIndex++;
 217     }
 218 }
 219
 220 //
 221 //  err   -  An error was found.  Report it, along with information about where the
 222 //                                incorrectly broken test data appeared in the source file.
 223 //
 224 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 225 {
 226     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 227     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 228     int32_t   o        = 0;
 229     int32_t   line     = fLineNum.elementAti(expectedIdx);
 230     if (expectedIdx > 0) {
 231         // The line numbers are off by one because a premature break occurs somewhere
 232         //    within the previous item, rather than at the start of the current (expected) item.
 233         //    We want to report the offset of the unexpected break from the start of
 234         //      this previous item.
 235         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 236     }
 237     if (actual < expected) {
 238         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 239     } else {
 240         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 241     }
 242 }
 243
 244
 245 void BITestData::clearResults() {
 246     fActualBreakPositions.removeAllElements();
 247     fActualTags.removeAllElements();
 248 }
 249
 250
 251 //--------------------------------------------------------------------------------------
 252 //
 253 //    RBBITest    constructor and destructor
 254 //
 255 //--------------------------------------------------------------------------------------
 256
 257 RBBITest::RBBITest() {
 258     fTestParams = NULL;
 259 }
 260
 261
 262 RBBITest::~RBBITest() {
 263 }
 264
 265 //-----------------------------------------------------------------------------------
 266 //
 267 //   Test for status {tag} return value from break rules.
 268 //        TODO:  a more thorough test.
 269 //
 270 //-----------------------------------------------------------------------------------
 271 void RBBITest::TestStatusReturn() {
 272      UnicodeString rulesString1("$Letters = [:L:];\n"
 273                                   "$Numbers = [:N:];\n"
 274                                   "$Letters+{1};\n"
 275                                   "$Numbers+{2};\n"
 276                                   "Help\\ /me\\!{4};\n"
 277                                   "[^$Letters $Numbers];\n"
 278                                   "!.*;\n", -1, US_INV);
 279      UnicodeString testString1  = "abc123..abc Help me Help me!";
 280                                 // 01234567890123456789012345678
 281      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 282      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 283
 284      UErrorCode status=U_ZERO_ERROR;
 285      UParseError    parseError;
 286
 287      LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
 288      if(U_FAILURE(status)) {
 289          dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
 290          return;
 291      }
 292      int32_t  pos;
 293      int32_t  i = 0;
 294      bi->setText(testString1);
 295      for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 296          if (pos != bounds1[i]) {
 297              errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
 298              break;
 299          }
 300
 301          int tag = bi->getRuleStatus();
 302          if (tag != brkStatus[i]) {
 303              errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
 304              break;
 305          }
 306          i++;
 307      }
 308 }
 309
 310
 311 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 312     UErrorCode status = U_ZERO_ERROR;
 313     char name[100];
 314     printf("code    alpha extend alphanum type word sent line name\n");
 315     int nextExpectedIndex = 0;
 316     utext_setNativeIndex(tstr, 0);
 317     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
 318         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 319             printf("------------------------------------------------ %d\n", j);
 320             ++nextExpectedIndex;
 321         }
 322
 323         UChar32 c = utext_next32(tstr);
 324         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 325         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 326                            u_isUAlphabetic(c),
 327                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 328                            u_isalnum(c),
 329                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 330                                                   u_charType(c),
 331                                                   U_SHORT_PROPERTY_NAME),
 332                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 333                                                   u_getIntPropertyValue(c,
 334                                                           UCHAR_WORD_BREAK),
 335                                                   U_SHORT_PROPERTY_NAME),
 336                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 337                                    u_getIntPropertyValue(c,
 338                                            UCHAR_SENTENCE_BREAK),
 339                                    U_SHORT_PROPERTY_NAME),
 340                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 341                                    u_getIntPropertyValue(c,
 342                                            UCHAR_LINE_BREAK),
 343                                    U_SHORT_PROPERTY_NAME),
 344                            name);
 345     }
 346 }
 347
 348
 349 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 350    UErrorCode status = U_ZERO_ERROR;
 351    UText *tstr = NULL;
 352    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 353    if (U_FAILURE(status)) {
 354        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 355        return;
 356     }
 357    printStringBreaks(tstr, expected, expectedCount);
 358    utext_close(tstr);
 359 }
 360
 361
 362 void RBBITest::TestBug3818() {
 363     UErrorCode  status = U_ZERO_ERROR;
 364
 365     // Four Thai words...
 366     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 367                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 368     UnicodeString  thaiStr(thaiWordData);
 369
 370     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 371     if (U_FAILURE(status) || bi == NULL) {
 372         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 373         return;
 374     }
 375     bi->setText(thaiStr);
 376
 377     int32_t  startOfSecondWord = bi->following(1);
 378     if (startOfSecondWord != 4) {
 379         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 380             __FILE__, __LINE__, startOfSecondWord);
 381     }
 382     startOfSecondWord = bi->following(0);
 383     if (startOfSecondWord != 4) {
 384         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 385             __FILE__, __LINE__, startOfSecondWord);
 386     }
 387     delete bi;
 388 }
 389
 390 //----------------------------------------------------------------------------
 391 //
 392 // generalIteratorTest      Given a break iterator and a set of test data,
 393 //                          Run the tests and report the results.
 394 //
 395 //----------------------------------------------------------------------------
 396 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 397 {
 398
 399     bi.setText(td.fDataToBreak);
 400
 401     testFirstAndNext(bi, td);
 402
 403     testLastAndPrevious(bi, td);
 404
 405     testFollowing(bi, td);
 406     testPreceding(bi, td);
 407     testIsBoundary(bi, td);
 408     doMultipleSelectionTest(bi, td);
 409 }
 410
 411
 412 //
 413 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 414 //                       kind of loop.
 415 //
 416 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 417 {
 418     UErrorCode  status = U_ZERO_ERROR;
 419     int32_t     p;
 420     int32_t     lastP = -1;
 421     int32_t     tag;
 422
 423     logln("Test first and next");
 424     bi.setText(td.fDataToBreak);
 425     td.clearResults();
 426
 427     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 428         td.fActualBreakPositions.addElement(p, status);  // Save result.
 429         tag = bi.getRuleStatus();
 430         td.fActualTags.addElement(tag, status);
 431         if (p <= lastP) {
 432             // If the iterator is not making forward progress, stop.
 433             //  No need to raise an error here, it'll be detected in the normal check of results.
 434             break;
 435         }
 436         lastP = p;
 437     }
 438     td.checkResults("testFirstAndNext", this);
 439 }
 440
 441
 442 //
 443 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 444 //
 445 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 446 {
 447     UErrorCode  status = U_ZERO_ERROR;
 448     int32_t     p;
 449     int32_t     lastP  = 0x7ffffffe;
 450     int32_t     tag;
 451
 452     logln("Test last and previous");
 453     bi.setText(td.fDataToBreak);
 454     td.clearResults();
 455
 456     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 457         // Save break position.  Insert it at start of vector of results, shoving
 458         //    already-saved results further towards the end.
 459         td.fActualBreakPositions.insertElementAt(p, 0, status);
 460         // bi.previous();   // TODO:  Why does this fix things up????
 461         // bi.next();
 462         tag = bi.getRuleStatus();
 463         td.fActualTags.insertElementAt(tag, 0, status);
 464         if (p >= lastP) {
 465             // If the iterator is not making progress, stop.
 466             //  No need to raise an error here, it'll be detected in the normal check of results.
 467             break;
 468         }
 469         lastP = p;
 470     }
 471     td.checkResults("testLastAndPrevious", this);
 472 }
 473
 474
 475 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 476 {
 477     UErrorCode  status = U_ZERO_ERROR;
 478     int32_t     p;
 479     int32_t     tag;
 480     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 481                                  //   cannot be -1; that is returned for DONE.
 482     int         i;
 483
 484     logln("testFollowing():");
 485     bi.setText(td.fDataToBreak);
 486     td.clearResults();
 487
 488     // Save the starting point, since we won't get that out of following.
 489     p = bi.first();
 490     td.fActualBreakPositions.addElement(p, status);  // Save result.
 491     tag = bi.getRuleStatus();
 492     td.fActualTags.addElement(tag, status);
 493
 494     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 495         p = bi.following(i);
 496         if (p != lastP) {
 497             if (p == RuleBasedBreakIterator::DONE) {
 498                 break;
 499             }
 500             // We've reached a new break position.  Save it.
 501             td.fActualBreakPositions.addElement(p, status);  // Save result.
 502             tag = bi.getRuleStatus();
 503             td.fActualTags.addElement(tag, status);
 504             lastP = p;
 505         }
 506     }
 507     // The loop normally exits by means of the break in the middle.
 508     // Make sure that the index was at the correct position for the break iterator to have
 509     //   returned DONE.
 510     if (i != td.fDataToBreak.length()) {
 511         errln("testFollowing():  iterator returned DONE prematurely.");
 512     }
 513
 514     // Full check of all results.
 515     td.checkResults("testFollowing", this);
 516 }
 517
 518
 519
 520 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 521     UErrorCode  status = U_ZERO_ERROR;
 522     int32_t     p;
 523     int32_t     tag;
 524     int32_t     lastP  = 0x7ffffffe;
 525     int         i;
 526
 527     logln("testPreceding():");
 528     bi.setText(td.fDataToBreak);
 529     td.clearResults();
 530
 531     p = bi.last();
 532     td.fActualBreakPositions.addElement(p, status);
 533     tag = bi.getRuleStatus();
 534     td.fActualTags.addElement(tag, status);
 535
 536     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 537         p = bi.preceding(i);
 538         if (p != lastP) {
 539             if (p == RuleBasedBreakIterator::DONE) {
 540                 break;
 541             }
 542             // We've reached a new break position.  Save it.
 543             td.fActualBreakPositions.insertElementAt(p, 0, status);
 544             lastP = p;
 545             tag = bi.getRuleStatus();
 546             td.fActualTags.insertElementAt(tag, 0, status);
 547         }
 548     }
 549     // The loop normally exits by means of the break in the middle.
 550     // Make sure that the index was at the correct position for the break iterator to have
 551     //   returned DONE.
 552     if (i != 0) {
 553         errln("testPreceding():  iterator returned DONE prematurely.");
 554     }
 555
 556     // Full check of all results.
 557     td.checkResults("testPreceding", this);
 558 }
 559
 560
 561
 562 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 563     UErrorCode  status = U_ZERO_ERROR;
 564     int         i;
 565     int32_t     tag;
 566
 567     logln("testIsBoundary():");
 568     bi.setText(td.fDataToBreak);
 569     td.clearResults();
 570
 571     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 572         if (bi.isBoundary(i)) {
 573             td.fActualBreakPositions.addElement(i, status);  // Save result.
 574             tag = bi.getRuleStatus();
 575             td.fActualTags.addElement(tag, status);
 576         }
 577     }
 578     td.checkResults("testIsBoundary: ", this);
 579 }
 580
 581
 582
 583 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 584 {
 585     iterator.setText(td.fDataToBreak);
 586
 587     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 588     int32_t offset = iterator.first();
 589     int32_t testOffset;
 590     int32_t count = 0;
 591
 592     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 593
 594     if (*testIterator != iterator)
 595         errln("clone() or operator!= failed: two clones compared unequal");
 596
 597     do {
 598         testOffset = testIterator->first();
 599         testOffset = testIterator->next(count);
 600         if (offset != testOffset)
 601             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 602
 603         if (offset != RuleBasedBreakIterator::DONE) {
 604             count++;
 605             offset = iterator.next();
 606
 607             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 608                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 609                 if (count > 10000 || offset == -1) {
 610                     errln("operator== failed too many times. Stopping test.");
 611                     if (offset == -1) {
 612                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 613                     }
 614                     return;
 615                 }
 616             }
 617         }
 618     } while (offset != RuleBasedBreakIterator::DONE);
 619
 620     // now do it backwards...
 621     offset = iterator.last();
 622     count = 0;
 623
 624     do {
 625         testOffset = testIterator->last();
 626         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 627         if (offset != testOffset)
 628             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 629
 630         if (offset != RuleBasedBreakIterator::DONE) {
 631             count--;
 632             offset = iterator.previous();
 633         }
 634     } while (offset != RuleBasedBreakIterator::DONE);
 635
 636     delete testIterator;
 637 }
 638
 639
 640 //---------------------------------------------
 641 //
 642 //     other tests
 643 //
 644 //---------------------------------------------
 645 void RBBITest::TestEmptyString()
 646 {
 647     UnicodeString text = "";
 648     UErrorCode status = U_ZERO_ERROR;
 649
 650     BITestData x(status);
 651     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 652     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 653     if (U_FAILURE(status))
 654     {
 655         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 656         return;
 657     }
 658     generalIteratorTest(*bi, x);
 659     delete bi;
 660 }
 661
 662 void RBBITest::TestGetAvailableLocales()
 663 {
 664     int32_t locCount = 0;
 665     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 666
 667     if (locCount == 0)
 668         dataerrln("getAvailableLocales() returned an empty list!");
 669     // Just make sure that it's returning good memory.
 670     int32_t i;
 671     for (i = 0; i < locCount; ++i) {
 672         logln(locList[i].getName());
 673     }
 674 }
 675
 676 //Testing the BreakIterator::getDisplayName() function
 677 void RBBITest::TestGetDisplayName()
 678 {
 679     UnicodeString   result;
 680
 681     BreakIterator::getDisplayName(Locale::getUS(), result);
 682     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 683         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 684                 + result);
 685
 686     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 687     if (result != "French (France)")
 688         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 689                 + result);
 690 }
 691 /**
 692  * Test End Behaviour
 693  * @bug 4068137
 694  */
 695 void RBBITest::TestEndBehaviour()
 696 {
 697     UErrorCode status = U_ZERO_ERROR;
 698     UnicodeString testString("boo.");
 699     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 700     if (U_FAILURE(status))
 701     {
 702         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 703         return;
 704     }
 705     wb->setText(testString);
 706
 707     if (wb->first() != 0)
 708         errln("Didn't get break at beginning of string.");
 709     if (wb->next() != 3)
 710         errln("Didn't get break before period in \"boo.\"");
 711     if (wb->current() != 4 && wb->next() != 4)
 712         errln("Didn't get break at end of string.");
 713     delete wb;
 714 }
 715 /*
 716  * @bug 4153072
 717  */
 718 void RBBITest::TestBug4153072() {
 719     UErrorCode status = U_ZERO_ERROR;
 720     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 721     if (U_FAILURE(status))
 722     {
 723         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 724         return;
 725     }
 726     UnicodeString str("...Hello, World!...");
 727     int32_t begin = 3;
 728     int32_t end = str.length() - 3;
 729     UBool onBoundary;
 730
 731     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 732     iter->adoptText(textIterator);
 733     int index;
 734     // Note: with the switch to UText, there is no way to restrict the
 735     //       iteration range to begin at an index other than zero.
 736     //       String character iterators created with a non-zero bound are
 737     //         treated by RBBI as being empty.
 738     for (index = -1; index < begin + 1; ++index) {
 739         onBoundary = iter->isBoundary(index);
 740         if (index == 0?  !onBoundary : onBoundary) {
 741             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 742                             " and begin index = " + begin);
 743         }
 744     }
 745     delete iter;
 746 }
 747
 748
 749 //
 750 // Test for problem reported by Ashok Matoria on 9 July 2007
 751 //    One.<kSoftHyphen><kSpace>Two.
 752 //
 753 //    Sentence break at start (0) and then on calling next() it breaks at
 754 //   'T' of "Two". Now, at this point if I do next() and
 755 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 756 //
 757 void RBBITest::TestBug5775() {
 758     UErrorCode status = U_ZERO_ERROR;
 759     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 760     TEST_ASSERT_SUCCESS(status);
 761     if (U_FAILURE(status)) {
 762         return;
 763     }
 764 // Check for status first for better handling of no data errors.
 765     TEST_ASSERT(bi != NULL);
 766     if (bi == NULL) {
 767         return;
 768     }
 769
 770     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 771     //               01234      56789
 772     s = s.unescape();
 773     bi->setText(s);
 774     int pos = bi->next();
 775     TEST_ASSERT(pos == 6);
 776     pos = bi->next();
 777     TEST_ASSERT(pos == 10);
 778     pos = bi->previous();
 779     TEST_ASSERT(pos == 6);
 780     delete bi;
 781 }
 782
 783
 784
 785 //------------------------------------------------------------------------------
 786 //
 787 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 788 //
 789 //------------------------------------------------------------------------------
 790
 791 struct TestParams {
 792     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 793                                            //   Changed out whenever test data changes break type.
 794
 795     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 796     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 797     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 798     UVector32       *srcCol;
 799
 800     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 801     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 802     CharString       utf8String;           // UTF-8 form of text to break.
 803
 804     TestParams(UErrorCode &status) : dataToBreak() {
 805         bi               = NULL;
 806         expectedBreaks   = new UVector32(status);
 807         srcLine          = new UVector32(status);
 808         srcCol           = new UVector32(status);
 809         textToBreak      = NULL;
 810         textMap          = new UVector32(status);
 811     }
 812
 813     ~TestParams() {
 814         delete bi;
 815         delete expectedBreaks;
 816         delete srcLine;
 817         delete srcCol;
 818         utext_close(textToBreak);
 819         delete textMap;
 820     }
 821
 822     int32_t getSrcLine(int32_t bp);
 823     int32_t getExpectedBreak(int32_t bp);
 824     int32_t getSrcCol(int32_t bp);
 825
 826     void setUTF16(UErrorCode &status);
 827     void setUTF8(UErrorCode &status);
 828 };
 829
 830 // Append a UnicodeString to a CharString with UTF-8 encoding.
 831 // Substitute any invalid chars.
 832 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 833 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 834     if (U_FAILURE(status)) {
 835         return;
 836     }
 837     int32_t utf8Length;
 838     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 839                        src.getBuffer(), src.length(),   // UTF-16 data
 840                        0xfffd, NULL,                    // Substitution char, number of subs.
 841                        &status);
 842     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 843         return;
 844     }
 845     status = U_ZERO_ERROR;
 846     int32_t capacity;
 847     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 848     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 849                        src.getBuffer(), src.length(),
 850                        0xfffd, NULL, &status);
 851     dest.append(buffer, utf8Length, status);
 852 }
 853
 854
 855 void TestParams::setUTF16(UErrorCode &status) {
 856     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 857     textMap->removeAllElements();
 858     for (int32_t i=0; i<dataToBreak.length(); i++) {
 859         if (i == dataToBreak.getChar32Start(i)) {
 860             textMap->addElement(i, status);
 861         } else {
 862             textMap->addElement(-1, status);
 863         }
 864     }
 865     textMap->addElement(dataToBreak.length(), status);
 866     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 867 }
 868
 869
 870 void TestParams::setUTF8(UErrorCode &status) {
 871     if (U_FAILURE(status)) {
 872         return;
 873     }
 874     utf8String.clear();
 875     CharStringAppend(utf8String, dataToBreak, status);
 876     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 877     if (U_FAILURE(status)) {
 878         return;
 879     }
 880
 881     textMap->removeAllElements();
 882     int32_t utf16Index = 0;
 883     for (;;) {
 884         textMap->addElement(utf16Index, status);
 885         UChar32 c32 = utext_current32(textToBreak);
 886         if (c32 < 0) {
 887             break;
 888         }
 889         utf16Index += U16_LENGTH(c32);
 890         utext_next32(textToBreak);
 891         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 892             textMap->addElement(-1, status);
 893         }
 894     }
 895     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 896 }
 897
 898
 899 int32_t TestParams::getSrcLine(int32_t bp) {
 900     if (bp >= textMap->size()) {
 901         bp = textMap->size() - 1;
 902     }
 903     int32_t i = 0;
 904     for(; bp >= 0 ; --bp) {
 905         // Move to a character boundary if we are not on one already.
 906         i = textMap->elementAti(bp);
 907         if (i >= 0) {
 908             break;
 909         }
 910     }
 911     return srcLine->elementAti(i);
 912 }
 913
 914
 915 int32_t TestParams::getExpectedBreak(int32_t bp) {
 916     if (bp >= textMap->size()) {
 917         return 0;
 918     }
 919     int32_t i = textMap->elementAti(bp);
 920     int32_t retVal = 0;
 921     if (i >= 0) {
 922         retVal = expectedBreaks->elementAti(i);
 923     }
 924     return retVal;
 925 }
 926
 927
 928 int32_t TestParams::getSrcCol(int32_t bp) {
 929     if (bp >= textMap->size()) {
 930         bp = textMap->size() - 1;
 931     }
 932     int32_t i = 0;
 933     for(; bp >= 0; --bp) {
 934         // Move bp to a character boundary if we are not on one already.
 935         i = textMap->elementAti(bp);
 936         if (i >= 0) {
 937             break;
 938         }
 939     }
 940     return srcCol->elementAti(i);
 941 }
 942
 943
 944 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 945     int32_t    bp;
 946     int32_t    prevBP;
 947     int32_t    i;
 948
 949     TEST_ASSERT_SUCCESS(status);
 950     if (U_FAILURE(status)) {
 951         return;
 952     }
 953
 954     if (t->bi == NULL) {
 955         return;
 956     }
 957
 958     t->bi->setText(t->textToBreak, status);
 959     //
 960     //  Run the iterator forward
 961     //
 962     prevBP = -1;
 963     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 964         if (prevBP ==  bp) {
 965             // Fail for lack of forward progress.
 966             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 967                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 968             break;
 969         }
 970
 971         // Check that there we didn't miss an expected break between the last one
 972         //  and this one.
 973         for (i=prevBP+1; i<bp; i++) {
 974             if (t->getExpectedBreak(i) != 0) {
 975                 int expected[] = {0, i};
 976                 printStringBreaks(t->dataToBreak, expected, 2);
 977                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 978                       i, t->getSrcLine(i), t->getSrcCol(i));
 979             }
 980         }
 981
 982         // Check that the break we did find was expected
 983         if (t->getExpectedBreak(bp) == 0) {
 984             int expected[] = {0, bp};
 985             printStringBreaks(t->textToBreak, expected, 2);
 986             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 987                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 988         } else {
 989             // The break was expected.
 990             //   Check that the {nnn} tag value is correct.
 991             int32_t expectedTagVal = t->getExpectedBreak(bp);
 992             if (expectedTagVal == -1) {
 993                 expectedTagVal = 0;
 994             }
 995             int32_t line = t->getSrcLine(bp);
 996             int32_t rs = t->bi->getRuleStatus();
 997             if (rs != expectedTagVal) {
 998                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
 999                       "          Actual, Expected status = %4d, %4d",
1000                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1001             }
1002         }
1003
1004         prevBP = bp;
1005     }
1006
1007     // Verify that there were no missed expected breaks after the last one found
1008     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1009         if (t->getExpectedBreak(i) != 0) {
1010             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1011                       i, t->getSrcLine(i), t->getSrcCol(i));
1012         }
1013     }
1014
1015     //
1016     //  Run the iterator backwards, verify that the same breaks are found.
1017     //
1018     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1019     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1020         if (prevBP ==  bp) {
1021             // Fail for lack of progress.
1022             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1023                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1024             break;
1025         }
1026
1027         // Check that we didn't miss an expected break between the last one
1028         //  and this one.  (UVector returns zeros for index out of bounds.)
1029         for (i=prevBP-1; i>bp; i--) {
1030             if (t->getExpectedBreak(i) != 0) {
1031                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1032                       i, t->getSrcLine(i), t->getSrcCol(i));
1033             }
1034         }
1035
1036         // Check that the break we did find was expected
1037         if (t->getExpectedBreak(bp) == 0) {
1038             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1039                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1040         } else {
1041             // The break was expected.
1042             //   Check that the {nnn} tag value is correct.
1043             int32_t expectedTagVal = t->getExpectedBreak(bp);
1044             if (expectedTagVal == -1) {
1045                 expectedTagVal = 0;
1046             }
1047             int line = t->getSrcLine(bp);
1048             int32_t rs = t->bi->getRuleStatus();
1049             if (rs != expectedTagVal) {
1050                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1051                       "          Actual, Expected status = %4d, %4d",
1052                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1053             }
1054         }
1055
1056         prevBP = bp;
1057     }
1058
1059     // Verify that there were no missed breaks prior to the last one found
1060     for (i=prevBP-1; i>=0; i--) {
1061         if (t->getExpectedBreak(i) != 0) {
1062             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1063                       i, t->getSrcLine(i), t->getSrcCol(i));
1064         }
1065     }
1066
1067     // Check isBoundary()
1068     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1069         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1070         UBool boundaryFound    = t->bi->isBoundary(i);
1071         if (boundaryExpected != boundaryFound) {
1072             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1073                   "        Expected, Actual= %s, %s",
1074                   i, t->getSrcLine(i), t->getSrcCol(i),
1075                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1076         }
1077     }
1078
1079     // Check following()
1080     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1081         int32_t actualBreak = t->bi->following(i);
1082         int32_t expectedBreak = BreakIterator::DONE;
1083         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1084             if (t->getExpectedBreak(j) != 0) {
1085                 expectedBreak = j;
1086                 break;
1087             }
1088         }
1089         if (expectedBreak != actualBreak) {
1090             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1091                   "        Expected, Actual= %d, %d",
1092                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1093         }
1094     }
1095
1096     // Check preceding()
1097     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1098         int32_t actualBreak = t->bi->preceding(i);
1099         int32_t expectedBreak = BreakIterator::DONE;
1100
1101         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1102         // preceding(trailing byte) will return the index of some preceding code point,
1103         // not the lead byte of the current code point, even though that has a smaller index.
1104         // Therefore, start looking at the expected break data not at i-1, but at
1105         // the start of code point index - 1.
1106         utext_setNativeIndex(t->textToBreak, i);
1107         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1108         for (; j >= 0; j--) {
1109             if (t->getExpectedBreak(j) != 0) {
1110                 expectedBreak = j;
1111                 break;
1112             }
1113         }
1114         if (expectedBreak != actualBreak) {
1115             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1116                   "        Expected, Actual= %d, %d",
1117                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1118         }
1119     }
1120 }
1121
1122
1123 void RBBITest::TestExtended() {
1124 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1125     UErrorCode      status  = U_ZERO_ERROR;
1126     Locale          locale("");
1127
1128     UnicodeString       rules;
1129     TestParams          tp(status);
1130
1131     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1132     if (U_FAILURE(status)) {
1133         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1134     }
1135
1136
1137     //
1138     //  Open and read the test data file.
1139     //
1140     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1141     char testFileName[1000];
1142     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1143         errln("Can't open test data.  Path too long.");
1144         return;
1145     }
1146     strcpy(testFileName, testDataDirectory);
1147     strcat(testFileName, "rbbitst.txt");
1148
1149     int    len;
1150     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1151     if (U_FAILURE(status)) {
1152         return; /* something went wrong, error already output */
1153     }
1154
1155
1156     bool skipTest = false; // Skip this test?
1157
1158     //
1159     //  Put the test data into a UnicodeString
1160     //
1161     UnicodeString testString(FALSE, testFile, len);
1162
1163     enum EParseState{
1164         PARSE_COMMENT,
1165         PARSE_TAG,
1166         PARSE_DATA,
1167         PARSE_NUM
1168     }
1169     parseState = PARSE_TAG;
1170
1171     EParseState savedState = PARSE_TAG;
1172
1173     static const UChar CH_LF        = 0x0a;
1174     static const UChar CH_CR        = 0x0d;
1175     static const UChar CH_HASH      = 0x23;
1176     /*static const UChar CH_PERIOD    = 0x2e;*/
1177     static const UChar CH_LT        = 0x3c;
1178     static const UChar CH_GT        = 0x3e;
1179     static const UChar CH_BACKSLASH = 0x5c;
1180     static const UChar CH_BULLET    = 0x2022;
1181
1182     int32_t    lineNum  = 1;
1183     int32_t    colStart = 0;
1184     int32_t    column   = 0;
1185     int32_t    charIdx  = 0;
1186
1187     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1188
1189     for (charIdx = 0; charIdx < len; ) {
1190         status = U_ZERO_ERROR;
1191         UChar  c = testString.charAt(charIdx);
1192         charIdx++;
1193         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1194             // treat CRLF as a unit
1195             c = CH_LF;
1196             charIdx++;
1197         }
1198         if (c == CH_LF || c == CH_CR) {
1199             lineNum++;
1200             colStart = charIdx;
1201         }
1202         column = charIdx - colStart + 1;
1203
1204         switch (parseState) {
1205         case PARSE_COMMENT:
1206             if (c == 0x0a || c == 0x0d) {
1207                 parseState = savedState;
1208             }
1209             break;
1210
1211         case PARSE_TAG:
1212             {
1213             if (c == CH_HASH) {
1214                 parseState = PARSE_COMMENT;
1215                 savedState = PARSE_TAG;
1216                 break;
1217             }
1218             if (u_isUWhiteSpace(c)) {
1219                 break;
1220             }
1221             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1222                 delete tp.bi;
1223                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1224                 skipTest = false;
1225                 charIdx += 5;
1226                 break;
1227             }
1228             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1229                 delete tp.bi;
1230                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1231                 skipTest = false;
1232                 charIdx += 5;
1233                 break;
1234             }
1235             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1236                 delete tp.bi;
1237                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1238                 skipTest = false;
1239                 charIdx += 5;
1240                 break;
1241             }
1242             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1243                 delete tp.bi;
1244                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1245                 skipTest = false;
1246                 charIdx += 5;
1247                 break;
1248             }
1249             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1250                 delete tp.bi;
1251                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1252                 charIdx += 6;
1253                 break;
1254             }
1255
1256             // <locale  loc_name>
1257             localeMatcher.reset(testString);
1258             if (localeMatcher.lookingAt(charIdx-1, status)) {
1259                 UnicodeString localeName = localeMatcher.group(1, status);
1260                 char localeName8[100];
1261                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1262                 locale = Locale::createFromName(localeName8);
1263                 charIdx += localeMatcher.group(0, status).length() - 1;
1264                 TEST_ASSERT_SUCCESS(status);
1265                 break;
1266             }
1267             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1268                 parseState = PARSE_DATA;
1269                 charIdx += 5;
1270                 tp.dataToBreak = "";
1271                 tp.expectedBreaks->removeAllElements();
1272                 tp.srcCol ->removeAllElements();
1273                 tp.srcLine->removeAllElements();
1274                 break;
1275             }
1276
1277             errln("line %d: Tag expected in test file.", lineNum);
1278             parseState = PARSE_COMMENT;
1279             savedState = PARSE_DATA;
1280             goto end_test; // Stop the test.
1281             }
1282             break;
1283
1284         case PARSE_DATA:
1285             if (c == CH_BULLET) {
1286                 int32_t  breakIdx = tp.dataToBreak.length();
1287                 tp.expectedBreaks->setSize(breakIdx+1);
1288                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1289                 tp.srcLine->setSize(breakIdx+1);
1290                 tp.srcLine->setElementAt(lineNum, breakIdx);
1291                 tp.srcCol ->setSize(breakIdx+1);
1292                 tp.srcCol ->setElementAt(column, breakIdx);
1293                 break;
1294             }
1295
1296             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1297                 // Add final entry to mappings from break location to source file position.
1298                 //  Need one extra because last break position returned is after the
1299                 //    last char in the data, not at the last char.
1300                 tp.srcLine->addElement(lineNum, status);
1301                 tp.srcCol ->addElement(column, status);
1302
1303                 parseState = PARSE_TAG;
1304                 charIdx += 6;
1305
1306                 if (!skipTest) {
1307                     // RUN THE TEST!
1308                     status = U_ZERO_ERROR;
1309                     tp.setUTF16(status);
1310                     executeTest(&tp, status);
1311                     TEST_ASSERT_SUCCESS(status);
1312
1313                     // Run again, this time with UTF-8 text wrapped in a UText.
1314                     status = U_ZERO_ERROR;
1315                     tp.setUTF8(status);
1316                     TEST_ASSERT_SUCCESS(status);
1317                     executeTest(&tp, status);
1318                 }
1319                 break;
1320             }
1321
1322             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1323                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1324                 // Get the code point from the name and insert it into the test data.
1325                 //   (Damn, no API takes names in Unicode  !!!
1326                 //    we've got to take it back to char *)
1327                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1328                 int32_t nameLength = nameEndIdx - (charIdx+2);
1329                 char charNameBuf[200];
1330                 UChar32 theChar = -1;
1331                 if (nameEndIdx != -1) {
1332                     UErrorCode status = U_ZERO_ERROR;
1333                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1334                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1335                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1336                     if (U_FAILURE(status)) {
1337                         theChar = -1;
1338                     }
1339                 }
1340                 if (theChar == -1) {
1341                     errln("Error in named character in test file at line %d, col %d",
1342                         lineNum, column);
1343                 } else {
1344                     // Named code point was recognized.  Insert it
1345                     //   into the test data.
1346                     tp.dataToBreak.append(theChar);
1347                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1348                         tp.srcLine->addElement(lineNum, status);
1349                         tp.srcCol ->addElement(column, status);
1350                     }
1351                 }
1352                 if (nameEndIdx > charIdx) {
1353                     charIdx = nameEndIdx+1;
1354
1355                 }
1356                 break;
1357             }
1358
1359
1360
1361
1362             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1363                 charIdx++;
1364                 int32_t  breakIdx = tp.dataToBreak.length();
1365                 tp.expectedBreaks->setSize(breakIdx+1);
1366                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1367                 tp.srcLine->setSize(breakIdx+1);
1368                 tp.srcLine->setElementAt(lineNum, breakIdx);
1369                 tp.srcCol ->setSize(breakIdx+1);
1370                 tp.srcCol ->setElementAt(column, breakIdx);
1371                 break;
1372             }
1373
1374             if (c == CH_LT) {
1375                 tagValue   = 0;
1376                 parseState = PARSE_NUM;
1377                 break;
1378             }
1379
1380             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1381                 parseState = PARSE_COMMENT;
1382                 savedState = PARSE_DATA;
1383                 break;
1384             }
1385
1386             if (c == CH_BACKSLASH) {
1387                 // Check for \ at end of line, a line continuation.
1388                 //     Advance over (discard) the newline
1389                 UChar32 cp = testString.char32At(charIdx);
1390                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1391                     // We have a CR LF
1392                     //  Need an extra increment of the input ptr to move over both of them
1393                     charIdx++;
1394                 }
1395                 if (cp == CH_LF || cp == CH_CR) {
1396                     lineNum++;
1397                     colStart = charIdx;
1398                     charIdx++;
1399                     break;
1400                 }
1401
1402                 // Let unescape handle the back slash.
1403                 cp = testString.unescapeAt(charIdx);
1404                 if (cp != -1) {
1405                     // Escape sequence was recognized.  Insert the char
1406                     //   into the test data.
1407                     tp.dataToBreak.append(cp);
1408                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1409                         tp.srcLine->addElement(lineNum, status);
1410                         tp.srcCol ->addElement(column, status);
1411                     }
1412                     break;
1413                 }
1414
1415
1416                 // Not a recognized backslash escape sequence.
1417                 // Take the next char as a literal.
1418                 //  TODO:  Should this be an error?
1419                 c = testString.charAt(charIdx);
1420                 charIdx = testString.moveIndex32(charIdx, 1);
1421             }
1422
1423             // Normal, non-escaped data char.
1424             tp.dataToBreak.append(c);
1425
1426             // Save the mapping from offset in the data to line/column numbers in
1427             //   the original input file.  Will be used for better error messages only.
1428             //   If there's an expected break before this char, the slot in the mapping
1429             //     vector will already be set for this char; don't overwrite it.
1430             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1431                 tp.srcLine->addElement(lineNum, status);
1432                 tp.srcCol ->addElement(column, status);
1433             }
1434             break;
1435
1436
1437         case PARSE_NUM:
1438             // We are parsing an expected numeric tag value, like <1234>,
1439             //   within a chunk of data.
1440             if (u_isUWhiteSpace(c)) {
1441                 break;
1442             }
1443
1444             if (c == CH_GT) {
1445                 // Finished the number.  Add the info to the expected break data,
1446                 //   and switch parse state back to doing plain data.
1447                 parseState = PARSE_DATA;
1448                 if (tagValue == 0) {
1449                     tagValue = -1;
1450                 }
1451                 int32_t  breakIdx = tp.dataToBreak.length();
1452                 tp.expectedBreaks->setSize(breakIdx+1);
1453                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1454                 tp.srcLine->setSize(breakIdx+1);
1455                 tp.srcLine->setElementAt(lineNum, breakIdx);
1456                 tp.srcCol ->setSize(breakIdx+1);
1457                 tp.srcCol ->setElementAt(column, breakIdx);
1458                 break;
1459             }
1460
1461             if (u_isdigit(c)) {
1462                 tagValue = tagValue*10 + u_charDigitValue(c);
1463                 break;
1464             }
1465
1466             errln("Syntax Error in test file at line %d, col %d",
1467                 lineNum, column);
1468             parseState = PARSE_COMMENT;
1469             goto end_test; // Stop the test
1470             break;
1471         }
1472
1473
1474         if (U_FAILURE(status)) {
1475             dataerrln("ICU Error %s while parsing test file at line %d.",
1476                 u_errorName(status), lineNum);
1477             status = U_ZERO_ERROR;
1478             goto end_test; // Stop the test
1479         }
1480
1481     }
1482
1483 end_test:
1484     delete [] testFile;
1485 #endif
1486 }
1487
1488
1489 //-------------------------------------------------------------------------------
1490 //
1491 //  TestDictRules   create a break iterator from source rules that includes a
1492 //                  dictionary range.   Regression for bug #7130.  Source rules
1493 //                  do not declare a break iterator type (word, line, sentence, etc.
1494 //                  but the dictionary code, without a type, would loop.
1495 //
1496 //-------------------------------------------------------------------------------
1497 void RBBITest::TestDictRules() {
1498     const char *rules =  "$dictionary = [a-z]; \n"
1499                          "!!forward; \n"
1500                          "$dictionary $dictionary; \n"
1501                          "!!reverse; \n"
1502                          "$dictionary $dictionary; \n";
1503     const char *text = "aa";
1504     UErrorCode status = U_ZERO_ERROR;
1505     UParseError parseError;
1506
1507     RuleBasedBreakIterator bi(rules, parseError, status);
1508     if (U_SUCCESS(status)) {
1509         UnicodeString utext = text;
1510         bi.setText(utext);
1511         int32_t position;
1512         int32_t loops;
1513         for (loops = 0; loops<10; loops++) {
1514             position = bi.next();
1515             if (position == RuleBasedBreakIterator::DONE) {
1516                 break;
1517             }
1518         }
1519         TEST_ASSERT(loops == 1);
1520     } else {
1521         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1522     }
1523 }
1524
1525
1526
1527 //-------------------------------------------------------------------------------
1528 //
1529 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1530 //    return the data in one big UChar * buffer, which the caller must delete.
1531 //
1532 //    parameters:
 1533 //          fileName:   the path of the file to read.  It is passed unchanged to fopen(),
 1534 //                      so the caller must include any directory part.
1535 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1536 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1537 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1538 //                      Pass NULL for the system default encoding.
1539 //          status
1540 //    returns:
1541 //                      The file data, converted to UChar.
1542 //                      The caller must delete this when done with
1543 //                           delete [] theBuffer;
1544 //
1545 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1546 //           Move this function to some common place.
1547 //
1548 //--------------------------------------------------------------------------------
1549 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1550     UChar       *retPtr  = NULL;
1551     char        *fileBuf = NULL;
1552     UConverter* conv     = NULL;
1553     FILE        *f       = NULL;
1554
1555     ulen = 0;
1556     if (U_FAILURE(status)) {
1557         return retPtr;
1558     }
1559
1560     //
1561     //  Open the file.
1562     //
1563     f = fopen(fileName, "rb");
1564     if (f == 0) {
1565         dataerrln("Error opening test data file %s\n", fileName);
1566         status = U_FILE_ACCESS_ERROR;
1567         return NULL;
1568     }
1569     //
1570     //  Read it in
1571     //
1572     int   fileSize;
1573     int   amt_read;
1574
1575     fseek( f, 0, SEEK_END);
1576     fileSize = ftell(f);
1577     fileBuf = new char[fileSize];
1578     fseek(f, 0, SEEK_SET);
1579     amt_read = fread(fileBuf, 1, fileSize, f);
1580     if (amt_read != fileSize || fileSize <= 0) {
1581         errln("Error reading test data file.");
1582         goto cleanUpAndReturn;
1583     }
1584
1585     //
1586     // Look for a Unicode Signature (BOM) on the data just read
1587     //
1588     int32_t        signatureLength;
1589     const char *   fileBufC;
1590     const char*    bomEncoding;
1591
1592     fileBufC = fileBuf;
1593     bomEncoding = ucnv_detectUnicodeSignature(
1594         fileBuf, fileSize, &signatureLength, &status);
1595     if(bomEncoding!=NULL ){
1596         fileBufC  += signatureLength;
1597         fileSize  -= signatureLength;
1598         encoding = bomEncoding;
1599     }
1600
1601     //
1602     // Open a converter to take the rule file to UTF-16
1603     //
1604     conv = ucnv_open(encoding, &status);
1605     if (U_FAILURE(status)) {
1606         goto cleanUpAndReturn;
1607     }
1608
1609     //
1610     // Convert the rules to UChar.
1611     //  Preflight first to determine required buffer size.
1612     //
1613     ulen = ucnv_toUChars(conv,
1614         NULL,           //  dest,
1615         0,              //  destCapacity,
1616         fileBufC,
1617         fileSize,
1618         &status);
1619     if (status == U_BUFFER_OVERFLOW_ERROR) {
1620         // Buffer Overflow is expected from the preflight operation.
1621         status = U_ZERO_ERROR;
1622
1623         retPtr = new UChar[ulen+1];
1624         ucnv_toUChars(conv,
1625             retPtr,       //  dest,
1626             ulen+1,
1627             fileBufC,
1628             fileSize,
1629             &status);
1630     }
1631
1632 cleanUpAndReturn:
1633     fclose(f);
1634     delete []fileBuf;
1635     ucnv_close(conv);
1636     if (U_FAILURE(status)) {
1637         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1638         delete []retPtr;
1639         retPtr = 0;
1640         ulen   = 0;
1641     };
1642     return retPtr;
1643 }
1644
1645
1646
1647 //--------------------------------------------------------------------------------------------
1648 //
1649 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1650 //
1651 //-------------------------------------------------------------------------------------------
1652 void RBBITest::TestUnicodeFiles() {
1653     RuleBasedBreakIterator  *bi;
1654     UErrorCode               status = U_ZERO_ERROR;
1655
1656     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1657     TEST_ASSERT_SUCCESS(status);
1658     if (U_SUCCESS(status)) {
1659         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1660     }
1661     delete bi;
1662
1663     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1664     TEST_ASSERT_SUCCESS(status);
1665     if (U_SUCCESS(status)) {
1666         runUnicodeTestData("WordBreakTest.txt", bi);
1667     }
1668     delete bi;
1669
1670     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1671     TEST_ASSERT_SUCCESS(status);
1672     if (U_SUCCESS(status)) {
1673         runUnicodeTestData("SentenceBreakTest.txt", bi);
1674     }
1675     delete bi;
1676
1677     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1678     TEST_ASSERT_SUCCESS(status);
1679     if (U_SUCCESS(status)) {
1680         runUnicodeTestData("LineBreakTest.txt", bi);
1681     }
1682     delete bi;
1683 }
1684
1685
1686 // Check for test cases from the Unicode test data files that are known to fail
1687 // and should be skipped because ICU is not yet able to fully implement the spec.
1688 // See ticket #7270.
1689
1690 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1691     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1692         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1693         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1694         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1695         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1696         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1697         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1698     };
1699     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1700         return FALSE;
1701     }
1702
1703     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1704         if (testCase == UnicodeString(badTestCases[i])) {
1705             return logKnownIssue("7270");
1706         }
1707     }
1708     return FALSE;
1709 }
1710
1711
1712 //--------------------------------------------------------------------------------------------
1713 //
1714 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1715 //
1716 //-------------------------------------------------------------------------------------------
1717 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1718 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1719     UErrorCode  status = U_ZERO_ERROR;
1720
1721     //
1722     //  Open and read the test data file, put it into a UnicodeString.
1723     //
1724     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1725     char testFileName[1000];
1726     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1727         dataerrln("Can't open test data.  Path too long.");
1728         return;
1729     }
1730     strcpy(testFileName, testDataDirectory);
1731     strcat(testFileName, fileName);
1732
1733     logln("Opening data file %s\n", fileName);
1734
1735     int    len;
1736     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1737     if (status != U_FILE_ACCESS_ERROR) {
1738         TEST_ASSERT_SUCCESS(status);
1739         TEST_ASSERT(testFile != NULL);
1740     }
1741     if (U_FAILURE(status) || testFile == NULL) {
1742         return; /* something went wrong, error already output */
1743     }
1744     UnicodeString testFileAsString(TRUE, testFile, len);
1745
1746     //
1747     //  Parse the test data file using a regular expression.
1748     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1749     //     is identified by which group had a match.
1750     //
1751     //    Caputure Group #                  1          2            3            4           5
1752     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1753     //
1754     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1755     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1756     UnicodeString   testString;
1757     UVector32       breakPositions(status);
1758     int             lineNumber = 1;
1759     TEST_ASSERT_SUCCESS(status);
1760     if (U_FAILURE(status)) {
1761         return;
1762     }
1763
1764     //
1765     //  Scan through each test case, building up the string to be broken in testString,
1766     //   and the positions that should be boundaries in the breakPositions vector.
1767     //
1768     int spin = 0;
1769     while (tokenMatcher.find()) {
1770         if(tokenMatcher.hitEnd()) {
1771           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1772              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1773              and caused an infinite loop here on EBCDIC systems!
1774           */
1775           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1776           //       return;
1777         }
1778         if (tokenMatcher.start(1, status) >= 0) {
1779             // Scanned a divide sign, indicating a break position in the test data.
1780             if (testString.length()>0) {
1781                 breakPositions.addElement(testString.length(), status);
1782             }
1783         }
1784         else if (tokenMatcher.start(2, status) >= 0) {
1785             // Scanned an 'x', meaning no break at this position in the test data
1786             //   Nothing to be done here.
1787             }
1788         else if (tokenMatcher.start(3, status) >= 0) {
1789             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1790             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1791             int length = hexNumber.length();
1792             if (length<=8) {
1793                 char buf[10];
1794                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1795                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1796                 if (c<=0x10ffff) {
1797                     testString.append(c);
1798                 } else {
1799                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1800                        fileName, lineNumber);
1801                 }
1802             } else {
1803                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1804                        fileName, lineNumber);
1805              }
1806         }
1807         else if (tokenMatcher.start(4, status) >= 0) {
1808             // Scanned to end of a line, possibly skipping over a comment in the process.
1809             //   If the line from the file contained test data, run the test now.
1810             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1811                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1812             }
1813
1814             // Clear out this test case.
1815             //    The string and breakPositions vector will be refilled as the next
1816             //       test case is parsed.
1817             testString.remove();
1818             breakPositions.removeAllElements();
1819             lineNumber++;
1820         } else {
1821             // Scanner catchall.  Something unrecognized appeared on the line.
1822             char token[16];
1823             UnicodeString uToken = tokenMatcher.group(0, status);
1824             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1825             token[sizeof(token)-1] = 0;
1826             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1827
1828             // Clean up, in preparation for continuing with the next line.
1829             testString.remove();
1830             breakPositions.removeAllElements();
1831             lineNumber++;
1832         }
1833         TEST_ASSERT_SUCCESS(status);
1834         if (U_FAILURE(status)) {
1835             break;
1836         }
1837     }
1838
1839     delete [] testFile;
1840  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1841 }
1842
1843 //--------------------------------------------------------------------------------------------
1844 //
1845 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1846 //                            test data files.  Do only a simple, forward-only check -
1847 //                            this test is mostly to check that ICU and the Unicode
1848 //                            data agree with each other.
1849 //
1850 //--------------------------------------------------------------------------------------------
1851 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1852                          const UnicodeString &testString,   // Text data to be broken
1853                          UVector32 *breakPositions,         // Positions where breaks should be found.
1854                          RuleBasedBreakIterator *bi) {
1855     int32_t pos;                 // Break Position in the test string
1856     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1857     int32_t expectedPos;         // Expected break position (index into test string)
1858
1859     bi->setText(testString);
1860     pos = bi->first();
1861     pos = bi->next();
1862
1863     while (pos != BreakIterator::DONE) {
1864         if (expectedI >= breakPositions->size()) {
1865             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1866                 testFileName, lineNumber, pos);
1867             break;
1868         }
1869         expectedPos = breakPositions->elementAti(expectedI);
1870         if (pos < expectedPos) {
1871             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1872                 testFileName, lineNumber, pos);
1873             break;
1874         }
1875         if (pos > expectedPos) {
1876             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1877                 testFileName, lineNumber, expectedPos);
1878             break;
1879         }
1880         pos = bi->next();
1881         expectedI++;
1882     }
1883
1884     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1885         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1886             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1887     }
1888 }
1889
1890
1891
1892 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1893 //---------------------------------------------------------------------------------------
1894 //
1895 //   classs RBBIMonkeyKind
1896 //
1897 //      Monkey Test for Break Iteration
1898 //      Abstract interface class.   Concrete derived classes independently
1899 //      implement the break rules for different iterator types.
1900 //
1901 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1902 //      testing, but works purely in terms of the interface defined here.
1903 //
1904 //---------------------------------------------------------------------------------------
1905 class RBBIMonkeyKind {
1906 public:
1907     // Return a UVector of UnicodeSets, representing the character classes used
1908     //   for this type of iterator.
1909     virtual  UVector  *charClasses() = 0;
1910
1911     // Set the test text on which subsequent calls to next() will operate
1912     virtual  void      setText(const UnicodeString &s) = 0;
1913
1914     // Find the next break postion, starting from the prev break position, or from zero.
1915     // Return -1 after reaching end of string.
1916     virtual  int32_t   next(int32_t i) = 0;
1917
1918     virtual ~RBBIMonkeyKind();
1919     UErrorCode       deferredStatus;
1920
1921
1922 protected:
1923     RBBIMonkeyKind();
1924
1925 private:
1926 };
1927
1928 RBBIMonkeyKind::RBBIMonkeyKind() {
1929     deferredStatus = U_ZERO_ERROR;
1930 }
1931
1932 RBBIMonkeyKind::~RBBIMonkeyKind() {
1933 }
1934
1935
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------

// Current state of the generator.  Assign to it to (re)start a sequence.
static uint32_t m_seed = 1;

// Advance the generator one step, and return the next value, in the range 0 ... 32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // 32 bit unsigned arithmetic; wrap-around is intended.
    m_seed = m_seed * kMultiplier + kIncrement;

    // The result is bits 16 through 30 of the new seed.
    return (m_seed >> 16) & 0x7fff;
}
1951
1952
//
// UnicodeSet pattern for the Extended Pictographic characters.
// Data scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
//
// The adjacent string literals below concatenate into a single bracketed set pattern.
//
static const char *gExtended_Pict =
    "["
    "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
    "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
    "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
    "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
    "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
    "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
    "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
    "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
    "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
    "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
    "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
    "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
    "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
    "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
    "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
    "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
    "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
    "]";
1977
1978 //------------------------------------------------------------------------------------------
1979 //
1980 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1981 //                             of RBBIMonkeyKind.
1982 //
1983 //------------------------------------------------------------------------------------------
1984 class RBBICharMonkey: public RBBIMonkeyKind {
1985 public:
1986     RBBICharMonkey();
1987     virtual          ~RBBICharMonkey();
1988     virtual  UVector *charClasses();
1989     virtual  void     setText(const UnicodeString &s);
1990     virtual  int32_t  next(int32_t i);
1991 private:
1992     UVector   *fSets;
1993
1994     UnicodeSet  *fCRLFSet;
1995     UnicodeSet  *fControlSet;
1996     UnicodeSet  *fExtendSet;
1997     UnicodeSet  *fZWJSet;
1998     UnicodeSet  *fRegionalIndicatorSet;
1999     UnicodeSet  *fPrependSet;
2000     UnicodeSet  *fSpacingSet;
2001     UnicodeSet  *fLSet;
2002     UnicodeSet  *fVSet;
2003     UnicodeSet  *fTSet;
2004     UnicodeSet  *fLVSet;
2005     UnicodeSet  *fLVTSet;
2006     UnicodeSet  *fHangulSet;
2007     UnicodeSet  *fEmojiBaseSet;
2008     UnicodeSet  *fEmojiModifierSet;
2009     UnicodeSet  *fExtendedPictSet;
2010     UnicodeSet  *fEBGSet;
2011     UnicodeSet  *fEmojiNRKSet;
2012     UnicodeSet  *fAnySet;
2013
2014     const UnicodeString *fText;
2015 };
2016
2017
2018 RBBICharMonkey::RBBICharMonkey() {
2019     UErrorCode  status = U_ZERO_ERROR;
2020
2021     fText = NULL;
2022
2023     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2024     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2025     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2026     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2027     fRegionalIndicatorSet =
2028                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2029     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2030     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2031     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2032     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2033     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2034     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2035     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2036     fHangulSet  = new UnicodeSet();
2037     fHangulSet->addAll(*fLSet);
2038     fHangulSet->addAll(*fVSet);
2039     fHangulSet->addAll(*fTSet);
2040     fHangulSet->addAll(*fLVSet);
2041     fHangulSet->addAll(*fLVTSet);
2042
2043     fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2044     fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2045     fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2046     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
2047     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2048                 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2049     fAnySet           = new UnicodeSet(0, 0x10ffff);
2050
2051     fSets             = new UVector(status);
2052     fSets->addElement(fCRLFSet,    status);
2053     fSets->addElement(fControlSet, status);
2054     fSets->addElement(fExtendSet,  status);
2055     fSets->addElement(fRegionalIndicatorSet, status);
2056     if (!fPrependSet->isEmpty()) {
2057         fSets->addElement(fPrependSet, status);
2058     }
2059     fSets->addElement(fSpacingSet, status);
2060     fSets->addElement(fHangulSet,  status);
2061     fSets->addElement(fAnySet,     status);
2062     fSets->addElement(fEmojiBaseSet, status);
2063     fSets->addElement(fEmojiModifierSet, status);
2064     fSets->addElement(fZWJSet,     status);
2065     fSets->addElement(fExtendedPictSet, status);
2066     fSets->addElement(fEBGSet,     status);
2067     fSets->addElement(fEmojiNRKSet,status);
2068     if (U_FAILURE(status)) {
2069         deferredStatus = status;
2070     }
2071 }
2072
2073
2074 void RBBICharMonkey::setText(const UnicodeString &s) {
2075     fText = &s;
2076 }
2077
2078
2079
2080 int32_t RBBICharMonkey::next(int32_t prevPos) {
2081     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2082                               //   break position being tested.  The candidate break
2083                               //   location is before p2.
2084
2085     int     breakPos = -1;
2086
2087     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2088     UChar32 cBase;            // for (X Extend*) patterns, the X character.
2089
2090     if (U_FAILURE(deferredStatus)) {
2091         return -1;
2092     }
2093
2094     // Previous break at end of string.  return DONE.
2095     if (prevPos >= fText->length()) {
2096         return -1;
2097     }
2098     p0 = p1 = p2 = p3 = prevPos;
2099     c3 =  fText->char32At(prevPos);
2100     c0 = c1 = c2 = cBase = 0;
2101     (void)p0;   // suppress set but not used warning.
2102     (void)c0;
2103
2104     // Loop runs once per "significant" character position in the input text.
2105     for (;;) {
2106         // Move all of the positions forward in the input string.
2107         p0 = p1;  c0 = c1;
2108         p1 = p2;  c1 = c2;
2109         p2 = p3;  c2 = c3;
2110
2111         // Advancd p3 by one codepoint
2112         p3 = fText->moveIndex32(p3, 1);
2113         c3 = fText->char32At(p3);
2114
2115         if (p1 == p2) {
2116             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2117             continue;
2118         }
2119         if (p2 == fText->length()) {
2120             // Reached end of string.  Always a break position.
2121             break;
2122         }
2123
2124         // Rule  GB3   CR x LF
2125         //     No Extend or Format characters may appear between the CR and LF,
2126         //     which requires the additional check for p2 immediately following p1.
2127         //
2128         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2129             continue;
2130         }
2131
2132         // Rule (GB4).   ( Control | CR | LF ) <break>
2133         if (fControlSet->contains(c1) ||
2134             c1 == 0x0D ||
2135             c1 == 0x0A)  {
2136             break;
2137         }
2138
2139         // Rule (GB5)    <break>  ( Control | CR | LF )
2140         //
2141         if (fControlSet->contains(c2) ||
2142             c2 == 0x0D ||
2143             c2 == 0x0A)  {
2144             break;
2145         }
2146
2147
2148         // Rule (GB6)  L x ( L | V | LV | LVT )
2149         if (fLSet->contains(c1) &&
2150                (fLSet->contains(c2)  ||
2151                 fVSet->contains(c2)  ||
2152                 fLVSet->contains(c2) ||
2153                 fLVTSet->contains(c2))) {
2154             continue;
2155         }
2156
2157         // Rule (GB7)    ( LV | V )  x  ( V | T )
2158         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2159             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2160             continue;
2161         }
2162
2163         // Rule (GB8)    ( LVT | T)  x T
2164         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2165             fTSet->contains(c2))  {
2166             continue;
2167         }
2168
2169         // Rule (GB9)    x (Extend | ZWJ)
2170         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
2171             if (!fExtendSet->contains(c1)) {
2172                 cBase = c1;
2173             }
2174             continue;
2175         }
2176
2177         // Rule (GB9a)   x  SpacingMark
2178         if (fSpacingSet->contains(c2)) {
2179             continue;
2180         }
2181
2182         // Rule (GB9b)   Prepend x
2183         if (fPrependSet->contains(c1)) {
2184             continue;
2185         }
2186
2187         // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
2188         if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2189             continue;
2190         }
2191         if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
2192                 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2193             continue;
2194         }
2195
2196         // Rule (GB11)   (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
2197         if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
2198                 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2199             continue;
2200         }
2201
2202         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
2203         //                   Note: The first if condition is a little tricky. We only need to force
2204         //                      a break if there are three or more contiguous RIs. If there are
2205         //                      only two, a break following will occur via other rules, and will include
2206         //                      any trailing extend characters, which is needed behavior.
2207         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2208                 && fRegionalIndicatorSet->contains(c2)) {
2209             break;
2210         }
2211         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2212             continue;
2213         }
2214
2215         // Rule (GB999)  Any  <break>  Any
2216         break;
2217     }
2218
2219     breakPos = p2;
2220     return breakPos;
2221 }
2222
2223
2224
2225 UVector  *RBBICharMonkey::charClasses() {
2226     return fSets;
2227 }
2228
2229
2230 RBBICharMonkey::~RBBICharMonkey() {
2231     delete fSets;
2232     delete fCRLFSet;
2233     delete fControlSet;
2234     delete fExtendSet;
2235     delete fRegionalIndicatorSet;
2236     delete fPrependSet;
2237     delete fSpacingSet;
2238     delete fLSet;
2239     delete fVSet;
2240     delete fTSet;
2241     delete fLVSet;
2242     delete fLVTSet;
2243     delete fHangulSet;
2244     delete fAnySet;
2245     delete fEmojiBaseSet;
2246     delete fEmojiModifierSet;
2247     delete fZWJSet;
2248     delete fExtendedPictSet;
2249     delete fEBGSet;
2250     delete fEmojiNRKSet;
2251 }
2252
2253 //------------------------------------------------------------------------------------------
2254 //
2255 //   class RBBIWordMonkey      Word Break specific implementation
2256 //                             of RBBIMonkeyKind.
2257 //
2258 //------------------------------------------------------------------------------------------
2259 class RBBIWordMonkey: public RBBIMonkeyKind {
2260 public:
2261     RBBIWordMonkey();
2262     virtual          ~RBBIWordMonkey();
2263     virtual  UVector *charClasses();
2264     virtual  void     setText(const UnicodeString &s);
2265     virtual int32_t   next(int32_t i);
2266 private:
2267     UVector      *fSets;
2268
2269     UnicodeSet  *fCRSet;
2270     UnicodeSet  *fLFSet;
2271     UnicodeSet  *fNewlineSet;
2272     UnicodeSet  *fRegionalIndicatorSet;
2273     UnicodeSet  *fKatakanaSet;
2274     UnicodeSet  *fHebrew_LetterSet;
2275     UnicodeSet  *fALetterSet;
2276     UnicodeSet  *fSingle_QuoteSet;
2277     UnicodeSet  *fDouble_QuoteSet;
2278     UnicodeSet  *fMidNumLetSet;
2279     UnicodeSet  *fMidLetterSet;
2280     UnicodeSet  *fMidNumSet;
2281     UnicodeSet  *fNumericSet;
2282     UnicodeSet  *fFormatSet;
2283     UnicodeSet  *fOtherSet;
2284     UnicodeSet  *fExtendSet;
2285     UnicodeSet  *fExtendNumLetSet;
2286     UnicodeSet  *fDictionarySet;
2287     UnicodeSet  *fEBaseSet;
2288     UnicodeSet  *fEBGSet;
2289     UnicodeSet  *fEModifierSet;
2290     UnicodeSet  *fZWJSet;
2291     UnicodeSet  *fExtendedPictSet;
2292     UnicodeSet  *fEmojiNRKSet;
2293
2294     const UnicodeString  *fText;
2295 };
2296
2297
2298 RBBIWordMonkey::RBBIWordMonkey()
2299 {
         // Builds the UnicodeSet for every word-break character class, derives
         // fOtherSet as the complement of those sets, and registers the classes in
         // fSets.  Any failure status is copied into deferredStatus, which next()
         // checks before doing any work.
2300     UErrorCode  status = U_ZERO_ERROR;
2301
2302     fSets            = new UVector(status);
2303
2304     fCRSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2305     fLFSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2306     fNewlineSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2307     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2308     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2309     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2310     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2311     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2312     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2313     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2314     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2315     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2316     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2317     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2318     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2319     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2320
         // Emoji-related classes.  E_Base is the Word_Break=EB property plus an
         // explicit list of additional code points.
2321     fEBaseSet         = new UnicodeSet(UNICODE_STRING_SIMPLE(
2322             "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2323     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"),          status);
2324     fEModifierSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"),           status);
2325     fZWJSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"),          status);
2326     fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);   // Pattern text is defined outside this function.
2327     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2328             "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2329
         // Characters treated as "dictionary" characters: U+AC00-U+D7A3, Han,
         // Hiragana, Katakana and everything with LineBreak=Complex_Context.
2330     fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
2331     fDictionarySet->addAll(*fKatakanaSet);
2332     fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2333
         // Dictionary characters must not be picked as ordinary letters.
2334     fALetterSet->removeAll(*fDictionarySet);
2335
2336     fOtherSet        = new UnicodeSet();
         // All member sets have been allocated at this point.  On failure stop here:
         // fSets stays empty and next() returns -1 because deferredStatus is set.
2337     if(U_FAILURE(status)) {
2338       deferredStatus = status;
2339       return;
2340     }
2341
         // "Other" starts as every code point and then loses each class below.
         // NOTE(review): fMidNumLetSet is not removed here, so fOtherSet still
         // overlaps it -- confirm whether that is intentional.
2342     fOtherSet->complement();
2343     fOtherSet->removeAll(*fCRSet);
2344     fOtherSet->removeAll(*fLFSet);
2345     fOtherSet->removeAll(*fNewlineSet);
2346     fOtherSet->removeAll(*fKatakanaSet);
2347     fOtherSet->removeAll(*fHebrew_LetterSet);
2348     fOtherSet->removeAll(*fALetterSet);
2349     fOtherSet->removeAll(*fSingle_QuoteSet);
2350     fOtherSet->removeAll(*fDouble_QuoteSet);
2351     fOtherSet->removeAll(*fMidLetterSet);
2352     fOtherSet->removeAll(*fMidNumSet);
2353     fOtherSet->removeAll(*fNumericSet);
2354     fOtherSet->removeAll(*fExtendNumLetSet);
2355     fOtherSet->removeAll(*fFormatSet);
2356     fOtherSet->removeAll(*fExtendSet);
2357     fOtherSet->removeAll(*fRegionalIndicatorSet);
2358     fOtherSet->removeAll(*fEBaseSet);
2359     fOtherSet->removeAll(*fEBGSet);
2360     fOtherSet->removeAll(*fEModifierSet);
2361     fOtherSet->removeAll(*fZWJSet);
2362     fOtherSet->removeAll(*fExtendedPictSet);
2363     fOtherSet->removeAll(*fEmojiNRKSet);
2364
2365     // Inhibit dictionary characters from being tested at all.
2366     fOtherSet->removeAll(*fDictionarySet);
2367
         // Register the classes.  The order of these calls fixes each class's index
         // within fSets.
2368     fSets->addElement(fCRSet,                status);
2369     fSets->addElement(fLFSet,                status);
2370     fSets->addElement(fNewlineSet,           status);
2371     fSets->addElement(fRegionalIndicatorSet, status);
2372     fSets->addElement(fHebrew_LetterSet,     status);
2373     fSets->addElement(fALetterSet,           status);
2374     fSets->addElement(fSingle_QuoteSet,      status);
2375     fSets->addElement(fDouble_QuoteSet,      status);
2376     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
2377                                                         // from the test data. They are all in the dictionary set,
2378                                                         // which this (old, to be retired) monkey test cannot handle.
2379     fSets->addElement(fMidLetterSet,         status);
2380     fSets->addElement(fMidNumLetSet,         status);
2381     fSets->addElement(fMidNumSet,            status);
2382     fSets->addElement(fNumericSet,           status);
2383     fSets->addElement(fFormatSet,            status);
2384     fSets->addElement(fExtendSet,            status);
2385     fSets->addElement(fOtherSet,             status);
2386     fSets->addElement(fExtendNumLetSet,      status);
2387
2388     fSets->addElement(fEBaseSet,             status);
2389     fSets->addElement(fEBGSet,               status);
2390     fSets->addElement(fEModifierSet,         status);
2391     fSets->addElement(fZWJSet,               status);
2392     fSets->addElement(fExtendedPictSet,      status);
2393     fSets->addElement(fEmojiNRKSet,          status);
2394
         // A failure while filling fSets is reported the same way as a set-building failure.
2395     if (U_FAILURE(status)) {
2396         deferredStatus = status;
2397     }
2398 }
2399
2400 void RBBIWordMonkey::setText(const UnicodeString &s) {
         // Only the address of s is kept, so s must stay alive and unchanged for as
         // long as next() is called against it.
2401     fText       = &s;
2402 }
2403
2404
2405 int32_t RBBIWordMonkey::next(int32_t prevPos) {
         // Returns the index of the first word boundary after prevPos in the text
         // given to setText(), found by testing the word-break rules below in order
         // at each candidate position.  A rule that forbids a break does "continue"
         // (try the next position); a rule that requires one does "break".
         // Returns -1 if the constructor recorded a failure in deferredStatus, or if
         // prevPos is already at or beyond the end of the text.
2406     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2407                               //   break position being tested.  The candidate break
2408                               //   location is before p2.
2409
2410     int     breakPos = -1;
2411
2412     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2413
2414     if (U_FAILURE(deferredStatus)) {
2415         return -1;
2416     }
2417
2418     // Prev break at end of string.  return DONE.
2419     if (prevPos >= fText->length()) {
2420         return -1;
2421     }
2422     p0 = p1 = p2 = p3 = prevPos;
2423     c3 =  fText->char32At(prevPos);
2424     c0 = c1 = c2 = 0;
2425     (void)p0;       // Suppress set but not used warning.
2426
2427     // Loop runs once per "significant" character position in the input text.
2428     for (;;) {
2429         // Move all of the positions forward in the input string.
2430         p0 = p1;  c0 = c1;
2431         p1 = p2;  c1 = c2;
2432         p2 = p3;  c2 = c3;
2433
2434         // Advance p3 by    X(Extend | Format | ZWJ)*   Rule 4
2435         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2436         do {
2437             p3 = fText->moveIndex32(p3, 1);
2438             c3 = fText->char32At(p3);
2439             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2440                break;
2441             };
2442         }
2443         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2444
2445
2446         if (p1 == p2) {
2447             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2448             continue;
2449         }
2450         if (p2 == fText->length()) {
2451             // Reached end of string.  Always a break position.
2452             break;
2453         }
2454
2455         // Rule  (3)   CR x LF
2456         //     No Extend or Format characters may appear between the CR and LF.
2457         //     No explicit adjacency test is made here: the p3 loop above stops after a single
2458         //     step whenever c2 is CR, LF or Newline, so p2 directly follows a CR at p1.
2459         if (c1==0x0D && c2==0x0A) {
2460             continue;
2461         }
2462
2463         // Rule (3a)  Break before and after newlines (including CR and LF)
2464         //
2465         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2466             break;
2467         };
2468         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2469             break;
2470         };
2471
2472         // Rule (3c)    ZWJ x (Extended_Pict | EmojiNRK).
2473         //              Not ignoring extend chars, so peek into input text to
2474         //              get the potential ZWJ, the character immediately preceding c2.
2475         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2476         //              but char32At will get the full code point.
2477         if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2478             continue;
2479         }
2480
2481         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2482         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2483             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2484             continue;
2485         }
2486
2487         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2488         //
2489         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2490              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2491              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2492             continue;
2493         }
2494
2495         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2496         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2497             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2498             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2499             continue;
2500         }
2501
2502         // Rule (7a)     Hebrew_Letter x Single_Quote
2503         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2504             continue;
2505         }
2506
2507         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2508         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2509             continue;
2510         }
2511
2512         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2513         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2514             continue;
2515         }
2516
2517         // Rule (8)    Numeric x Numeric
2518         if (fNumericSet->contains(c1) &&
2519             fNumericSet->contains(c2))  {
2520             continue;
2521         }
2522
2523         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2524         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2525             fNumericSet->contains(c2))  {
2526             continue;
2527         }
2528
2529         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2530         if (fNumericSet->contains(c1) &&
2531             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2532             continue;
2533         }
2534
2535         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2536         if (fNumericSet->contains(c0) &&
2537             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2538             fNumericSet->contains(c2)) {
2539             continue;
2540         }
2541
2542         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2543         if (fNumericSet->contains(c1) &&
2544             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2545             fNumericSet->contains(c3)) {
2546             continue;
2547         }
2548
2549         // Rule (13)  Katakana x Katakana
2550         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2551         //                  all Katakana are handled by the dictionary breaker.
2552         if (fKatakanaSet->contains(c1) &&
2553             fKatakanaSet->contains(c2))  {
2554             continue;
2555         }
2556
2557         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2558         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2559              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2560              fExtendNumLetSet->contains(c2)) {
2561                 continue;
2562         }
2563
2564         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2565         if (fExtendNumLetSet->contains(c1) &&
2566                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2567                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2568             continue;
2569         }
2570
2571         // WB 14  (E_Base | EBG) x E_Modifier
2572         if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
2573             continue;
2574         }
2575
2576         // Rule 15 - 17   Group pairs of Regional Indicators.
         //                Break after a pair (c0 and c1 both RI); otherwise keep two
         //                adjacent Regional Indicators (c1 and c2) together.
2577         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2578             break;
2579         }
2580         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2581             continue;
2582         }
2583
2584         // Rule 999.  Break found here.
2585         break;
2586     }
2587
         // The boundary is immediately before the code point at p2.
2588     breakPos = p2;
2589     return breakPos;
2590 }
2591
2592
2593 UVector  *RBBIWordMonkey::charClasses() {
         // Hands out the vector of character-class sets filled in by the constructor.
         // The vector remains owned by this object (it is deleted in the destructor).
2594     return fSets;
2595 }
2596
2597
2598 RBBIWordMonkey::~RBBIWordMonkey() {
2599     delete fSets;
2600     delete fCRSet;
2601     delete fLFSet;
2602     delete fNewlineSet;
2603     delete fKatakanaSet;
2604     delete fHebrew_LetterSet;
2605     delete fALetterSet;
2606     delete fSingle_QuoteSet;
2607     delete fDouble_QuoteSet;
2608     delete fMidNumLetSet;
2609     delete fMidLetterSet;
2610     delete fMidNumSet;
2611     delete fNumericSet;
2612     delete fFormatSet;
2613     delete fExtendSet;
2614     delete fExtendNumLetSet;
2615     delete fRegionalIndicatorSet;
2616     delete fDictionarySet;
2617     delete fOtherSet;
2618     delete fEBaseSet;
2619     delete fEBGSet;
2620     delete fEModifierSet;
2621     delete fZWJSet;
2622     delete fExtendedPictSet;
2623     delete fEmojiNRKSet;
2624 }
2625
2626
2627
2628
2629 //------------------------------------------------------------------------------------------
2630 //
2631 //   class RBBISentMonkey      Sentence Break specific implementation
2632 //                             of RBBIMonkeyKind.
2633 //
2634 //------------------------------------------------------------------------------------------
2635 class RBBISentMonkey: public RBBIMonkeyKind {
         // Sentence-break flavor of RBBIMonkeyKind.  Holds one UnicodeSet per
         // Sentence_Break character class; next() applies the sentence-break rules
         // directly to the text supplied through setText().
2636 public:
2637     RBBISentMonkey();
2638     virtual          ~RBBISentMonkey();
2639     virtual  UVector *charClasses();                    // Returns fSets.
2640     virtual  void     setText(const UnicodeString &s);  // Stores a pointer to s; s is not copied.
2641     virtual int32_t   next(int32_t i);                  // Boundary after index i, or -1 (end of text / failed setup).
2642 private:
2643     int               moveBack(int posFrom);      // Previous significant position, skipping Extend/Format; -1 if posFrom <= 0.
2644     int               moveForward(int posFrom);   // Next significant position, skipping Extend/Format.
2645     UChar32           cAt(int pos);               // Code point at pos, or -1 when pos is outside the text.
2646
2647     UVector      *fSets;          // The character-class sets returned by charClasses().
2648
2649     UnicodeSet  *fSepSet;         // Sentence_Break=Sep plus U+000A and U+000D.
2650     UnicodeSet  *fFormatSet;
2651     UnicodeSet  *fSpSet;
2652     UnicodeSet  *fLowerSet;
2653     UnicodeSet  *fUpperSet;
2654     UnicodeSet  *fOLetterSet;
2655     UnicodeSet  *fNumericSet;
2656     UnicodeSet  *fATermSet;
2657     UnicodeSet  *fSContinueSet;
2658     UnicodeSet  *fSTermSet;
2659     UnicodeSet  *fCloseSet;
2660     UnicodeSet  *fOtherSet;       // Complement of all the other sets in this class.
2661     UnicodeSet  *fExtendSet;
2662
2663     const UnicodeString  *fText;  // Set by setText(); not owned and not initialized by the constructor.
2664
2665 };
2666
2667 RBBISentMonkey::RBBISentMonkey()
2668 {
         // Builds the UnicodeSet for every sentence-break character class, derives
         // fOtherSet as the complement of those sets, and registers all of them in
         // fSets.  Any failure status is copied into deferredStatus, which next()
         // checks before doing any work.
2669     UErrorCode  status = U_ZERO_ERROR;
2670
2671     fSets            = new UVector(status);
2672
2673     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2674     //                       set and made into character classes of their own.  For the monkey impl,
2675     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2676     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2677     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2678     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2679     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2680     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2681     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2682     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2683     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2684     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2685     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2686     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2687     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2688     fOtherSet        = new UnicodeSet();
2689
         // All member sets have been allocated at this point.  On failure stop here:
         // fSets stays empty and next() returns -1 because deferredStatus is set.
2690     if(U_FAILURE(status)) {
2691       deferredStatus = status;
2692       return;
2693     }
2694
         // "Other" starts as every code point and then loses each class built above.
2695     fOtherSet->complement();
2696     fOtherSet->removeAll(*fSepSet);
2697     fOtherSet->removeAll(*fFormatSet);
2698     fOtherSet->removeAll(*fSpSet);
2699     fOtherSet->removeAll(*fLowerSet);
2700     fOtherSet->removeAll(*fUpperSet);
2701     fOtherSet->removeAll(*fOLetterSet);
2702     fOtherSet->removeAll(*fNumericSet);
2703     fOtherSet->removeAll(*fATermSet);
2704     fOtherSet->removeAll(*fSContinueSet);
2705     fOtherSet->removeAll(*fSTermSet);
2706     fOtherSet->removeAll(*fCloseSet);
2707     fOtherSet->removeAll(*fExtendSet);
2708
         // Register the classes.  The order of these calls fixes each class's index
         // within fSets.
2709     fSets->addElement(fSepSet,       status);
2710     fSets->addElement(fFormatSet,    status);
2711     fSets->addElement(fSpSet,        status);
2712     fSets->addElement(fLowerSet,     status);
2713     fSets->addElement(fUpperSet,     status);
2714     fSets->addElement(fOLetterSet,   status);
2715     fSets->addElement(fNumericSet,   status);
2716     fSets->addElement(fATermSet,     status);
2717     fSets->addElement(fSContinueSet, status);
2718     fSets->addElement(fSTermSet,     status);
2719     fSets->addElement(fCloseSet,     status);
2720     fSets->addElement(fOtherSet,     status);
2721     fSets->addElement(fExtendSet,    status);
2722
         // A failure while filling fSets is reported the same way as a set-building failure.
2723     if (U_FAILURE(status)) {
2724         deferredStatus = status;
2725     }
2726 }
2727
2728
2729
2730 void RBBISentMonkey::setText(const UnicodeString &s) {
         // Only the address of s is kept, so s must stay alive and unchanged for as
         // long as next() is called against it.
2731     fText       = &s;
2732 }
2733
2734 UVector  *RBBISentMonkey::charClasses() {
         // Hands out the vector of character-class sets filled in by the constructor.
         // The vector remains owned by this object (it is deleted in the destructor).
2735     return fSets;
2736 }
2737
2738
2739 //  moveBack()   Find the "significant" code point preceding the index i.
2740 //               Skips over ($Extend | $Format)* .
2741 //
2742 int RBBISentMonkey::moveBack(int i) {
2743     if (i <= 0) {
2744         return -1;
2745     }
2746     UChar32   c;
2747     int32_t   j = i;
2748     do {
2749         j = fText->moveIndex32(j, -1);
2750         c = fText->char32At(j);
2751     }
2752     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2753     return j;
2754
2755  }
2756
2757
2758 int RBBISentMonkey::moveForward(int i) {
2759     // Advance one code point from i, then keep advancing while the code point landed
2760     // on is Format or Extend.  A start at or past the end returns the text length.
2761     const int32_t textLength = fText->length();
2762     if (i >= textLength) {
2763         return textLength;
2764     }
2765     int32_t pos = fText->moveIndex32(i, 1);
2766     for (UChar32 ch = cAt(pos); fFormatSet->contains(ch) || fExtendSet->contains(ch); ch = cAt(pos)) {
2767         pos = fText->moveIndex32(pos, 1);
2768     }
2769     return pos;
2770 }
2771
2772 UChar32 RBBISentMonkey::cAt(int pos) {
2773     // Bounds-checked read: any pos outside [0, length) yields the sentinel -1.
2774     if (0 <= pos && pos < fText->length()) {
2775         return fText->char32At(pos);
2776     }
2777     return -1;
2778 }
2779
2780 int32_t RBBISentMonkey::next(int32_t prevPos) {
         // Returns the index of the first sentence boundary after prevPos in the text
         // given to setText(), found by testing the sentence-break rules below in
         // order at each candidate position.  "continue" means no break here (try
         // the next position); "break" means the boundary has been found.
         // Returns -1 if the constructor recorded a failure in deferredStatus, or if
         // prevPos is already at or beyond the end of the text.
2781     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2782                               //   break position being tested.  The candidate break
2783                               //   location is before p2.
2784
2785     int     breakPos = -1;
2786
2787     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2788     UChar32 c;
2789
2790     if (U_FAILURE(deferredStatus)) {
2791         return -1;
2792     }
2793
2794     // Prev break at end of string.  return DONE.
2795     if (prevPos >= fText->length()) {
2796         return -1;
2797     }
2798     p0 = p1 = p2 = p3 = prevPos;
2799     c3 =  fText->char32At(prevPos);
2800     c0 = c1 = c2 = 0;
2801     (void)p0;     // Suppress set but not used warning.
2802
2803     // Loop runs once per "significant" character position in the input text.
2804     for (;;) {
2805         // Move all of the positions forward in the input string.
2806         p0 = p1;  c0 = c1;
2807         p1 = p2;  c1 = c2;
2808         p2 = p3;  c2 = c3;
2809
2810         // Advance p3 by    X(Extend | Format)*   Rule 4
2811         p3 = moveForward(p3);
2812         c3 = cAt(p3);
2813
2814         // Rule (3)  CR x LF
         //           The p2==(p1+1) test rejects a CR and LF separated by skipped Extend/Format.
2815         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2816             continue;
2817         }
2818
2819         // Rule (4).   Sep  <break>
2820         if (fSepSet->contains(c1)) {
2821             p2 = p1+1;   // Separators don't combine with Extend or Format.
2822             break;
2823         }
2824
2825         if (p2 >= fText->length()) {
2826             // Reached end of string.  Always a break position.
2827             break;
2828         }
2829
2830         if (p2 == prevPos) {
2831             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2832             continue;
2833         }
2834
2835         // Rule (6).   ATerm x Numeric
2836         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2837             continue;
2838         }
2839
2840         // Rule (7).  (Upper | Lower) ATerm  x  Upper
2841         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2842                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2843             continue;
2844         }
2845
2846         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2847         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2848         //                  note to the Unicode 5.0 documents.
         //           Scan back from p1 over Sp* then Close*; if that lands on an ATerm, scan
         //           forward from p2 to the first character of a stopping class (or end of text).
2849         int p8 = p1;
2850         while (fSpSet->contains(cAt(p8))) {
2851             p8 = moveBack(p8);
2852         }
2853         while (fCloseSet->contains(cAt(p8))) {
2854             p8 = moveBack(p8);
2855         }
2856         if (fATermSet->contains(cAt(p8))) {
2857             p8=p2;
2858             for (;;) {
2859                 c = cAt(p8);
2860                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2861                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2862                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2863                     break;
2864                 }
2865                 p8 = moveForward(p8);
2866             }
2867             if (fLowerSet->contains(cAt(p8))) {
2868                 continue;
2869             }
2870         }
2871
2872         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2873         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2874             p8 = p1;
2875             while (fSpSet->contains(cAt(p8))) {
2876                 p8 = moveBack(p8);
2877             }
2878             while (fCloseSet->contains(cAt(p8))) {
2879                 p8 = moveBack(p8);
2880             }
2881             c = cAt(p8);
2882             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2883                 continue;
2884             }
2885         }
2886
2887         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
         //           CR and LF need no separate test: the constructor puts them in fSepSet.
2888         int p9 = p1;
2889         while (fCloseSet->contains(cAt(p9))) {
2890             p9 = moveBack(p9);
2891         }
2892         c = cAt(p9);
2893         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2894             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2895                 continue;
2896             }
2897         }
2898
2899         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2900         int p10 = p1;
2901         while (fSpSet->contains(cAt(p10))) {
2902             p10 = moveBack(p10);
2903         }
2904         while (fCloseSet->contains(cAt(p10))) {
2905             p10 = moveBack(p10);
2906         }
2907         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2908             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2909                 continue;
2910             }
2911         }
2912
2913         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2914         int p11 = p1;
2915         if (fSepSet->contains(cAt(p11))) {
2916             p11 = moveBack(p11);
2917         }
2918         while (fSpSet->contains(cAt(p11))) {
2919             p11 = moveBack(p11);
2920         }
2921         while (fCloseSet->contains(cAt(p11))) {
2922             p11 = moveBack(p11);
2923         }
2924         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2925             break;
2926         }
2927
2928         //  Rule (12)  Any x Any
2929         continue;
2930     }
         // The boundary is immediately before the code point at p2.
2931     breakPos = p2;
2932     return breakPos;
2933 }
2934
2935 RBBISentMonkey::~RBBISentMonkey() {
2936     // Frees the class list, then every UnicodeSet allocated by the constructor.
2937     delete fSets;
2938
2939     UnicodeSet *const ownedSets[] = {
2940         fSepSet,       fFormatSet,    fSpSet,
2941         fLowerSet,     fUpperSet,     fOLetterSet,
2942         fNumericSet,   fATermSet,     fSContinueSet,
2943         fSTermSet,     fCloseSet,     fOtherSet,
2944         fExtendSet
2945     };
2946     const int32_t ownedCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
2947     for (int32_t k = 0; k < ownedCount; ++k) {
2948         delete ownedSets[k];
2949     }
2950 }
2951
2952
2953
2954 //-------------------------------------------------------------------------------------------
2955 //
2956 //  RBBILineMonkey
2957 //
2958 //-------------------------------------------------------------------------------------------
2959
2960 class RBBILineMonkey: public RBBIMonkeyKind {
         // Line-break flavor of RBBIMonkeyKind.  Unless noted otherwise, each
         // two-letter UnicodeSet member below is built by the constructor from the
         // Line_Break property value of the same name (fBK from Line_Break=BK, ...).
2961 public:
2962     RBBILineMonkey();
2963     virtual          ~RBBILineMonkey();
2964     virtual  UVector *charClasses();
2965     virtual  void     setText(const UnicodeString &s);
2966     virtual  int32_t  next(int32_t i);
         // NOTE(review): defined outside this part of the file; presumably the
         // combining-mark adjustment of line-break rule 9 -- confirm against the definition.
2967     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2968 private:
2969     UVector      *fSets;          // Starts as NULL; allocated in the constructor body.
2970
2971     UnicodeSet  *fBK;
2972     UnicodeSet  *fCR;
2973     UnicodeSet  *fLF;
2974     UnicodeSet  *fCM;
2975     UnicodeSet  *fNL;
2976     UnicodeSet  *fSG;             // Built from the range U+D800-U+DFFF, not from a property.
2977     UnicodeSet  *fWJ;
2978     UnicodeSet  *fZW;
2979     UnicodeSet  *fGL;
2980     UnicodeSet  *fCB;
2981     UnicodeSet  *fSP;
2982     UnicodeSet  *fB2;
2983     UnicodeSet  *fBA;
2984     UnicodeSet  *fBB;
2985     UnicodeSet  *fHY;
2986     UnicodeSet  *fH2;
2987     UnicodeSet  *fH3;
2988     UnicodeSet  *fCL;
2989     UnicodeSet  *fCP;
2990     UnicodeSet  *fEX;
2991     UnicodeSet  *fIN;
2992     UnicodeSet  *fJL;
2993     UnicodeSet  *fJV;
2994     UnicodeSet  *fJT;
2995     UnicodeSet  *fNS;
2996     UnicodeSet  *fOP;
2997     UnicodeSet  *fQU;
2998     UnicodeSet  *fIS;
2999     UnicodeSet  *fNU;
3000     UnicodeSet  *fPO;
3001     UnicodeSet  *fPR;
3002     UnicodeSet  *fSY;
3003     UnicodeSet  *fAI;
3004     UnicodeSet  *fAL;             // The constructor also adds the contents of fXX, fAI and fSG.
3005     UnicodeSet  *fCJ;
3006     UnicodeSet  *fHL;
3007     UnicodeSet  *fID;
3008     UnicodeSet  *fRI;
3009     UnicodeSet  *fXX;
3010     UnicodeSet  *fEB;             // Line_Break=EB plus a hard-coded list of extra code points.
3011     UnicodeSet  *fEM;
3012     UnicodeSet  *fZJ;             // Line_Break=ZWJ.
3013     UnicodeSet  *fExtendedPict;   // Built from the gExtended_Pict pattern text.
3014     UnicodeSet  *fEmojiNRK;       // \p{Emoji} minus Line_Break=RI, * # 0-9 and a few symbols.
3015
3016     BreakIterator        *fCharBI;          // Starts as NULL in the constructor's initializer list.
3017     const UnicodeString  *fText;            // Starts as NULL in the constructor's initializer list.
3018     RegexMatcher         *fNumberMatcher;   // Starts as NULL in the constructor's initializer list.
3019 };
3020
3021 RBBILineMonkey::RBBILineMonkey() :
3022     RBBIMonkeyKind(),
3023     fSets(NULL),
3024
3025     fCharBI(NULL),
3026     fText(NULL),
3027     fNumberMatcher(NULL)
3028
3029 {
3030     if (U_FAILURE(deferredStatus)) {
3031         return;
3032     }
3033
3034     UErrorCode  status = U_ZERO_ERROR;
3035
3036     fSets  = new UVector(status);
3037
3038     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3039     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3040     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3041     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3042     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3043     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3044     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3045     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3046     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3047     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3048     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3049     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3050     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3051     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3052     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3053     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3054     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3055     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3056     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3057     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3058     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3059     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3060     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3061     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3062     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3063     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3064     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3065     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3066     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3067     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3068     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3069     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3070     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3071     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3072     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3073     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3074     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3075     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3076     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3077     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE(
3078             "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
3079     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3080     fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3081     fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3082     fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
3083
3084     if (U_FAILURE(status)) {
3085         deferredStatus = status;
3086         return;
3087     }
3088
3089     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3090     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3091     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3092
3093     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3094     fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
3095
3096     fSets->addElement(fBK, status);
3097     fSets->addElement(fCR, status);
3098     fSets->addElement(fLF, status);
3099     fSets->addElement(fCM, status);
3100     fSets->addElement(fNL, status);
3101     fSets->addElement(fWJ, status);
3102     fSets->addElement(fZW, status);
3103     fSets->addElement(fGL, status);
3104     fSets->addElement(fCB, status);
3105     fSets->addElement(fSP, status);
3106     fSets->addElement(fB2, status);
3107     fSets->addElement(fBA, status);
3108     fSets->addElement(fBB, status);
3109     fSets->addElement(fHY, status);
3110     fSets->addElement(fH2, status);
3111     fSets->addElement(fH3, status);
3112     fSets->addElement(fCL, status);
3113     fSets->addElement(fCP, status);
3114     fSets->addElement(fEX, status);
3115     fSets->addElement(fIN, status);
3116     fSets->addElement(fJL, status);
3117     fSets->addElement(fJT, status);
3118     fSets->addElement(fJV, status);
3119     fSets->addElement(fNS, status);
3120     fSets->addElement(fOP, status);
3121     fSets->addElement(fQU, status);
3122     fSets->addElement(fIS, status);
3123     fSets->addElement(fNU, status);
3124     fSets->addElement(fPO, status);
3125     fSets->addElement(fPR, status);
3126     fSets->addElement(fSY, status);
3127     fSets->addElement(fAI, status);
3128     fSets->addElement(fAL, status);
3129     fSets->addElement(fHL, status);
3130     fSets->addElement(fID, status);
3131     fSets->addElement(fWJ, status);
3132     fSets->addElement(fRI, status);
3133     fSets->addElement(fSG, status);
3134     fSets->addElement(fEB, status);
3135     fSets->addElement(fEM, status);
3136     fSets->addElement(fZJ, status);
3137     fSets->addElement(fExtendedPict, status);
3138     fSets->addElement(fEmojiNRK, status);
3139
3140
3141     const char *rules =
3142             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3143             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3144             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3145             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3146             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3147             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3148
3149     fNumberMatcher = new RegexMatcher(
3150         UnicodeString(rules, -1, US_INV), 0, status);
3151
3152     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3153
3154     if (U_FAILURE(status)) {
3155         deferredStatus = status;
3156     }
3157 }
3158
3159
3160 void RBBILineMonkey::setText(const UnicodeString &s) {
3161     fText       = &s;
3162     fCharBI->setText(s);
3163     fNumberMatcher->reset(s);
3164 }
3165
3166 //
3167 //  rule9Adjust
3168 //     Line Break TR rules 9 and 10 implementation.
3169 //     This deals with combining marks and other sequences that
3170 //     that must be treated as if they were something other than what they actually are.
3171 //
3172 //     This is factored out into a separate function because it must be applied twice for
3173 //     each potential break, once to the chars before the position being checked, then
3174 //     again to the text following the possible break.
3175 //
3176 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3177     if (pos == -1) {
3178         // Invalid initial position.  Happens during the warmup iteration of the
3179         //   main loop in next().
3180         return;
3181     }
3182
3183     int32_t  nPos = *nextPos;
3184
3185     // LB 9  Keep combining sequences together.
3186     //  advance over any CM class chars.  Note that Line Break CM is different
3187     //  from the normal Grapheme Extend property.
3188     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3189           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3190         for (;;) {
3191             *nextChar = fText->char32At(nPos);
3192             if (!fCM->contains(*nextChar)) {
3193                 break;
3194             }
3195             nPos = fText->moveIndex32(nPos, 1);
3196         }
3197     }
3198
3199
3200     // LB 9 Treat X CM* as if it were x.
3201     //       No explicit action required.
3202
3203     // LB 10  Treat any remaining combining mark as AL
3204     if (fCM->contains(*posChar)) {
3205         *posChar = 0x41;   // thisChar = 'A';
3206     }
3207
3208     // Push the updated nextPos and nextChar back to our caller.
3209     // This only makes a difference if posChar got bigger by consuming a
3210     // combining sequence.
3211     *nextPos  = nPos;
3212     *nextChar = fText->char32At(nPos);
3213 }
3214
3215
3216
3217 int32_t RBBILineMonkey::next(int32_t startPos) {
3218     UErrorCode status = U_ZERO_ERROR;
3219     int32_t    pos;       //  Index of the char following a potential break position
3220     UChar32    thisChar;  //  Character at above position "pos"
3221
3222     int32_t    prevPos;   //  Index of the char preceding a potential break position
3223     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3224                           //   and thisChar may not be adjacent because combining
3225                           //   characters between them will be ignored.
3226
3227     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3228     UChar32    prevCharX2;
3229
3230     int32_t    nextPos;   //  Index of the next character following pos.
3231                           //     Usually skips over combining marks.
3232     int32_t    nextCPPos; //  Index of the code point following "pos."
3233                           //     May point to a combining mark.
3234     int32_t    tPos;      //  temp value.
3235     UChar32    c;
3236
3237     if (U_FAILURE(deferredStatus)) {
3238         return -1;
3239     }
3240
3241     if (startPos >= fText->length()) {
3242         return -1;
3243     }
3244
3245
3246     // Initial values for loop.  Loop will run the first time without finding breaks,
3247     //                           while the invalid values shift out and the "this" and
3248     //                           "prev" positions are filled in with good values.
3249     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3250     thisChar = prevChar  = prevCharX2 = 0;
3251     nextPos  = nextCPPos = startPos;
3252
3253
3254     // Loop runs once per position in the test text, until a break position
3255     //  is found.
3256     for (;;) {
3257         prevPosX2 = prevPos;
3258         prevCharX2 = prevChar;
3259
3260         prevPos   = pos;
3261         prevChar  = thisChar;
3262
3263         pos       = nextPos;
3264         thisChar  = fText->char32At(pos);
3265
3266         nextCPPos = fText->moveIndex32(pos, 1);
3267         nextPos   = nextCPPos;
3268
3269         // Rule LB2 - Break at end of text.
3270         if (pos >= fText->length()) {
3271             break;
3272         }
3273
3274         // Rule LB 9 - adjust for combining sequences.
3275         //             We do this one out-of-order because the adjustment does not change anything
3276         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3277         //             be applied.
3278         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3279         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3280         c = fText->char32At(nextPos);
3281         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3282
3283         // If the loop is still warming up - if we haven't shifted the initial
3284         //   -1 positions out of prevPos yet - loop back to advance the
3285         //    position in the input without any further looking for breaks.
3286         if (prevPos == -1) {
3287             continue;
3288         }
3289
3290         // LB 4  Always break after hard line breaks,
3291         if (fBK->contains(prevChar)) {
3292             break;
3293         }
3294
3295         // LB 5  Break after CR, LF, NL, but not inside CR LF
3296         if (prevChar == 0x0d && thisChar == 0x0a) {
3297             continue;
3298         }
3299         if (prevChar == 0x0d ||
3300             prevChar == 0x0a ||
3301             prevChar == 0x85)  {
3302             break;
3303         }
3304
3305         // LB 6  Don't break before hard line breaks
3306         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3307             fBK->contains(thisChar)) {
3308                 continue;
3309         }
3310
3311
3312         // LB 7  Don't break before spaces or zero-width space.
3313         if (fSP->contains(thisChar)) {
3314             continue;
3315         }
3316
3317         if (fZW->contains(thisChar)) {
3318             continue;
3319         }
3320
3321         // LB 8  Break after zero width space
3322         if (fZW->contains(prevChar)) {
3323             break;
3324         }
3325
3326         // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3327         //       The monkey test's way of ignoring combining characters doesn't work
3328         //       for this rule. ZJ is also a CM. Need to get the actual character
3329         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3330         {
3331             int32_t prevIdx = fText->moveIndex32(pos, -1);
3332             UChar32 prevC = fText->char32At(prevIdx);
3333             if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
3334                 continue;
3335             }
3336         }
3337
3338         // LB 9, 10  Already done, at top of loop.
3339         //
3340
3341
3342         // LB 11  Do not break before or after WORD JOINER and related characters.
3343         //    x  WJ
3344         //    WJ  x
3345         //
3346         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3347             continue;
3348         }
3349
3350         // LB 12
3351         //    GL  x
3352         if (fGL->contains(prevChar)) {
3353             continue;
3354         }
3355
3356         // LB 12a
3357         //    [^SP BA HY] x GL
3358         if (!(fSP->contains(prevChar) ||
3359               fBA->contains(prevChar) ||
3360               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3361             continue;
3362         }
3363
3364
3365
3366         // LB 13  Don't break before closings.
3367         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3368         //        fall into LB 17 and the more general number regular expression.
3369         //
3370         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3371             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3372                                          fEX->contains(thisChar)  ||
3373             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3374             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3375             continue;
3376         }
3377
3378         // LB 14 Don't break after OP SP*
3379         //       Scan backwards, checking for this sequence.
3380         //       The OP char could include combining marks, so we actually check for
3381         //           OP CM* SP*
3382         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3383         //       sequence into a ID char, so before scanning back through spaces,
3384         //       verify that prevChar is indeed a space.  The prevChar variable
3385         //       may differ from fText[prevPos]
3386         tPos = prevPos;
3387         if (fSP->contains(prevChar)) {
3388             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3389                 tPos=fText->moveIndex32(tPos, -1);
3390             }
3391         }
3392         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3393             tPos=fText->moveIndex32(tPos, -1);
3394         }
3395         if (fOP->contains(fText->char32At(tPos))) {
3396             continue;
3397         }
3398
3399
3400         // LB 15    QU SP* x OP
3401         if (fOP->contains(thisChar)) {
3402             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3403             int tPos = prevPos;
3404             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3405                 tPos = fText->moveIndex32(tPos, -1);
3406             }
3407             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3408                 tPos = fText->moveIndex32(tPos, -1);
3409             }
3410             if (fQU->contains(fText->char32At(tPos))) {
3411                 continue;
3412             }
3413         }
3414
3415
3416
3417         // LB 16   (CL | CP) SP* x NS
3418         //    Scan backwards for SP* CM* (CL | CP)
3419         if (fNS->contains(thisChar)) {
3420             int tPos = prevPos;
3421             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3422                 tPos = fText->moveIndex32(tPos, -1);
3423             }
3424             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3425                 tPos = fText->moveIndex32(tPos, -1);
3426             }
3427             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3428                 continue;
3429             }
3430         }
3431
3432
3433         // LB 17        B2 SP* x B2
3434         if (fB2->contains(thisChar)) {
3435             //  Scan backwards, checking for the B2 CM* SP* sequence.
3436             tPos = prevPos;
3437             if (fSP->contains(prevChar)) {
3438                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3439                     tPos=fText->moveIndex32(tPos, -1);
3440                 }
3441             }
3442             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3443                 tPos=fText->moveIndex32(tPos, -1);
3444             }
3445             if (fB2->contains(fText->char32At(tPos))) {
3446                 continue;
3447             }
3448         }
3449
3450
3451         // LB 18    break after space
3452         if (fSP->contains(prevChar)) {
3453             break;
3454         }
3455
3456         // LB 19
3457         //    x   QU
3458         //    QU  x
3459         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3460             continue;
3461         }
3462
3463         // LB 20  Break around a CB
3464         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3465             break;
3466         }
3467
3468         // LB 21
3469         if (fBA->contains(thisChar) ||
3470             fHY->contains(thisChar) ||
3471             fNS->contains(thisChar) ||
3472             fBB->contains(prevChar) )   {
3473             continue;
3474         }
3475
3476         // LB 21a
3477         //   HL (HY | BA) x
3478         if (fHL->contains(prevCharX2) &&
3479                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3480             continue;
3481         }
3482
3483         // LB 21b
3484         //   SY x HL
3485         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3486             continue;
3487         }
3488
3489         // LB 22
3490         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3491             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3492             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3493             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3494             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3495             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3496             continue;
3497         }
3498
3499
3500         // LB 23    (AL | HL) x NU
3501         //          NU x (AL | HL)
3502         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3503             continue;
3504         }
3505         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3506             continue;
3507         }
3508
3509         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3510         //      PR x (ID | EB | EM)
3511         //     (ID | EB | EM) x PO
3512         if (fPR->contains(prevChar) &&
3513                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3514             continue;
3515         }
3516         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3517                 fPO->contains(thisChar)) {
3518             continue;
3519         }
3520
3521         // LB 24  Do not break between prefix and letters or ideographs.
3522         //         (PR | PO) x (AL | HL)
3523         //         (AL | HL) x (PR | PO)
3524         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3525                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3526             continue;
3527         }
3528         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3529                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3530             continue;
3531         }
3532
3533
3534
3535         // LB 25    Numbers
3536         if (fNumberMatcher->lookingAt(prevPos, status)) {
3537             if (U_FAILURE(status)) {
3538                 break;
3539             }
3540             // Matched a number.  But could have been just a single digit, which would
3541             //    not represent a "no break here" between prevChar and thisChar
3542             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3543             if (numEndIdx > pos) {
3544                 // Number match includes at least our two chars being checked
3545                 if (numEndIdx > nextPos) {
3546                     // Number match includes additional chars.  Update pos and nextPos
3547                     //   so that next loop iteration will continue at the end of the number,
3548                     //   checking for breaks between last char in number & whatever follows.
3549                     pos = nextPos = numEndIdx;
3550                     do {
3551                         pos = fText->moveIndex32(pos, -1);
3552                         thisChar = fText->char32At(pos);
3553                     } while (fCM->contains(thisChar));
3554                 }
3555                 continue;
3556             }
3557         }
3558
3559
3560         // LB 26 Do not break a Korean syllable.
3561         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3562                                         fJV->contains(thisChar) ||
3563                                         fH2->contains(thisChar) ||
3564                                         fH3->contains(thisChar))) {
3565                                             continue;
3566                                         }
3567
3568         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3569             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3570                 continue;
3571         }
3572
3573         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3574             fJT->contains(thisChar)) {
3575                 continue;
3576         }
3577
3578         // LB 27 Treat a Korean Syllable Block the same as ID.
3579         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3580             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3581             fIN->contains(thisChar)) {
3582                 continue;
3583             }
3584         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3585             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3586             fPO->contains(thisChar)) {
3587                 continue;
3588             }
3589         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3590             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3591                 continue;
3592             }
3593
3594
3595
3596         // LB 28  Do not break between alphabetics ("at").
3597         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3598             continue;
3599         }
3600
3601         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3602         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3603             continue;
3604         }
3605
3606         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3607         //          (AL | NU) x OP
3608         //          CP x (AL | NU)
3609         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3610             continue;
3611         }
3612         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3613             continue;
3614         }
3615
3616         // LB30a    RI RI <break> RI
3617         //             RI    x    RI
3618         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3619             break;
3620         }
3621         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3622             continue;
3623         }
3624
3625         // LB30b    Emoji Base x Emoji Modifier
3626         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3627             continue;
3628         }
3629
3630         // LB 31    Break everywhere else
3631         break;
3632
3633     }
3634
3635     return pos;
3636 }
3637
3638
3639 UVector  *RBBILineMonkey::charClasses() {
3640     return fSets;
3641 }
3642
3643
3644 RBBILineMonkey::~RBBILineMonkey() {
3645     delete fSets;
3646
3647     delete fBK;
3648     delete fCR;
3649     delete fLF;
3650     delete fCM;
3651     delete fNL;
3652     delete fWJ;
3653     delete fZW;
3654     delete fGL;
3655     delete fCB;
3656     delete fSP;
3657     delete fB2;
3658     delete fBA;
3659     delete fBB;
3660     delete fHY;
3661     delete fH2;
3662     delete fH3;
3663     delete fCL;
3664     delete fCP;
3665     delete fEX;
3666     delete fIN;
3667     delete fJL;
3668     delete fJV;
3669     delete fJT;
3670     delete fNS;
3671     delete fOP;
3672     delete fQU;
3673     delete fIS;
3674     delete fNU;
3675     delete fPO;
3676     delete fPR;
3677     delete fSY;
3678     delete fAI;
3679     delete fAL;
3680     delete fCJ;
3681     delete fHL;
3682     delete fID;
3683     delete fRI;
3684     delete fSG;
3685     delete fXX;
3686     delete fEB;
3687     delete fEM;
3688     delete fZJ;
3689     delete fExtendedPict;
3690     delete fEmojiNRK;
3691
3692     delete fCharBI;
3693     delete fNumberMatcher;
3694 }
3695
3696
3697 //-------------------------------------------------------------------------------------------
3698 //
3699 //   TestMonkey
3700 //
3701 //     params
3702 //       seed=nnnnn        Random number starting seed.
3703 //                         Setting the seed allows errors to be reproduced.
3704 //       loop=nnn          Looping count.  Controls running time.
3705 //                         -1:  run forever.
3706 //                          0 or greater:  run length.
3707 //
3708 //       type = char | word | line | sent | title
3709 //
3710 //  Example:
3711 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3712 //
3713 //-------------------------------------------------------------------------------------------
3714
3715 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3716     int32_t val = defaultVal;
3717     name.append(" *= *(-?\\d+)");
3718     UErrorCode status = U_ZERO_ERROR;
3719     RegexMatcher m(name, params, 0, status);
3720     if (m.find()) {
3721         // The param exists.  Convert the string to an int.
3722         char valString[100];
3723         int32_t paramLength = m.end(1, status) - m.start(1, status);
3724         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3725             paramLength = (int32_t)(sizeof(valString)-2);
3726         }
3727         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3728         val = strtol(valString,  NULL, 10);
3729
3730         // Delete this parameter from the params string.
3731         m.reset();
3732         params = m.replaceFirst("", status);
3733     }
3734     U_ASSERT(U_SUCCESS(status));
3735     return val;
3736 }
3737 #endif
3738
3739 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3740 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3741                                     BreakIterator *bi,
3742                                     int expected[],
3743                                     int expectedcount)
3744 {
3745     int count = 0;
3746     int i = 0;
3747     int forward[50];
3748     bi->setText(ustr);
3749     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3750         forward[count] = i;
3751         if (count < expectedcount && expected[count] != i) {
3752             test->errln("break forward test failed: expected %d but got %d",
3753                         expected[count], i);
3754             break;
3755         }
3756         count ++;
3757     }
3758     if (count != expectedcount) {
3759         printStringBreaks(ustr, expected, expectedcount);
3760         test->errln("break forward test failed: missed %d match",
3761                     expectedcount - count);
3762         return;
3763     }
3764     // testing boundaries
3765     for (i = 1; i < expectedcount; i ++) {
3766         int j = expected[i - 1];
3767         if (!bi->isBoundary(j)) {
3768             printStringBreaks(ustr, expected, expectedcount);
3769             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3770             return;
3771         }
3772         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3773             if (bi->isBoundary(j)) {
3774                 printStringBreaks(ustr, expected, expectedcount);
3775                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3776                 return;
3777             }
3778         }
3779     }
3780
3781     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3782         count --;
3783         if (forward[count] != i) {
3784             printStringBreaks(ustr, expected, expectedcount);
3785             test->errln("happy break test previous() failed: expected %d but got %d",
3786                         forward[count], i);
3787             break;
3788         }
3789     }
3790     if (count != 0) {
3791         printStringBreaks(ustr, expected, expectedcount);
3792         test->errln("break test previous() failed: missed a match");
3793         return;
3794     }
3795
3796     // testing preceding
3797     for (i = 0; i < expectedcount - 1; i ++) {
3798         // int j = expected[i] + 1;
3799         int j = ustr.moveIndex32(expected[i], 1);
3800         for (; j <= expected[i + 1]; j ++) {
3801             if (bi->preceding(j) != expected[i]) {
3802                 printStringBreaks(ustr, expected, expectedcount);
3803                 test->errln("preceding(): Not expecting boundary at position %d", j);
3804                 return;
3805             }
3806         }
3807     }
3808 }
3809 #endif
3810
3811 void RBBITest::TestWordBreaks(void)
3812 {
3813 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3814
3815     Locale        locale("en");
3816     UErrorCode    status = U_ZERO_ERROR;
3817     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3818     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3819     // Replaced any C+J characters in a row with a random sequence of characters
3820     // of the same length to make our C+J segmentation not get in the way.
3821     static const char *strlist[] =
3822     {
3823     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3824     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3825     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3826     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3827     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3828     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3829     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3830     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3831     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3832     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3833     "\\u2027\\U000e0067\\u0a47\\u00b7",
3834     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3835     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3836     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3837     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3838     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3839     "\\u0027\\u11af\\U000e0057\\u0602",
3840     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3841     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3842     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3843     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3844     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3845     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3846     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3847     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3848     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3849     "\\u18f4\\U000e0049\\u20e7\\u2027",
3850     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3851     "\\ua183\\u102d\\u0bec\\u003a",
3852     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3853     "\\u003a\\u0e57\\u0fad\\u002e",
3854     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3855     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3856     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3857     "\\u003a\\u0664\\u00b7\\u1fba",
3858     "\\u003b\\u0027\\u00b7\\u47a3",
3859     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3860     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3861     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3862     };
3863     int loop;
3864     if (U_FAILURE(status)) {
3865         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3866         return;
3867     }
3868     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3869         // printf("looping %d\n", loop);
3870         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3871         // RBBICharMonkey monkey;
3872         RBBIWordMonkey monkey;
3873
3874         int expected[50];
3875         int expectedcount = 0;
3876
3877         monkey.setText(ustr);
3878         int i;
3879         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3880             expected[expectedcount ++] = i;
3881         }
3882
3883         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3884     }
3885     delete bi;
3886 #endif
3887 }
3888
3889 void RBBITest::TestWordBoundary(void)
3890 {
3891     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3892     Locale        locale("en");
3893     UErrorCode    status = U_ZERO_ERROR;
3894     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3895     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3896     UChar         str[50];
3897     static const char *strlist[] =
3898     {
3899     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3900     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3901     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3902     "\\u2027\\U000e0067\\u0a47\\u00b7",
3903     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3904     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3905     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3906     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3907     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3908     "\\u0027\\u11af\\U000e0057\\u0602",
3909     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3910     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3911     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3912     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3913     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3914     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3915     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3916     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3917     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3918     "\\u58f4\\U000e0049\\u20e7\\u2027",
3919     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3920     "\\ua183\\u102d\\u0bec\\u003a",
3921     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3922     "\\u003a\\u0e57\\u0fad\\u002e",
3923     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3924     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3925     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3926     "\\u003a\\u0664\\u00b7\\u1fba",
3927     "\\u003b\\u0027\\u00b7\\u47a3",
3928     };
3929     int loop;
3930     if (U_FAILURE(status)) {
3931         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3932         return;
3933     }
3934     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3935         // printf("looping %d\n", loop);
3936         u_unescape(strlist[loop], str, 20);
3937         UnicodeString ustr(str);
3938         int forward[50];
3939         int count = 0;
3940
3941         bi->setText(ustr);
3942         int prev = 0;
3943         int i;
3944         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3945             forward[count ++] = i;
3946             if (i > prev) {
3947                 int j;
3948                 for (j = prev + 1; j < i; j ++) {
3949                     if (bi->isBoundary(j)) {
3950                         printStringBreaks(ustr, forward, count);
3951                         errln("happy boundary test failed: expected %d not a boundary",
3952                                j);
3953                         return;
3954                     }
3955                 }
3956             }
3957             if (!bi->isBoundary(i)) {
3958                 printStringBreaks(ustr, forward, count);
3959                 errln("happy boundary test failed: expected %d a boundary",
3960                        i);
3961                 return;
3962             }
3963             prev = i;
3964         }
3965     }
3966     delete bi;
3967 }
3968
3969 void RBBITest::TestLineBreaks(void)
3970 {
3971 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3972     Locale        locale("en");
3973     UErrorCode    status = U_ZERO_ERROR;
3974     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3975     const int32_t  STRSIZE = 50;
3976     UChar         str[STRSIZE];
3977     static const char *strlist[] =
3978     {
3979      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3980      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3981              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3982      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3983              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3984      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3985      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3986      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3987      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3988      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3989      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3990      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3991      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3992      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3993      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3994      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3995      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3996      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3997      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3998      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3999      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4000      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4001      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4002      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4003      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4004      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4005      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4006      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4007      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4008      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4009      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4010      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4011      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4012      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4013      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4014      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4015      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4016      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4017      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4018          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4019     };
4020     int loop;
4021     TEST_ASSERT_SUCCESS(status);
4022     if (U_FAILURE(status)) {
4023         return;
4024     }
4025     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4026         // printf("looping %d\n", loop);
4027         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4028         if (t >= STRSIZE) {
4029             TEST_ASSERT(FALSE);
4030             continue;
4031         }
4032
4033
4034         UnicodeString ustr(str);
4035         RBBILineMonkey monkey;
4036         if (U_FAILURE(monkey.deferredStatus)) {
4037             continue;
4038         }
4039
4040         const int EXPECTEDSIZE = 50;
4041         int expected[EXPECTEDSIZE];
4042         int expectedcount = 0;
4043
4044         monkey.setText(ustr);
4045         int i;
4046         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4047             if (expectedcount >= EXPECTEDSIZE) {
4048                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4049                 return;
4050             }
4051             expected[expectedcount ++] = i;
4052         }
4053
4054         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4055     }
4056     delete bi;
4057 #endif
4058 }
4059
4060 void RBBITest::TestSentBreaks(void)
4061 {
4062 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4063     Locale        locale("en");
4064     UErrorCode    status = U_ZERO_ERROR;
4065     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4066     UChar         str[200];
4067     static const char *strlist[] =
4068     {
4069      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4070      "This\n",
4071      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4072      "\"Sentence ending with a quote.\" Bye.",
4073      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4074      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4075      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4076      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4077      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4078      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4079      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4080              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4081              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4082              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4083      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4084              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4085              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4086              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4087              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4088              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4089     };
4090     int loop;
4091     if (U_FAILURE(status)) {
4092         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4093         return;
4094     }
4095     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4096         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4097         UnicodeString ustr(str);
4098
4099         RBBISentMonkey monkey;
4100         if (U_FAILURE(monkey.deferredStatus)) {
4101             continue;
4102         }
4103
4104         const int EXPECTEDSIZE = 50;
4105         int expected[EXPECTEDSIZE];
4106         int expectedcount = 0;
4107
4108         monkey.setText(ustr);
4109         int i;
4110         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4111             if (expectedcount >= EXPECTEDSIZE) {
4112                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4113                 return;
4114             }
4115             expected[expectedcount ++] = i;
4116         }
4117
4118         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4119     }
4120     delete bi;
4121 #endif
4122 }
4123
4124 void RBBITest::TestMonkey() {
4125 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4126
4127     UErrorCode     status    = U_ZERO_ERROR;
4128     int32_t        loopCount = 500;
4129     int32_t        seed      = 1;
4130     UnicodeString  breakType = "all";
4131     Locale         locale("en");
4132     UBool          useUText  = FALSE;
4133
4134     if (quick == FALSE) {
4135         loopCount = 10000;
4136     }
4137
4138     if (fTestParams) {
4139         UnicodeString p(fTestParams);
4140         loopCount = getIntParam("loop", p, loopCount);
4141         seed      = getIntParam("seed", p, seed);
4142
4143         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4144         if (m.find()) {
4145             breakType = m.group(1, status);
4146             m.reset();
4147             p = m.replaceFirst("", status);
4148         }
4149
4150         RegexMatcher u(" *utext", p, 0, status);
4151         if (u.find()) {
4152             useUText = TRUE;
4153             u.reset();
4154             p = u.replaceFirst("", status);
4155         }
4156
4157
4158         // m.reset(p);
4159         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4160             // Each option is stripped out of the option string as it is processed.
4161             // All options have been checked.  The option string should have been completely emptied..
4162             char buf[100];
4163             p.extract(buf, sizeof(buf), NULL, status);
4164             buf[sizeof(buf)-1] = 0;
4165             errln("Unrecognized or extra parameter:  %s\n", buf);
4166             return;
4167         }
4168
4169     }
4170
4171     if (breakType == "char" || breakType == "all") {
4172         RBBICharMonkey  m;
4173         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4174         if (U_SUCCESS(status)) {
4175             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4176             if (breakType == "all" && useUText==FALSE) {
4177                 // Also run a quick test with UText when "all" is specified
4178                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4179             }
4180         }
4181         else {
4182             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4183         }
4184         delete bi;
4185     }
4186
4187     if (breakType == "word" || breakType == "all") {
4188         logln("Word Break Monkey Test");
4189         RBBIWordMonkey  m;
4190         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4191         if (U_SUCCESS(status)) {
4192             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4193         }
4194         else {
4195             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4196         }
4197         delete bi;
4198     }
4199
4200     if (breakType == "line" || breakType == "all") {
4201         logln("Line Break Monkey Test");
4202         RBBILineMonkey  m;
4203         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4204         if (loopCount >= 10) {
4205             loopCount = loopCount / 5;   // Line break runs slower than the others.
4206         }
4207         if (U_SUCCESS(status)) {
4208             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4209         }
4210         else {
4211             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4212         }
4213         delete bi;
4214     }
4215
4216     if (breakType == "sent" || breakType == "all"  ) {
4217         logln("Sentence Break Monkey Test");
4218         RBBISentMonkey  m;
4219         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4220         if (loopCount >= 10) {
4221             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4222         }
4223         if (U_SUCCESS(status)) {
4224             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4225         }
4226         else {
4227             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4228         }
4229         delete bi;
4230     }
4231
4232 #endif
4233 }
4234
4235 //
4236 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4237 //    Parameters:
4238 //       bi      - the break iterator to use
4239 //       mk      - MonkeyKind, abstraction for obtaining expected results
4240 //       name    - Name of test (char, word, etc.) for use in error messages
4241 //       seed    - Seed for starting random number generator (parameter from user)
4242 //       numIterations
4243 //
4244 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4245                          int32_t numIterations, UBool useUText) {
4246
4247 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4248
4249     const int32_t    TESTSTRINGLEN = 500;
4250     UnicodeString    testText;
4251     int32_t          numCharClasses;
4252     UVector          *chClasses;
4253     int              expected[TESTSTRINGLEN*2 + 1];
4254     int              expectedCount = 0;
4255     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4256     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4257     char             reverseBreaks[TESTSTRINGLEN*2+1];
4258     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4259     char             followingBreaks[TESTSTRINGLEN*2+1];
4260     char             precedingBreaks[TESTSTRINGLEN*2+1];
4261     int              i;
4262     int              loopCount = 0;
4263
4264     m_seed = seed;
4265
4266     numCharClasses = mk.charClasses()->size();
4267     chClasses      = mk.charClasses();
4268
4269     // Check for errors that occured during the construction of the MonkeyKind object.
4270     //  Can't report them where they occured because errln() is a method coming from intlTest,
4271     //  and is not visible outside of RBBITest :-(
4272     if (U_FAILURE(mk.deferredStatus)) {
4273         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4274         return;
4275     }
4276
4277     // Verify that the character classes all have at least one member.
4278     for (i=0; i<numCharClasses; i++) {
4279         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4280         if (s == NULL || s->size() == 0) {
4281             errln("Character Class #%d is null or of zero size.", i);
4282             return;
4283         }
4284     }
4285
4286     while (loopCount < numIterations || numIterations == -1) {
4287         if (numIterations == -1 && loopCount % 10 == 0) {
4288             // If test is running in an infinite loop, display a periodic tic so
4289             //   we can tell that it is making progress.
4290             fprintf(stderr, ".");
4291         }
4292         // Save current random number seed, so that we can recreate the random numbers
4293         //   for this loop iteration in event of an error.
4294         seed = m_seed;
4295
4296         // Populate a test string with data.
4297         testText.truncate(0);
4298         for (i=0; i<TESTSTRINGLEN; i++) {
4299             int32_t  aClassNum = m_rand() % numCharClasses;
4300             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4301             int32_t   charIdx = m_rand() % classSet->size();
4302             UChar32   c = classSet->charAt(charIdx);
4303             if (c < 0) {   // TODO:  deal with sets containing strings.
4304                 errln("%s:%d c < 0", __FILE__, __LINE__);
4305                 break;
4306             }
4307             // Do not assemble a supplementary character from randomly generated separate surrogates.
4308             //   (It could be a dictionary character)
4309             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4310                 continue;
4311             }
4312
4313             testText.append(c);
4314         }
4315
4316         // Calculate the expected results for this test string.
4317         mk.setText(testText);
4318         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4319         expectedBreaks[0] = 1;
4320         int32_t breakPos = 0;
4321         expectedCount = 0;
4322         for (;;) {
4323             breakPos = mk.next(breakPos);
4324             if (breakPos == -1) {
4325                 break;
4326             }
4327             if (breakPos > testText.length()) {
4328                 errln("breakPos > testText.length()");
4329             }
4330             expectedBreaks[breakPos] = 1;
4331             U_ASSERT(expectedCount<testText.length());
4332             expected[expectedCount ++] = breakPos;
4333             (void)expected;   // Set but not used warning.
4334                               // TODO (andy): check it out.
4335         }
4336
4337         // Find the break positions using forward iteration
4338         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4339         if (useUText) {
4340             UErrorCode status = U_ZERO_ERROR;
4341             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4342             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4343             bi->setText(testUText, status);
4344             TEST_ASSERT_SUCCESS(status);
4345             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4346                                       //  This UText can be closed immediately, so long as the
4347                                       //  testText string continues to exist.
4348         } else {
4349             bi->setText(testText);
4350         }
4351
4352         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4353             if (i < 0 || i > testText.length()) {
4354                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4355                 break;
4356             }
4357             forwardBreaks[i] = 1;
4358         }
4359
4360         // Find the break positions using reverse iteration
4361         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4362         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4363             if (i < 0 || i > testText.length()) {
4364                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4365                 break;
4366             }
4367             reverseBreaks[i] = 1;
4368         }
4369
4370         // Find the break positions using isBoundary() tests.
4371         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4372         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4373         for (i=0; i<=testText.length(); i++) {
4374             isBoundaryBreaks[i] = bi->isBoundary(i);
4375         }
4376
4377
4378         // Find the break positions using the following() function.
4379         // printf(".");
4380         memset(followingBreaks, 0, sizeof(followingBreaks));
4381         int32_t   lastBreakPos = 0;
4382         followingBreaks[0] = 1;
4383         for (i=0; i<testText.length(); i++) {
4384             breakPos = bi->following(i);
4385             if (breakPos <= i ||
4386                 breakPos < lastBreakPos ||
4387                 breakPos > testText.length() ||
4388                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4389                 errln("%s break monkey test: "
4390                     "Out of range value returned by BreakIterator::following().\n"
4391                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4392                          name, seed, i, breakPos, lastBreakPos);
4393                 break;
4394             }
4395             followingBreaks[breakPos] = 1;
4396             lastBreakPos = breakPos;
4397         }
4398
4399         // Find the break positions using the preceding() function.
4400         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4401         lastBreakPos = testText.length();
4402         precedingBreaks[testText.length()] = 1;
4403         for (i=testText.length(); i>0; i--) {
4404             breakPos = bi->preceding(i);
4405             if (breakPos >= i ||
4406                 breakPos > lastBreakPos ||
4407                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4408                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4409                 errln("%s break monkey test: "
4410                     "Out of range value returned by BreakIterator::preceding().\n"
4411                     "index=%d;  prev returned %d; lastBreak=%d" ,
4412                     name,  i, breakPos, lastBreakPos);
4413                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4414                     precedingBreaks[i] = 2;   // Forces an error.
4415                 }
4416             } else {
4417                 if (breakPos >= 0) {
4418                     precedingBreaks[breakPos] = 1;
4419                 }
4420                 lastBreakPos = breakPos;
4421             }
4422         }
4423
4424         // Compare the expected and actual results.
4425         for (i=0; i<=testText.length(); i++) {
4426             const char *errorType = NULL;
4427             if  (forwardBreaks[i] != expectedBreaks[i]) {
4428                 errorType = "next()";
4429             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4430                 errorType = "previous()";
4431             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4432                 errorType = "isBoundary()";
4433             } else if (followingBreaks[i] != expectedBreaks[i]) {
4434                 errorType = "following()";
4435             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4436                 errorType = "preceding()";
4437             }
4438
4439
4440             if (errorType != NULL) {
4441                 // Format a range of the test text that includes the failure as
4442                 //  a data item that can be included in the rbbi test data file.
4443
4444                 // Start of the range is the last point where expected and actual results
4445                 //   both agreed that there was a break position.
4446                 int startContext = i;
4447                 int32_t count = 0;
4448                 for (;;) {
4449                     if (startContext==0) { break; }
4450                     startContext --;
4451                     if (expectedBreaks[startContext] != 0) {
4452                         if (count == 2) break;
4453                         count ++;
4454                     }
4455                 }
4456
4457                 // End of range is two expected breaks past the start position.
4458                 int endContext = i + 1;
4459                 int ci;
4460                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4461                     for (;;) {
4462                         if (endContext >= testText.length()) {break;}
4463                         if (expectedBreaks[endContext-1] != 0) {
4464                             if (count == 0) break;
4465                             count --;
4466                         }
4467                         endContext ++;
4468                     }
4469                 }
4470
4471                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4472                 UnicodeString errorText = "<data>";
4473                 /***if (strcmp(errorType, "next()") == 0) {
4474                     startContext = 0;
4475                     endContext = testText.length();
4476
4477                     printStringBreaks(testText, expected, expectedCount);
4478                 }***/
4479
4480                 for (ci=startContext; ci<endContext;) {
4481                     UnicodeString hexChars("0123456789abcdef");
4482                     UChar32  c;
4483                     int      bn;
4484                     c = testText.char32At(ci);
4485                     if (ci == i) {
4486                         // This is the location of the error.
4487                         errorText.append("<?>");
4488                     } else if (expectedBreaks[ci] != 0) {
4489                         // This a non-error expected break position.
4490                         errorText.append("\\");
4491                     }
4492                     if (c < 0x10000) {
4493                         errorText.append("\\u");
4494                         for (bn=12; bn>=0; bn-=4) {
4495                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4496                         }
4497                     } else {
4498                         errorText.append("\\U");
4499                         for (bn=28; bn>=0; bn-=4) {
4500                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4501                         }
4502                     }
4503                     ci = testText.moveIndex32(ci, 1);
4504                 }
4505                 errorText.append("\\");
4506                 errorText.append("</data>\n");
4507
4508                 // Output the error
4509                 char  charErrorTxt[500];
4510                 UErrorCode status = U_ZERO_ERROR;
4511                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4512                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4513                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4514
4515                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4516                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4517                     errorType, seed, i, charErrorTxt);
4518                 break;
4519             }
4520         }
4521
4522         loopCount++;
4523     }
4524 #endif
4525 }
4526
4527
4528 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4529 //             This test checks the initial patch,
4530 //             which is to just keep it from crashing.  Correct word boundaries
4531 //             await a proper fix to the dictionary code.
4532 //
4533 void RBBITest::TestBug5532(void)  {
4534    // Text includes a mixture of Thai and Latin.
4535    const unsigned char utf8Data[] = {
4536            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4537            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4538            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4539            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4540            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4541            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4542            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4543            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4544            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4545            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4546            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4547
4548     UErrorCode status = U_ZERO_ERROR;
4549     UText utext=UTEXT_INITIALIZER;
4550     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4551     TEST_ASSERT_SUCCESS(status);
4552
4553     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4554     TEST_ASSERT_SUCCESS(status);
4555     if (U_SUCCESS(status)) {
4556         bi->setText(&utext, status);
4557         TEST_ASSERT_SUCCESS(status);
4558
4559         int32_t breakCount = 0;
4560         int32_t previousBreak = -1;
4561         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4562             // For now, just make sure that the break iterator doesn't hang.
4563             TEST_ASSERT(previousBreak < bi->current());
4564             previousBreak = bi->current();
4565         }
4566         TEST_ASSERT(breakCount > 0);
4567     }
4568     delete bi;
4569     utext_close(&utext);
4570 }
4571
4572
4573 void RBBITest::TestBug9983(void)  {
4574     UnicodeString text = UnicodeString("\\u002A"  // * Other
4575                                        "\\uFF65"  //   Other
4576                                        "\\u309C"  //   Katakana
4577                                        "\\uFF9F"  //   Extend
4578                                        "\\uFF65"  //   Other
4579                                        "\\u0020"  //   Other
4580                                        "\\u0000").unescape();
4581
4582     UErrorCode status = U_ZERO_ERROR;
4583     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4584         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4585     TEST_ASSERT_SUCCESS(status);
4586     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4587         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4588     TEST_ASSERT_SUCCESS(status);
4589     if (U_FAILURE(status)) {
4590         return;
4591     }
4592     int32_t offset, rstatus, iterationCount;
4593
4594     brkiter->setText(text);
4595     brkiter->last();
4596     iterationCount = 0;
4597     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4598         iterationCount++;
4599         rstatus = brkiter->getRuleStatus();
4600         (void)rstatus;     // Suppress set but not used warning.
4601         if (iterationCount >= 10) {
4602            break;
4603         }
4604     }
4605     TEST_ASSERT(iterationCount == 6);
4606
4607     brkiterPOSIX->setText(text);
4608     brkiterPOSIX->last();
4609     iterationCount = 0;
4610     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4611         iterationCount++;
4612         rstatus = brkiterPOSIX->getRuleStatus();
4613         (void)rstatus;     // Suppress set but not used warning.
4614         if (iterationCount >= 10) {
4615            break;
4616         }
4617     }
4618     TEST_ASSERT(iterationCount == 6);
4619 }
4620
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4622 //
4623 void RBBITest::TestBug7547() {
4624     UnicodeString rules;
4625     UErrorCode status = U_ZERO_ERROR;
4626     UParseError parseError;
4627     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4628     if (status != U_BRK_RULE_SYNTAX) {
4629         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4630     }
4631     if (parseError.line != 1 || parseError.offset != 0) {
4632         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4633     }
4634 }
4635
4636
4637 void RBBITest::TestBug12797() {
4638     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4639     UErrorCode status = U_ZERO_ERROR;
4640     UParseError parseError;
4641     RuleBasedBreakIterator bi(rules, parseError, status);
4642     if (U_FAILURE(status)) {
4643         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4644         return;
4645     }
4646     UnicodeString text = "abc";
4647     bi.setText(text);
4648     bi.first();
4649     int32_t boundary = bi.next();
4650     if (boundary != 3) {
4651         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4652     }
4653 }
4654
4655
//
//  TestDebug    -  A placeholder test for debugging purposes.
//                  Fragments of other tests can be pasted in here and invoked
//                  on their own for tracing, without a lot of unwanted extra
//                  work happening.
//
void RBBITest::TestDebug(void) {
    // Intentionally left empty; see the description above.

}
4664
4665 void RBBITest::TestProperties() {
4666     UErrorCode errorCode = U_ZERO_ERROR;
4667     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4668     if (!prependSet.isEmpty()) {
4669         errln(
4670             "[:GCB=Prepend:] is not empty any more. "
4671             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4672             "change this test to the opposite condition.");
4673     }
4674 }
4675
4676 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */