1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
21 #include "unicode/brkiter.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/numfmt.h"
24 #include "unicode/rbbi.h"
25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/regex.h"
28 #include "unicode/schriter.h"
29 #include "unicode/uchar.h"
30 #include "unicode/utf16.h"
31 #include "unicode/ucnv.h"
32 #include "unicode/uniset.h"
33 #include "unicode/uscript.h"
34 #include "unicode/ustring.h"
35 #include "unicode/utext.h"
41 #include "utypeinfo.h" // for 'typeid' to work
45 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
46 #include "unicode/filteredbrk.h"
47 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
// TEST_ASSERT(x): when the expression x evaluates to false, report a test failure
// through errln(), including the current source file name and line number.
// The expansion calls errln() unqualified, so the macro is only usable where an
// errln() is in scope. Execution continues after the failure is reported.
49 #define TEST_ASSERT(x) {if (!(x)) { \
50 errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
// TEST_ASSERT_SUCCESS(errcode): when U_FAILURE(errcode) is true, report the failure
// through errcheckln(), passing errcode itself plus a message holding the source file,
// the line number and the symbolic error name from u_errorName().
// The macro only reports; it does not return from the enclosing function, which is why
// callers in this file follow it with their own U_FAILURE check.
52 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
53 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
56 //---------------------------------------------
58 //---------------------------------------------
61 // Note: Before adding new tests to this file, check whether the desired test data can
62 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
63 // it's much less work than writing a new test, diagnostic output in the event of failures
64 // is good, and the test data file is shared with ICU4J, so eventually the test
65 // will run there as well, without additional effort.
// runIndexedTest: lists the test methods that make up the RuleBasedBreakIterator suite.
//   index, name, params - consumed by the TESTCASE_AUTO machinery, which is defined
//                         outside this section (NOTE(review): exact use not visible here).
//   exec                - when true, a suite banner is written with logln().
// Each TESTCASE_AUTO(...) entry names one RBBITest member function. Several groups are
// compiled only when file I/O is configured in (!UCONFIG_NO_FILE_IO), and TestMonkey
// additionally requires regular expression support.
67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
69 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
73 #if !UCONFIG_NO_FILE_IO
74 TESTCASE_AUTO(TestBug4153072);
76 TESTCASE_AUTO(TestStatusReturn);
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(TestUnicodeFiles);
79 TESTCASE_AUTO(TestEmptyString);
81 TESTCASE_AUTO(TestGetAvailableLocales);
82 TESTCASE_AUTO(TestGetDisplayName);
83 #if !UCONFIG_NO_FILE_IO
84 TESTCASE_AUTO(TestEndBehaviour);
85 TESTCASE_AUTO(TestWordBreaks);
86 TESTCASE_AUTO(TestWordBoundary);
87 TESTCASE_AUTO(TestLineBreaks);
88 TESTCASE_AUTO(TestSentBreaks);
89 TESTCASE_AUTO(TestExtended);
// TestMonkey needs both regular expressions and file I/O.
91 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
92 TESTCASE_AUTO(TestMonkey);
94 #if !UCONFIG_NO_FILE_IO
95 TESTCASE_AUTO(TestBug3818);
97 TESTCASE_AUTO(TestDebug);
98 #if !UCONFIG_NO_FILE_IO
99 TESTCASE_AUTO(TestBug5775);
101 TESTCASE_AUTO(TestBug9983);
102 TESTCASE_AUTO(TestDictRules);
103 TESTCASE_AUTO(TestBug5532);
104 TESTCASE_AUTO(TestBug7547);
105 TESTCASE_AUTO(TestBug12797);
110 //---------------------------------------------------------------------------
112 // class BITestData Holds a set of Break iterator test data and results
114 // - the string data to be broken
115 // - a vector of the expected break positions.
116 // - a vector of source line numbers for the data,
117 // (to help see where errors occurred.)
118 // - The expected break tag values.
119 // - Vectors of actual break positions and tag values.
120 // - Functions for comparing actual with expected and
123 //----------------------------------------------------------------------------
// Members of BITestData.
// The text under test; addDataChunk() appends each chunk to it.
126 UnicodeString fDataToBreak;
// Expected break offsets: addDataChunk() records the length of fDataToBreak after each
// chunk is appended, i.e. the end offset of that chunk.
127 UVector fExpectedBreakPositions;
// Expected tag value for each expected break, parallel to fExpectedBreakPositions.
128 UVector fExpectedTags;
// Break offsets actually produced by the iterator; filled in by the individual tests
// and compared against the expected data by checkResults().
130 UVector fActualBreakPositions; // Test Results.
// Constructs the vectors, passing status to each of them.
133 BITestData(UErrorCode &status);
// Appends one chunk of text plus its expected tag and originating source line.
// NOTE(review): status is taken by value here, so errors set inside are not reported back.
134 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
// Compares actual against expected results, reporting differences through test.
135 void checkResults(const char *heading, RBBITest *test);
// Reports a single break-position mismatch through test.
136 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
// Constructor: every UVector member is constructed with the caller's status, so a
// failure while creating any of them is reported through that UErrorCode.
// NOTE(review): the initializer list ends with a trailing comma in this listing, so at
// least one further initializer and the constructor body are not visible here.
143 BITestData::BITestData(UErrorCode &status)
144 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
150 // addDataChunk. Add a section (non-breaking) piece of data to the test data.
151 // The macro form collects the line number, which is helpful
152 // when tracking down failures.
154 // A null data item is inserted at the start of each test's data
155 // to put the starting zero into the data list. The position saved for
156 // each non-null item is its ending position.
// ADD_DATACHUNK(td, data, tag, status): calls td.addDataChunk(), supplying __LINE__ of
// the call site as the lineNum argument. The expansion already ends with a semicolon.
158 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
// addDataChunk: append one chunk of test text and record its expected results.
//   data    - chunk text; converted with CharsToUnicodeString() before being appended.
//   tag     - expected tag value for the break at the end of this chunk.
//   lineNum - source line the chunk came from, kept for error reporting.
//   status  - does nothing if this already indicates failure.
// NOTE(review): status is received by value, so an error set by the addElement() calls
// below changes only the local copy and is never seen by the caller.
159 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
160 if (U_FAILURE(status)) {return;}
162 fDataToBreak.append(CharsToUnicodeString(data));
// The expected break position is the end of the text accumulated so far.
164 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
165 fExpectedTags.addElement(tag, status);
166 fLineNum.addElement(lineNum, status);
171 // checkResults. Compare the actual and expected break positions, report any differences.
//   heading - label placed at the front of every message (typically the calling test's name).
//   test    - test object whose errln() receives the tag-mismatch message; it is also
//             handed on to err() for break-position mismatches.
// The expected and actual vectors are walked with two independent indices so that a
// single missing or extra break does not make every later entry look wrong.
173 void BITestData::checkResults(const char *heading, RBBITest *test) {
174 int32_t expectedIndex = 0;
175 int32_t actualIndex = 0;
178 // If we've run through both the expected and actual results vectors, we're done.
179 // break out of the loop.
180 if (expectedIndex >= fExpectedBreakPositions.size() &&
181 actualIndex >= fActualBreakPositions.size()) {
// Expected data exhausted but actual results remain: report against the last expected entry.
186 if (expectedIndex >= fExpectedBreakPositions.size()) {
187 err(heading, test, expectedIndex-1, actualIndex);
// Actual results exhausted but expected data remains: report against the last actual entry.
192 if (actualIndex >= fActualBreakPositions.size()) {
193 err(heading, test, expectedIndex, actualIndex-1);
198 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
199 err(heading, test, expectedIndex, actualIndex);
200 // Try to resync the positions of the indices, to avoid a rash of spurious errors.
201 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
// Positions agree; now the tag values have to agree as well.
209 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
210 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
// The line number is read with elementAti(), like every other integer read from these
// vectors (see err()), so that an int32_t is what gets passed to the %d conversion.
211 heading, fLineNum.elementAti(expectedIndex),
212 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
221 // err - An error was found. Report it, along with information about where the
222 // incorrectly broken test data appeared in the source file.
//   heading     - label placed at the front of the message.
//   test        - test object whose errln() receives the message.
//   expectedIdx - index into the expected-results vectors (and fLineNum).
//   actualIdx   - index into fActualBreakPositions.
// Two messages are possible: an unexpected (premature) break when actual < expected,
// or a failure to find the break at the end of the item.
224 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
226 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
227 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
229 int32_t line = fLineNum.elementAti(expectedIdx);
230 if (expectedIdx > 0) {
231 // The line numbers are off by one because a premature break occurs somewhere
232 // within the previous item, rather than at the start of the current (expected) item.
233 // We want to report the offset of the unexpected break from the start of
234 // this previous item.
// 'o' is the offset of the actual break relative to the end of the previous expected item.
// NOTE(review): its declaration and initial value are not visible in this listing.
235 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
237 if (actual < expected) {
238 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
240 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
// clearResults: discard the actual break positions and actual tags collected so far,
// leaving the expected data untouched so the same BITestData can be used for another run.
245 void BITestData::clearResults() {
246 fActualBreakPositions.removeAllElements();
247 fActualTags.removeAllElements();
251 //--------------------------------------------------------------------------------------
253 // RBBITest constructor and destructor
255 //--------------------------------------------------------------------------------------
// Default constructor of the test suite object.
// NOTE(review): the constructor body is not visible in this listing.
257 RBBITest::RBBITest() {
// Destructor of the test suite object.
// NOTE(review): the destructor body is not visible in this listing.
262 RBBITest::~RBBITest() {
265 //-----------------------------------------------------------------------------------
267 // Test for status {tag} return value from break rules.
268 // TODO: a more thorough test.
270 //-----------------------------------------------------------------------------------
// TestStatusReturn: builds a RuleBasedBreakIterator from an inline rule set whose rules
// carry {tag} status values, iterates over a fixed string, and checks every boundary
// position against bounds1[] and every rule status against brkStatus[].
// Both tables end with a -1 entry.
271 void RBBITest::TestStatusReturn() {
272 UnicodeString rulesString1("$Letters = [:L:];\n"
273 "$Numbers = [:N:];\n"
276 "Help\\ /me\\!{4};\n"
277 "[^$Letters $Numbers];\n"
278 "!.*;\n", -1, US_INV);
279 UnicodeString testString1 = "abc123..abc Help me Help me!";
280 // 01234567890123456789012345678
// Expected boundary offsets into testString1, and the expected rule status at each one.
281 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
282 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
284 UErrorCode status=U_ZERO_ERROR;
285 UParseError parseError;
// LocalPointer owns the iterator, so it is released when bi goes out of scope.
287 LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
288 if(U_FAILURE(status)) {
// Reported with dataerrln() rather than errln().
289 dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
294 bi->setText(testString1);
// Walk all boundaries; i indexes the expectation tables.
// NOTE(review): the declaration and increment of i and pos are not visible in this listing.
295 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
296 if (pos != bounds1[i]) {
297 errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
301 int tag = bi->getRuleStatus();
302 if (tag != brkStatus[i]) {
303 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
// printStringBreaks (UText overload): diagnostic dump written to stdout with printf().
// Prints one row per code point of tstr with its code, several property values and its
// character name, and prints a dashed separator line whenever the current index has
// reached the next entry of expected[].
//   tstr          - text to dump; its native index is reset to 0 and then advanced.
//   expected      - positions at which separators are wanted; consumed front to back.
//   expectedCount - number of entries in expected[].
311 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
312 UErrorCode status = U_ZERO_ERROR;
314 printf("code alpha extend alphanum type word sent line name\n");
315 int nextExpectedIndex = 0;
316 utext_setNativeIndex(tstr, 0);
// j is refreshed from the UText's native index, which utext_next32() below advances.
317 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
318 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
319 printf("------------------------------------------------ %d\n", j);
323 UChar32 c = utext_next32(tstr);
// NOTE(review): 'name' is declared on a line not visible here; 100 is passed as its capacity.
324 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
325 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
327 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
329 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
331 U_SHORT_PROPERTY_NAME),
332 u_getPropertyValueName(UCHAR_WORD_BREAK,
333 u_getIntPropertyValue(c,
335 U_SHORT_PROPERTY_NAME),
336 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
337 u_getIntPropertyValue(c,
338 UCHAR_SENTENCE_BREAK),
339 U_SHORT_PROPERTY_NAME),
340 u_getPropertyValueName(UCHAR_LINE_BREAK,
341 u_getIntPropertyValue(c,
343 U_SHORT_PROPERTY_NAME),
// printStringBreaks (UnicodeString overload): opens a read-only UText over ustr and
// forwards to the UText overload. If the UText cannot be opened, the error name is
// printed to stdout instead.
349 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
350 UErrorCode status = U_ZERO_ERROR;
// NOTE(review): the declaration of tstr is not visible in this listing.
352 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
353 if (U_FAILURE(status)) {
354 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
357 printStringBreaks(tstr, expected, expectedCount);
// TestBug3818: with a Thai word break iterator over four repetitions of the same
// four-character word, following(1) and following(0) are both expected to return 4,
// the start of the second word.
362 void RBBITest::TestBug3818() {
363 UErrorCode status = U_ZERO_ERROR;
365 // Four Thai words...
// NUL-terminated so the UnicodeString(const UChar*) constructor can find the end.
366 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
367 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
368 UnicodeString thaiStr(thaiWordData);
370 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
// Either a failure status or a null iterator is reported through errcheckln().
371 if (U_FAILURE(status) || bi == NULL) {
372 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
375 bi->setText(thaiStr);
// Starting from inside the first word.
377 int32_t startOfSecondWord = bi->following(1);
378 if (startOfSecondWord != 4) {
379 errln("Fail at file %s, line %d expected start of word at 4, got %d",
380 __FILE__, __LINE__, startOfSecondWord);
// Starting from the very beginning of the text.
382 startOfSecondWord = bi->following(0);
383 if (startOfSecondWord != 4) {
384 errln("Fail at file %s, line %d expected start of word at 4, got %d",
385 __FILE__, __LINE__, startOfSecondWord);
390 //----------------------------------------------------------------------------
392 // generalIteratorTest Given a break iterator and a set of test data,
393 // Run the tests and report the results.
395 //----------------------------------------------------------------------------
// generalIteratorTest: sets td.fDataToBreak as the iterator's text and then runs each of
// the individual API checks (first/next, last/previous, following, preceding,
// isBoundary, multiple selection) against the same iterator and test data.
//   bi - iterator under test.
//   td - test text plus expected results; the sub-tests store their actual results in it.
396 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
399 bi.setText(td.fDataToBreak);
401 testFirstAndNext(bi, td);
403 testLastAndPrevious(bi, td);
405 testFollowing(bi, td);
406 testPreceding(bi, td);
407 testIsBoundary(bi, td);
408 doMultipleSelectionTest(bi, td);
413 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
// testFirstAndNext: iterate forwards with first()/next() until DONE, appending every
// position and its rule status to td's actual-results vectors, then compare them with
// the expected data through td.checkResults().
// NOTE(review): the declarations of p and tag, and the statements implementing the
// "not making forward progress" guard, are not visible in this listing.
416 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
418 UErrorCode status = U_ZERO_ERROR;
423 logln("Test first and next");
424 bi.setText(td.fDataToBreak);
427 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
428 td.fActualBreakPositions.addElement(p, status); // Save result.
429 tag = bi.getRuleStatus();
430 td.fActualTags.addElement(tag, status);
432 // If the iterator is not making forward progress, stop.
433 // No need to raise an error here, it'll be detected in the normal check of results.
438 td.checkResults("testFirstAndNext", this);
443 // TestLastAndPrevious. Run the iterator backwards, starting with last().
// testLastAndPrevious: iterate backwards with last()/previous() until DONE. Each
// position and rule status is inserted at index 0 of td's actual-results vectors, so
// the vectors end up in ascending order for td.checkResults().
// NOTE(review): the declarations of p and tag, and the statements that use lastP for the
// "not making progress" guard, are not visible in this listing.
445 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
447 UErrorCode status = U_ZERO_ERROR;
// Starts out larger than any position the iterator is expected to return.
449 int32_t lastP = 0x7ffffffe;
452 logln("Test last and previous");
453 bi.setText(td.fDataToBreak);
456 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
457 // Save break position. Insert it at start of vector of results, shoving
458 // already-saved results further towards the end.
459 td.fActualBreakPositions.insertElementAt(p, 0, status);
460 // bi.previous(); // TODO: Why does this fix things up????
462 tag = bi.getRuleStatus();
463 td.fActualTags.insertElementAt(tag, 0, status);
465 // If the iterator is not making progress, stop.
466 // No need to raise an error here, it'll be detected in the normal check of results.
471 td.checkResults("testLastAndPrevious", this);
// testFollowing: collect break positions for every index i from 0 up to and including
// length+1, recording each newly reached position and its rule status, then verify the
// collected results with td.checkResults().
// NOTE(review): the statements that obtain p (before the loop and inside it) and the
// comparison against lastP are not visible in this listing.
475 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
477 UErrorCode status = U_ZERO_ERROR;
480 int32_t lastP = -2; // A value that will never be returned as a break position.
481 // cannot be -1; that is returned for DONE.
484 logln("testFollowing():");
485 bi.setText(td.fDataToBreak);
488 // Save the starting point, since we won't get that out of following.
490 td.fActualBreakPositions.addElement(p, status); // Save result.
491 tag = bi.getRuleStatus();
492 td.fActualTags.addElement(tag, status);
// The bound deliberately runs past the end of the text (length()+1).
494 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
497 if (p == RuleBasedBreakIterator::DONE) {
500 // We've reached a new break position. Save it.
501 td.fActualBreakPositions.addElement(p, status); // Save result.
502 tag = bi.getRuleStatus();
503 td.fActualTags.addElement(tag, status);
507 // The loop normally exits by means of the break in the middle.
508 // Make sure that the index was at the correct position for the break iterator to have
// DONE is only acceptable once i has reached the text length.
510 if (i != td.fDataToBreak.length()) {
511 errln("testFollowing(): iterator returned DONE prematurely.");
514 // Full check of all results.
515 td.checkResults("testFollowing", this);
// testPreceding: the mirror image of testFollowing. Indexes are visited from the text
// length down to -1; each newly reached position and its rule status is inserted at
// index 0 so that the result vectors end up in ascending order for td.checkResults().
// NOTE(review): the statements that obtain p, the use of lastP and the condition
// guarding the "prematurely" message are not visible in this listing.
520 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
521 UErrorCode status = U_ZERO_ERROR;
// Starts out larger than any position the iterator is expected to return.
524 int32_t lastP = 0x7ffffffe;
527 logln("testPreceding():");
528 bi.setText(td.fDataToBreak);
// Record the starting point before walking backwards.
532 td.fActualBreakPositions.addElement(p, status);
533 tag = bi.getRuleStatus();
534 td.fActualTags.addElement(tag, status);
// The bound deliberately runs past the start of the text (down to -1).
536 for (i = td.fDataToBreak.length(); i>=-1; i--) {
539 if (p == RuleBasedBreakIterator::DONE) {
542 // We've reached a new break position. Save it.
543 td.fActualBreakPositions.insertElementAt(p, 0, status);
545 tag = bi.getRuleStatus();
546 td.fActualTags.insertElementAt(tag, 0, status);
549 // The loop normally exits by means of the break in the middle.
550 // Make sure that the index was at the correct position for the break iterator to have
553 errln("testPreceding(): iterator returned DONE prematurely.");
556 // Full check of all results.
557 td.checkResults("testPreceding", this);
// testIsBoundary: ask isBoundary() about every offset from 0 through the text length
// (inclusive). Offsets reported as boundaries are recorded together with the rule
// status, and the collected results are verified with td.checkResults().
// NOTE(review): the declarations of i and tag are not visible in this listing.
562 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
563 UErrorCode status = U_ZERO_ERROR;
567 logln("testIsBoundary():");
568 bi.setText(td.fDataToBreak);
571 for (i = 0; i <= td.fDataToBreak.length(); i++) {
572 if (bi.isBoundary(i)) {
573 td.fActualBreakPositions.addElement(i, status); // Save result.
// The rule status is read immediately after the successful isBoundary() call.
574 tag = bi.getRuleStatus();
575 td.fActualTags.addElement(tag, status);
578 td.checkResults("testIsBoundary: ", this);
// doMultipleSelectionTest: cross-checks next(n) against repeated single steps, and the
// iterator comparison operators against a clone.
//   - A clone must compare equal to the original right after cloning.
//   - Forwards: for each step count, first() followed by next(count) on the clone must
//     land on the same offset the original reached by stepping with next().
//   - Once the original has moved on, the clone and the original must not compare equal.
//   - Backwards: the same check using last() and next(count) on the clone against
//     previous() on the original.
// NOTE(review): the declarations and updates of count and testOffset, the loop headers,
// and any release of testIterator are not visible in this listing.
583 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
585 iterator.setText(td.fDataToBreak);
// The result of clone() is cast to RuleBasedBreakIterator* with a C-style cast.
587 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
588 int32_t offset = iterator.first();
592 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
594 if (*testIterator != iterator)
595 errln("clone() or operator!= failed: two clones compared unequal");
// Restart the clone at the beginning and jump 'count' boundaries in one call.
598 testOffset = testIterator->first();
599 testOffset = testIterator->next(count);
600 if (offset != testOffset)
601 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
603 if (offset != RuleBasedBreakIterator::DONE) {
605 offset = iterator.next();
607 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
608 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
// Give up after too many failures so a broken operator== cannot flood the log.
609 if (count > 10000 || offset == -1) {
610 errln("operator== failed too many times. Stopping test.");
612 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
618 } while (offset != RuleBasedBreakIterator::DONE);
620 // now do it backwards...
621 offset = iterator.last();
625 testOffset = testIterator->last();
626 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
627 if (offset != testOffset)
628 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
630 if (offset != RuleBasedBreakIterator::DONE) {
632 offset = iterator.previous();
634 } while (offset != RuleBasedBreakIterator::DONE);
640 //---------------------------------------------
644 //---------------------------------------------
// TestEmptyString: runs the general iterator checks on empty text with a line break
// iterator for the default locale. The test data consists of a single empty chunk,
// which records one expected break at offset 0.
645 void RBBITest::TestEmptyString()
// NOTE(review): 'text' is not referenced by any other line visible in this listing.
647 UnicodeString text = "";
648 UErrorCode status = U_ZERO_ERROR;
650 BITestData x(status);
651 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
// The factory result is cast to RuleBasedBreakIterator* with a C-style cast.
652 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
653 if (U_FAILURE(status))
655 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
658 generalIteratorTest(*bi, x);
// TestGetAvailableLocales: fetches the locale list from BreakIterator::getAvailableLocales()
// and logs the name of every entry, which also touches each element of the returned array.
// An empty list is reported with dataerrln().
// NOTE(review): the condition guarding the dataerrln() call and the declaration of i are
// not visible in this listing.
662 void RBBITest::TestGetAvailableLocales()
// Receives the number of entries in locList.
664 int32_t locCount = 0;
665 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
668 dataerrln("getAvailableLocales() returned an empty list!");
669 // Just make sure that it's returning good memory.
671 for (i = 0; i < locCount; ++i) {
672 logln(locList[i].getName());
676 //Testing the BreakIterator::getDisplayName() function
// TestGetDisplayName: checks both getDisplayName() overloads.
//   - The one-locale overload is only verified when the default locale is US, because
//     the expected text "English (United States)" is English.
//   - The two-locale overload names the display locale explicitly (US), so
//     "French (France)" is expected regardless of the default locale.
// Mismatches are reported with dataerrln().
677 void RBBITest::TestGetDisplayName()
679 UnicodeString result;
681 BreakIterator::getDisplayName(Locale::getUS(), result);
682 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
683 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
686 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
687 if (result != "French (France)")
688 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
// TestEndBehaviour: with a word break iterator for the default locale over "boo.",
// expects a boundary at 0, one before the period, and one at the end of the string (4).
// NOTE(review): the condition guarding the "before period" message is not visible in
// this listing.
695 void RBBITest::TestEndBehaviour()
697 UErrorCode status = U_ZERO_ERROR;
698 UnicodeString testString("boo.");
699 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
700 if (U_FAILURE(status))
702 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
705 wb->setText(testString);
707 if (wb->first() != 0)
708 errln("Didn't get break at beginning of string.");
710 errln("Didn't get break before period in \"boo.\"");
// Because of the &&, next() is only called when current() is not already 4; the error
// is raised only if neither the current position nor the next one is 4.
711 if (wb->current() != 4 && wb->next() != 4)
712 errln("Didn't get break at end of string.");
// TestBug4153072: gives a word break iterator a StringCharacterIterator restricted to a
// sub-range of "...Hello, World!..." and checks isBoundary() for offsets from -1 up to
// begin. Only offset 0 is expected to be reported as a boundary.
// NOTE(review): the declarations of begin, index and onBoundary are not visible in this
// listing.
718 void RBBITest::TestBug4153072() {
719 UErrorCode status = U_ZERO_ERROR;
720 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
721 if (U_FAILURE(status))
723 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
726 UnicodeString str("...Hello, World!...");
// End of the sub-range: excludes the three trailing periods.
728 int32_t end = str.length() - 3;
// adoptText() is given the heap-allocated character iterator.
// presumably the break iterator takes ownership of it; confirm against the BreakIterator API.
731 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
732 iter->adoptText(textIterator);
734 // Note: with the switch to UText, there is no way to restrict the
735 // iteration range to begin at an index other than zero.
736 // String character iterators created with a non-zero bound are
737 // treated by RBBI as being empty.
738 for (index = -1; index < begin + 1; ++index) {
739 onBoundary = iter->isBoundary(index);
// Error when index 0 is not a boundary, or when any other index is one.
740 if (index == 0? !onBoundary : onBoundary) {
741 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
742 " and begin index = " + begin);
750 // Test for problem reported by Ashok Matoria on 9 July 2007
751 // One.<kSoftHyphen><kSpace>Two.
753 // Sentence break at start (0) and then on calling next() it breaks at
754 // 'T' of "Two". Now, at this point if I do next() and
755 // then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
// TestBug5775: regression test for sentence breaking around a soft hyphen.
// Using an English sentence break iterator on "One.<U+00AD> Two.", the positions
// returned are asserted to be 6, then 10, and then 6 again after previous().
// NOTE(review): the statements that set the text, unescape the string and produce the
// position compared with 10 are not visible in this listing.
757 void RBBITest::TestBug5775() {
758 UErrorCode status = U_ZERO_ERROR;
759 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
// The macro only reports; the explicit U_FAILURE check that follows handles the failure case.
760 TEST_ASSERT_SUCCESS(status);
761 if (U_FAILURE(status)) {
764 // Check for status first for better handling of no data errors.
765 TEST_ASSERT(bi != NULL);
// The soft hyphen is written as a \u escape inside an invariant-character string.
770 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
774 int pos = bi->next();
775 TEST_ASSERT(pos == 6);
777 TEST_ASSERT(pos == 10);
778 pos = bi->previous();
779 TEST_ASSERT(pos == 6);
785 //------------------------------------------------------------------------------
787 // RBBITest::Extended Run RBBI Tests from an external test data file
789 //------------------------------------------------------------------------------
// Members of TestParams: the state shared between the test-data parser and executeTest().
792 BreakIterator *bi; // Break iterator is set while parsing test source.
793 // Changed out whenever test data changes break type.
795 UnicodeString dataToBreak; // Data that is built up while parsing the test.
796 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
797 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
800 UText *textToBreak; // UText, could be UTF8 or UTF16.
801 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
802 CharString utf8String; // UTF-8 form of text to break.
// Constructor: allocates the UVector32 members with new, passing status to each.
// NOTE(review): the results of new are not checked on the lines visible here, and the
// initialization of bi and textToBreak is not visible in this listing.
804 TestParams(UErrorCode &status) : dataToBreak() {
806 expectedBreaks = new UVector32(status);
807 srcLine = new UVector32(status);
808 srcCol = new UVector32(status);
810 textMap = new UVector32(status);
// Cleanup of the heap-allocated members and of the UText.
// NOTE(review): the destructor header and the remaining cleanup lines are not visible here.
815 delete expectedBreaks;
818 utext_close(textToBreak);
// Lookups keyed by an offset bp into textToBreak; each maps bp through textMap first.
822 int32_t getSrcLine(int32_t bp);
823 int32_t getExpectedBreak(int32_t bp);
824 int32_t getSrcCol(int32_t bp);
// (Re)open textToBreak over dataToBreak as UTF-16 or as UTF-8, and rebuild textMap.
826 void setUTF16(UErrorCode &status);
827 void setUTF8(UErrorCode &status);
830 // Append a UnicodeString to a CharString with UTF-8 encoding.
831 // Substitute any invalid chars.
832 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
// CharStringAppend: convert src to UTF-8 and append it to dest in two passes.
//   1. Preflight with a NULL output buffer to learn the required UTF-8 length.
//   2. Convert into an append buffer obtained from dest, then commit it with append().
// U+FFFD is supplied as the substitution character in both passes.
//   dest   - receives the appended UTF-8 bytes.
//   src    - UTF-16 text to convert.
//   status - nothing is done if it already indicates failure on entry.
// NOTE(review): the declarations of utf8Length and capacity are not visible in this listing.
833 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
834 if (U_FAILURE(status)) {
838 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
839 src.getBuffer(), src.length(), // UTF-16 data
840 0xfffd, NULL, // Substitution char, number of subs.
// A buffer overflow from the preflight call is treated as expected; other failures are not.
842 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
// Clear the preflight status before the real conversion.
845 status = U_ZERO_ERROR;
847 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
848 u_strToUTF8WithSub(buffer, utf8Length, NULL,
849 src.getBuffer(), src.length(),
850 0xfffd, NULL, &status);
851 dest.append(buffer, utf8Length, status);
// setUTF16: (re)open textToBreak as a UText over dataToBreak and rebuild textMap.
// textMap gets one entry per UTF-16 code unit plus a final entry for the end of text:
// the index itself where i is the start of a code point, and -1 elsewhere.
855 void TestParams::setUTF16(UErrorCode &status) {
// The existing textToBreak is passed back in as the first argument of the open call.
856 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
857 textMap->removeAllElements();
858 for (int32_t i=0; i<dataToBreak.length(); i++) {
// True when i is the first code unit of its code point.
859 if (i == dataToBreak.getChar32Start(i)) {
860 textMap->addElement(i, status);
862 textMap->addElement(-1, status);
// Entry for the end-of-text position.
865 textMap->addElement(dataToBreak.length(), status);
866 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
// setUTF8: convert dataToBreak to UTF-8 into utf8String, (re)open textToBreak over those
// bytes, and rebuild textMap so that it is indexed by UTF-8 byte offset.
// At each code point start the map holds the corresponding UTF-16 index; the following
// bytes of a multi-byte sequence are filled with -1. The final assert expects one entry
// per byte plus one for the end of text.
// NOTE(review): the loop header, its exit test and any clearing of utf8String are not
// visible in this listing.
870 void TestParams::setUTF8(UErrorCode &status) {
871 if (U_FAILURE(status)) {
875 CharStringAppend(utf8String, dataToBreak, status);
876 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
877 if (U_FAILURE(status)) {
881 textMap->removeAllElements();
882 int32_t utf16Index = 0;
884 textMap->addElement(utf16Index, status);
885 UChar32 c32 = utext_current32(textToBreak);
// Advance the UTF-16 index by the number of UTF-16 code units this code point needs.
889 utf16Index += U16_LENGTH(c32);
890 utext_next32(textToBreak);
// Pad with -1 until the map size reaches the UText's native index after the advance.
891 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
892 textMap->addElement(-1, status);
895 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
// getSrcLine: return the entry of srcLine that corresponds to offset bp of textToBreak.
// An offset at or beyond the end of textMap is clamped to the last entry; bp is then
// walked backwards, mapping each candidate through textMap, and the mapped index i is
// used to index srcLine.
// NOTE(review): the declaration of i and the loop's exit test are not visible in this listing.
899 int32_t TestParams::getSrcLine(int32_t bp) {
900 if (bp >= textMap->size()) {
901 bp = textMap->size() - 1;
904 for(; bp >= 0 ; --bp) {
905 // Move to a character boundary if we are not on one already.
906 i = textMap->elementAti(bp);
911 return srcLine->elementAti(i);
// getExpectedBreak: return the entry of expectedBreaks that corresponds to offset bp of
// textToBreak. bp is mapped through textMap to an index i, which indexes expectedBreaks.
// NOTE(review): the handling of an out-of-range bp and of the mapped index, and the
// declaration and return of retVal, are not visible in this listing.
915 int32_t TestParams::getExpectedBreak(int32_t bp) {
916 if (bp >= textMap->size()) {
919 int32_t i = textMap->elementAti(bp);
922 retVal = expectedBreaks->elementAti(i);
// getSrcCol: return the entry of srcCol that corresponds to offset bp of textToBreak.
// Same lookup scheme as getSrcLine(): clamp bp to the last textMap entry, walk backwards
// while mapping through textMap, then index srcCol with the mapped index i.
// NOTE(review): the declaration of i and the loop's exit test are not visible in this listing.
928 int32_t TestParams::getSrcCol(int32_t bp) {
929 if (bp >= textMap->size()) {
930 bp = textMap->size() - 1;
933 for(; bp >= 0; --bp) {
934 // Move bp to a character boundary if we are not on one already.
935 i = textMap->elementAti(bp);
940 return srcCol->elementAti(i);
// executeTest: run the break iterator t->bi over t->textToBreak and compare everything it
// reports with the expected-break data reached through t->getExpectedBreak().
//   t      - test parameters: iterator, text (UText), expected breaks, source positions.
//   status - checked on entry with TEST_ASSERT_SUCCESS and U_FAILURE; also receives the
//            result of setText().
// Checks performed, in order:
//   1. forward iteration with first()/next(), including the rule status of each break;
//   2. reverse iteration with last()/previous(), including the rule status;
//   3. isBoundary() at every offset;
//   4. following() from every offset;
//   5. preceding() from every offset.
// getExpectedBreak() returning 0 is treated as "no break expected here". The value -1 is
// special-cased before the rule status comparison.
// NOTE(review): the statement executed for the -1 case is not visible in this listing.
// Every mismatch is reported through errln() with the source line and column taken from
// t->getSrcLine() and t->getSrcCol().
944 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
949 TEST_ASSERT_SUCCESS(status);
950 if (U_FAILURE(status)) {
958 t->bi->setText(t->textToBreak, status);
960 // Run the iterator forward
963 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
965 // Fail for lack of forward progress.
966 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
967 bp, t->getSrcLine(bp), t->getSrcCol(bp));
971 // Check that we didn't miss an expected break between the last one
973 for (i=prevBP+1; i<bp; i++) {
974 if (t->getExpectedBreak(i) != 0) {
975 int expected[] = {0, i};
976 printStringBreaks(t->dataToBreak, expected, 2);
977 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
978 i, t->getSrcLine(i), t->getSrcCol(i));
982 // Check that the break we did find was expected
983 if (t->getExpectedBreak(bp) == 0) {
984 int expected[] = {0, bp};
985 printStringBreaks(t->textToBreak, expected, 2);
986 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
987 bp, t->getSrcLine(bp), t->getSrcCol(bp));
989 // The break was expected.
990 // Check that the {nnn} tag value is correct.
991 int32_t expectedTagVal = t->getExpectedBreak(bp);
992 if (expectedTagVal == -1) {
995 int32_t line = t->getSrcLine(bp);
996 int32_t rs = t->bi->getRuleStatus();
997 if (rs != expectedTagVal) {
998 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
999 " Actual, Expected status = %4d, %4d",
1000 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1007 // Verify that there were no missed expected breaks after the last one found
1008 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1009 if (t->getExpectedBreak(i) != 0) {
1010 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1011 i, t->getSrcLine(i), t->getSrcCol(i));
1016 // Run the iterator backwards, verify that the same breaks are found.
1018 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
1019 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1021 // Fail for lack of progress.
1022 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1023 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1027 // Check that we didn't miss an expected break between the last one
1028 // and this one. (UVector returns zeros for index out of bounds.)
1029 for (i=prevBP-1; i>bp; i--) {
1030 if (t->getExpectedBreak(i) != 0) {
1031 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1032 i, t->getSrcLine(i), t->getSrcCol(i));
1036 // Check that the break we did find was expected
1037 if (t->getExpectedBreak(bp) == 0) {
1038 errln("Reverse Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1039 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1041 // The break was expected.
1042 // Check that the {nnn} tag value is correct.
1043 int32_t expectedTagVal = t->getExpectedBreak(bp);
1044 if (expectedTagVal == -1) {
1047 int line = t->getSrcLine(bp);
1048 int32_t rs = t->bi->getRuleStatus();
1049 if (rs != expectedTagVal) {
1050 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1051 " Actual, Expected status = %4d, %4d",
1052 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1059 // Verify that there were no missed breaks prior to the last one found
1060 for (i=prevBP-1; i>=0; i--) {
1061 if (t->getExpectedBreak(i) != 0) {
// This check belongs to the reverse pass, so the message names the reverse direction.
1062 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1063 i, t->getSrcLine(i), t->getSrcCol(i));
1067 // Check isBoundary()
1068 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1069 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1070 UBool boundaryFound = t->bi->isBoundary(i);
1071 if (boundaryExpected != boundaryFound) {
1072 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1073 " Expected, Actual= %s, %s",
1074 i, t->getSrcLine(i), t->getSrcCol(i),
1075 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1079 // Check following()
1080 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1081 int32_t actualBreak = t->bi->following(i);
1082 int32_t expectedBreak = BreakIterator::DONE;
// Search the expected data for the first break strictly after i.
1083 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1084 if (t->getExpectedBreak(j) != 0) {
1089 if (expectedBreak != actualBreak) {
1090 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1091 " Expected, Actual= %d, %d",
1092 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1096 // Check preceding()
1097 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1098 int32_t actualBreak = t->bi->preceding(i);
1099 int32_t expectedBreak = BreakIterator::DONE;
1101 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1102 // preceding(trailing byte) will return the index of some preceding code point,
1103 // not the lead byte of the current code point, even though that has a smaller index.
1104 // Therefore, start looking at the expected break data not at i-1, but at
1105 // the start of code point index - 1.
1106 utext_setNativeIndex(t->textToBreak, i);
1107 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1108 for (; j >= 0; j--) {
1109 if (t->getExpectedBreak(j) != 0) {
1114 if (expectedBreak != actualBreak) {
1115 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1116 " Expected, Actual= %d, %d",
1117 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
// TestExtended: data-driven break iterator test.
//
// Reads "rbbitst.txt" from the source test data directory as UTF-8 and walks it one
// UChar at a time with a small state machine (parseState takes the values PARSE_TAG,
// PARSE_DATA, PARSE_NUM and PARSE_COMMENT):
//   - Tag state:  <word>, <char>, <line>, <sent> and <title> create the matching
//                 BreakIterator for the current locale, <locale name> changes the
//                 locale, and <data> starts collecting a test case.
//   - Data state: characters are appended to tp.dataToBreak. A bullet (U+2022) or "<>"
//                 records an expected break with value -1 at the current data length,
//                 a numeric tag records the parsed tagValue, and "</data>" runs
//                 executeTest() on the collected case (twice; see below).
//   - Comment state: input is skipped up to the next CR/LF, then savedState is restored.
// The whole body is compiled only when regular expressions are available.
1123 void RBBITest::TestExtended() {
1124 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1125 UErrorCode status = U_ZERO_ERROR;
1128 UnicodeString rules;
// tp carries the iterator under test (tp.bi), the text (tp.dataToBreak), the expected
// break values (tp.expectedBreaks) and the data-offset -> source line/column mappings
// (tp.srcLine / tp.srcCol) used for error messages.
1129 TestParams tp(status);
// Matches a <locale name> tag; capture group 1 is the locale name.
1131 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1132 if (U_FAILURE(status)) {
1133 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1138 // Open and read the test data file.
1140 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1141 char testFileName[1000];
// NOTE(review): this bound covers only the directory string. The strcat() of the file
// name below is not included, so a directory within a few bytes of sizeof(testFileName)
// would overflow the buffer - confirm, or include the file name length in the check.
1142 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1143 errln("Can't open test data. Path too long.");
1146 strcpy(testFileName, testDataDirectory);
1147 strcat(testFileName, "rbbitst.txt");
// ReadAndConvertFile reports its own errors; on failure there is nothing more to do here.
1150 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1151 if (U_FAILURE(status)) {
1152 return; /* something went wrong, error already output */
1156 bool skipTest = false; // Skip this test?
1159 // Put the test data into a UnicodeString
// (Constructed directly over the testFile buffer with explicit length len; presumably
// the aliasing UnicodeString constructor, so testFile must outlive testString - confirm.)
1161 UnicodeString testString(FALSE, testFile, len);
// Parsing starts out expecting a tag. savedState is the state to return to when a
// comment ends.
1169 parseState = PARSE_TAG;
1171 EParseState savedState = PARSE_TAG;
// Code units with special meaning in the test file syntax.
1173 static const UChar CH_LF = 0x0a;
1174 static const UChar CH_CR = 0x0d;
1175 static const UChar CH_HASH = 0x23;
1176 /*static const UChar CH_PERIOD = 0x2e;*/
1177 static const UChar CH_LT = 0x3c;
1178 static const UChar CH_GT = 0x3e;
1179 static const UChar CH_BACKSLASH = 0x5c;
1180 static const UChar CH_BULLET = 0x2022;
// Position tracking in the test file, used for diagnostics. column is computed below
// as charIdx - colStart + 1.
1182 int32_t lineNum = 1;
1183 int32_t colStart = 0;
1185 int32_t charIdx = 0;
1187 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
// Main scan loop. There is no increment in the for header; charIdx is advanced inside
// the body, by varying amounts depending on what was recognized.
1189 for (charIdx = 0; charIdx < len; ) {
1190 status = U_ZERO_ERROR;
1191 UChar c = testString.charAt(charIdx);
// NOTE(review): this check (and the charIdx-1 offsets used for tag matching below)
// relies on charIdx already having been advanced past c at this point; otherwise it
// would compare c with itself.
1193 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1194 // treat CRLF as a unit
1198 if (c == CH_LF || c == CH_CR) {
1202 column = charIdx - colStart + 1;
1204 switch (parseState) {
// Comment state: a line end terminates the comment and restores the saved state.
1206 if (c == 0x0a || c == 0x0d) {
1207 parseState = savedState;
// Tag state: switch to comment state, returning to tag parsing afterwards.
1214 parseState = PARSE_COMMENT;
1215 savedState = PARSE_TAG;
// Tag state: white space between tags is tested for here.
1218 if (u_isUWhiteSpace(c)) {
// Iterator-type tags. Each tag begins at c, i.e. at index charIdx-1, and replaces
// tp.bi with a new iterator of that type for the current locale.
1221 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1223 tp.bi = BreakIterator::createWordInstance(locale, status);
1228 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1230 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1235 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1237 tp.bi = BreakIterator::createLineInstance(locale, status);
1242 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1244 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1249 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1251 tp.bi = BreakIterator::createTitleInstance(locale, status);
1256 // <locale loc_name>
1257 localeMatcher.reset(testString);
1258 if (localeMatcher.lookingAt(charIdx-1, status)) {
1259 UnicodeString localeName = localeMatcher.group(1, status);
// Convert the matched name to a char* string for Locale::createFromName().
1260 char localeName8[100];
1261 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1262 locale = Locale::createFromName(localeName8);
// Skip the rest of the tag; -1 because the tag's first char sits at charIdx-1.
1263 charIdx += localeMatcher.group(0, status).length() - 1;
1264 TEST_ASSERT_SUCCESS(status);
// <data> begins a new test case: switch to data state and clear everything collected
// for the previous case.
1267 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1268 parseState = PARSE_DATA;
1270 tp.dataToBreak = "";
1271 tp.expectedBreaks->removeAllElements();
1272 tp.srcCol ->removeAllElements();
1273 tp.srcLine->removeAllElements();
// Anything else in tag state is a syntax error in the test file; abandon the test.
1277 errln("line %d: Tag expected in test file.", lineNum);
1278 parseState = PARSE_COMMENT;
1279 savedState = PARSE_DATA;
1280 goto end_test; // Stop the test.
// Data state: a bullet marks an expected break (value -1) before the next data char.
// The three vectors are grown to breakIdx+1 and the slot at breakIdx is filled in.
1285 if (c == CH_BULLET) {
1286 int32_t breakIdx = tp.dataToBreak.length();
1287 tp.expectedBreaks->setSize(breakIdx+1);
1288 tp.expectedBreaks->setElementAt(-1, breakIdx);
1289 tp.srcLine->setSize(breakIdx+1);
1290 tp.srcLine->setElementAt(lineNum, breakIdx);
1291 tp.srcCol ->setSize(breakIdx+1);
1292 tp.srcCol ->setElementAt(column, breakIdx);
// Data state: </data> ends the test case and runs it.
1296 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1297 // Add final entry to mappings from break location to source file position.
1298 // Need one extra because last break position returned is after the
1299 // last char in the data, not at the last char.
1300 tp.srcLine->addElement(lineNum, status);
1301 tp.srcCol ->addElement(column, status);
1303 parseState = PARSE_TAG;
// First run: text set up via tp.setUTF16().
1308 status = U_ZERO_ERROR;
1309 tp.setUTF16(status);
1310 executeTest(&tp, status);
1311 TEST_ASSERT_SUCCESS(status);
1313 // Run again, this time with UTF-8 text wrapped in a UText.
1314 status = U_ZERO_ERROR;
1316 TEST_ASSERT_SUCCESS(status);
1317 executeTest(&tp, status);
// Data state: \N{CHARACTER NAME} escape. The escape starts at charIdx-1, so the name
// itself starts at charIdx+2 and runs up to the closing '}'.
1322 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1323 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1324 // Get the code point from the name and insert it into the test data.
1325 // (Damn, no API takes names in Unicode !!!
1326 // we've got to take it back to char *)
1327 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1328 int32_t nameLength = nameEndIdx - (charIdx+2);
1329 char charNameBuf[200];
1330 UChar32 theChar = -1;
1331 if (nameEndIdx != -1) {
// NOTE(review): this local status shadows the function-level status, so a failure
// from u_charFromName() does not reach the error check at the bottom of the loop.
1332 UErrorCode status = U_ZERO_ERROR;
1333 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
// Force termination in case the extracted name filled the whole buffer.
1334 charNameBuf[sizeof(charNameBuf)-1] = 0;
1335 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1336 if (U_FAILURE(status)) {
1340 if (theChar == -1) {
1341 errln("Error in named character in test file at line %d, col %d",
1344 // Named code point was recognized. Insert it
1345 // into the test data.
1346 tp.dataToBreak.append(theChar);
// One mapping entry per UTF-16 code unit appended (the loop adds entries until the
// mapping is as long as the data).
1347 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1348 tp.srcLine->addElement(lineNum, status);
1349 tp.srcCol ->addElement(column, status);
// Resume scanning just after the closing '}'.
1352 if (nameEndIdx > charIdx) {
1353 charIdx = nameEndIdx+1;
// Data state: "<>" marks an expected break with value -1, like the bullet above.
1362 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1364 int32_t breakIdx = tp.dataToBreak.length();
1365 tp.expectedBreaks->setSize(breakIdx+1);
1366 tp.expectedBreaks->setElementAt(-1, breakIdx);
1367 tp.srcLine->setSize(breakIdx+1);
1368 tp.srcLine->setElementAt(lineNum, breakIdx);
1369 tp.srcCol ->setSize(breakIdx+1);
1370 tp.srcCol ->setElementAt(column, breakIdx);
// Data state: switch to number state to parse a numeric <nnn> tag.
1376 parseState = PARSE_NUM;
// Data state: '#' starts a comment inside a data section, but only when column==3.
1380 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1381 parseState = PARSE_COMMENT;
1382 savedState = PARSE_DATA;
1386 if (c == CH_BACKSLASH) {
1387 // Check for \ at end of line, a line continuation.
1388 // Advance over (discard) the newline
1389 UChar32 cp = testString.char32At(charIdx);
1390 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1392 // Need an extra increment of the input ptr to move over both of them
1395 if (cp == CH_LF || cp == CH_CR) {
1402 // Let unescape handle the back slash.
// unescapeAt() takes charIdx by reference; it is assumed to advance it past the escape.
1403 cp = testString.unescapeAt(charIdx);
1405 // Escape sequence was recognized. Insert the char
1406 // into the test data.
1407 tp.dataToBreak.append(cp);
1408 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1409 tp.srcLine->addElement(lineNum, status);
1410 tp.srcCol ->addElement(column, status);
1416 // Not a recognized backslash escape sequence.
1417 // Take the next char as a literal.
1418 // TODO: Should this be an error?
1419 c = testString.charAt(charIdx);
1420 charIdx = testString.moveIndex32(charIdx, 1);
1423 // Normal, non-escaped data char.
1424 tp.dataToBreak.append(c);
1426 // Save the mapping from offset in the data to line/column numbers in
1427 // the original input file. Will be used for better error messages only.
1428 // If there's an expected break before this char, the slot in the mapping
1429 // vector will already be set for this char; don't overwrite it.
1430 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1431 tp.srcLine->addElement(lineNum, status);
1432 tp.srcCol ->addElement(column, status);
1438 // We are parsing an expected numeric tag value, like <1234>,
1439 // within a chunk of data.
1440 if (u_isUWhiteSpace(c)) {
1445 // Finished the number. Add the info to the expected break data,
1446 // and switch parse state back to doing plain data.
1447 parseState = PARSE_DATA;
1448 if (tagValue == 0) {
1451 int32_t breakIdx = tp.dataToBreak.length();
1452 tp.expectedBreaks->setSize(breakIdx+1);
1453 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1454 tp.srcLine->setSize(breakIdx+1);
1455 tp.srcLine->setElementAt(lineNum, breakIdx);
1456 tp.srcCol ->setSize(breakIdx+1);
1457 tp.srcCol ->setElementAt(column, breakIdx);
// Number state: accumulate one more decimal digit into tagValue.
1462 tagValue = tagValue*10 + u_charDigitValue(c);
// Number state: anything else inside a numeric tag is a syntax error; abandon the test.
1466 errln("Syntax Error in test file at line %d, col %d",
1468 parseState = PARSE_COMMENT;
1469 goto end_test; // Stop the test
// End of one loop iteration: any ICU failure recorded in status while handling this
// character stops the whole test.
1474 if (U_FAILURE(status)) {
1475 dataerrln("ICU Error %s while parsing test file at line %d.",
1476 u_errorName(status), lineNum);
1477 status = U_ZERO_ERROR;
1478 goto end_test; // Stop the test
1489 //-------------------------------------------------------------------------------
1491 // TestDictRules create a break iterator from source rules that includes a
1492 // dictionary range. Regression for bug #7130. Source rules
1493 // do not declare a break iterator type (word, line, sentence, etc.),
1494 // but the dictionary code, without a type, would loop.
1496 //-------------------------------------------------------------------------------
1497 void RBBITest::TestDictRules() {
// Rule source: $dictionary is defined as [a-z], then used in pair rules.
1498 const char *rules = "$dictionary = [a-z]; \n"
1500 "$dictionary $dictionary; \n"
1502 "$dictionary $dictionary; \n";
1503 const char *text = "aa";
1504 UErrorCode status = U_ZERO_ERROR;
1505 UParseError parseError;
// Build the iterator from the rule source; parse problems are reported through
// parseError and status.
1507 RuleBasedBreakIterator bi(rules, parseError, status);
1508 if (U_SUCCESS(status)) {
1509 UnicodeString utext = text;
// Call next() at most 10 times, so that an iterator that never reaches DONE cannot
// hang the test.
1513 for (loops = 0; loops<10; loops++) {
1514 position = bi.next();
1515 if (position == RuleBasedBreakIterator::DONE) {
// Presumably the loop exits when DONE is seen, in which case loops == 1 means exactly
// one boundary was returned before DONE - confirm against the loop body.
1519 TEST_ASSERT(loops == 1);
// Failure to build the iterator is reported as a data error.
1521 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1527 //-------------------------------------------------------------------------------
1529 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1530 // return the data in one big UChar * buffer, which the caller must delete.
1533 // fileName: the name of the file, with no directory part. The test data directory
//           NOTE(review): the callers in this file pass a path that already has the
//           test data directory prepended, and fileName is handed straight to
//           fopen(); the "no directory part" description looks stale - confirm.
1535 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1536 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1537 // specified here. The BOM, if it exists, will be stripped from the returned data.
1538 // Pass NULL for the system default encoding.
// status   ICU error code, in/out. The function checks it on entry, sets
//          U_FILE_ACCESS_ERROR when the file cannot be opened, and passes it to the
//          converter calls.
1541 // The file data, converted to UChar.
1542 // The caller must delete this when done with
1543 // delete [] theBuffer;
1545 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1546 // Move this function to some common place.
1548 //--------------------------------------------------------------------------------
1549 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1550 UChar *retPtr = NULL;
1551 char *fileBuf = NULL;
1552 UConverter* conv = NULL;
// Incoming status is checked before anything else is done.
1556 if (U_FAILURE(status)) {
// Open in binary mode so the raw bytes reach the converter unchanged.
1563 f = fopen(fileName, "rb");
1565 dataerrln("Error opening test data file %s\n", fileName);
1566 status = U_FILE_ACCESS_ERROR;
// Determine the file size by seeking to the end, then read the whole file into fileBuf.
// NOTE(review): fileSize is used to size the allocation before the "fileSize <= 0"
// check below runs; ftell() returns -1 on error, so the check should come first -
// confirm.
1575 fseek( f, 0, SEEK_END);
1576 fileSize = ftell(f);
1577 fileBuf = new char[fileSize];
1578 fseek(f, 0, SEEK_SET);
1579 amt_read = fread(fileBuf, 1, fileSize, f);
1580 if (amt_read != fileSize || fileSize <= 0) {
1581 errln("Error reading test data file.");
1582 goto cleanUpAndReturn;
1586 // Look for a Unicode Signature (BOM) on the data just read
1588 int32_t signatureLength;
1589 const char * fileBufC;
1590 const char* bomEncoding;
1593 bomEncoding = ucnv_detectUnicodeSignature(
1594 fileBuf, fileSize, &signatureLength, &status);
// A detected BOM overrides the caller's encoding and is skipped: the read pointer
// moves past it and the remaining byte count shrinks accordingly.
// (fileBufC is presumably set to fileBuf before this point - confirm.)
1595 if(bomEncoding!=NULL ){
1596 fileBufC += signatureLength;
1597 fileSize -= signatureLength;
1598 encoding = bomEncoding;
1602 // Open a converter to take the rule file to UTF-16
1604 conv = ucnv_open(encoding, &status);
1605 if (U_FAILURE(status)) {
1606 goto cleanUpAndReturn;
1610 // Convert the rules to UChar.
1611 // Preflight first to determine required buffer size.
1613 ulen = ucnv_toUChars(conv,
1619 if (status == U_BUFFER_OVERFLOW_ERROR) {
1620 // Buffer Overflow is expected from the preflight operation.
1621 status = U_ZERO_ERROR;
// Allocate the result with one UChar more than the preflighted length
// (presumably room for a terminating NUL - confirm).
1623 retPtr = new UChar[ulen+1];
1636 if (U_FAILURE(status)) {
1637 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1647 //--------------------------------------------------------------------------------------------
1649 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
//
// For each of the four boundary kinds (grapheme, word, sentence, line) an English-locale
// break iterator is created and, if creation succeeded, handed to runUnicodeTestData()
// together with the matching test data file name.
1651 //-------------------------------------------------------------------------------------------
1652 void RBBITest::TestUnicodeFiles() {
1653 RuleBasedBreakIterator *bi;
1654 UErrorCode status = U_ZERO_ERROR;
// NOTE(review): the C-style casts below assume the factory functions return a
// RuleBasedBreakIterator; nothing here verifies that.
1656 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1657 TEST_ASSERT_SUCCESS(status);
1658 if (U_SUCCESS(status)) {
1659 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1663 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1664 TEST_ASSERT_SUCCESS(status);
1665 if (U_SUCCESS(status)) {
1666 runUnicodeTestData("WordBreakTest.txt", bi);
1670 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1671 TEST_ASSERT_SUCCESS(status);
1672 if (U_SUCCESS(status)) {
1673 runUnicodeTestData("SentenceBreakTest.txt", bi);
1677 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1678 TEST_ASSERT_SUCCESS(status);
1679 if (U_SUCCESS(status)) {
1680 runUnicodeTestData("LineBreakTest.txt", bi);
1686 // Check for test cases from the Unicode test data files that are known to fail
1687 // and should be skipped because ICU is not yet able to fully implement the spec.
1688 // See ticket #7270.
//
// testCase: the text of one test case, as parsed from the data file.
// fileName: name of the data file the case came from; the known-issue cases are
//           compared against "LineBreakTest.txt".
// For a test case equal to one of the listed strings, the result of
// logKnownIssue("7270") is returned.
1690 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
// Each entry is a NUL-terminated three-character string: U+200B, U+0020 and one
// further character.
1691 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
1692 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
1693 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
1694 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
1695 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
1696 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
1697 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
// Files other than LineBreakTest.txt take this branch.
1699 if (strcmp(fileName, "LineBreakTest.txt") != 0) {
// Compare the test case against each known-bad string.
1703 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1704 if (testCase == UnicodeString(badTestCases[i])) {
1705 return logKnownIssue("7270");
1712 //--------------------------------------------------------------------------------------------
1714 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
//
// fileName: name of the data file, relative to the source test data directory.
// bi:       the break iterator to check against the data in that file.
//
// The file is read as UTF-8 and tokenized with a regular expression. Hex code points
// are appended to the current test string, divide signs record expected break
// positions, and each end of line runs the accumulated case through
// checkUnicodeTestCase() unless testCaseIsKnownIssue() says to skip it.
1716 //-------------------------------------------------------------------------------------------
1717 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1718 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1719 UErrorCode status = U_ZERO_ERROR;
1722 // Open and read the test data file, put it into a UnicodeString.
1724 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1725 char testFileName[1000];
// NOTE(review): only the directory length is bounded here; the strcat() of fileName
// below is not covered by this check and could overflow testFileName - confirm.
1726 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1727 dataerrln("Can't open test data. Path too long.");
1730 strcpy(testFileName, testDataDirectory);
1731 strcat(testFileName, fileName);
1733 logln("Opening data file %s\n", fileName);
1736 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
// A missing file has already been reported by ReadAndConvertFile, so the assertions
// are skipped for U_FILE_ACCESS_ERROR.
1737 if (status != U_FILE_ACCESS_ERROR) {
1738 TEST_ASSERT_SUCCESS(status);
1739 TEST_ASSERT(testFile != NULL);
1741 if (U_FAILURE(status) || testFile == NULL) {
1742 return; /* something went wrong, error already output */
// Wrap the converted buffer in a UnicodeString for the regex matcher.
1744 UnicodeString testFileAsString(TRUE, testFile, len);
1747 // Parse the test data file using a regular expression.
1748 // Each kind of token is recognized in its own capture group; what type of item was scanned
1749 // is identified by which group had a match.
1751 // Capture Group # 1 2 3 4 5
1752 // Parses this item: divide x hex digits comment \n unrecognized \n
1754 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1755 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
// Accumulators for the test case currently being parsed.
1756 UnicodeString testString;
1757 UVector32 breakPositions(status);
1759 TEST_ASSERT_SUCCESS(status);
1760 if (U_FAILURE(status)) {
1765 // Scan through each test case, building up the string to be broken in testString,
1766 // and the positions that should be boundaries in the breakPositions vector.
1769 while (tokenMatcher.find()) {
1770 if(tokenMatcher.hitEnd()) {
1771 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1772 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1773 and caused an infinite loop here on EBCDIC systems!
1775 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1778 if (tokenMatcher.start(1, status) >= 0) {
1779 // Scanned a divide sign, indicating a break position in the test data.
// A divide sign seen while the string is still empty (position 0) is not recorded.
1780 if (testString.length()>0) {
1781 breakPositions.addElement(testString.length(), status);
1784 else if (tokenMatcher.start(2, status) >= 0) {
1785 // Scanned an 'x', meaning no break at this position in the test data
1786 // Nothing to be done here.
1788 else if (tokenMatcher.start(3, status) >= 0) {
1789 // Scanned Hex digits. Convert them to binary, append to the character data string.
1790 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1791 int length = hexNumber.length();
// Copy the digits into a char buffer so strtol() can parse them as base 16.
1794 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1795 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1797 testString.append(c);
1799 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1800 fileName, lineNumber);
1803 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1804 fileName, lineNumber);
1807 else if (tokenMatcher.start(4, status) >= 0) {
1808 // Scanned to end of a line, possibly skipping over a comment in the process.
1809 // If the line from the file contained test data, run the test now.
1810 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1811 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1814 // Clear out this test case.
1815 // The string and breakPositions vector will be refilled as the next
1816 // test case is parsed.
1817 testString.remove();
1818 breakPositions.removeAllElements();
1821 // Scanner catchall. Something unrecognized appeared on the line.
1823 UnicodeString uToken = tokenMatcher.group(0, status);
1824 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
// Force termination in case the token filled the whole buffer.
1825 token[sizeof(token)-1] = 0;
1826 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1828 // Clean up, in preparation for continuing with the next line.
1829 testString.remove();
1830 breakPositions.removeAllElements();
// Check for ICU errors raised while handling this token.
1833 TEST_ASSERT_SUCCESS(status);
1834 if (U_FAILURE(status)) {
1840 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1843 //--------------------------------------------------------------------------------------------
1845 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1846 // test data files. Do only a simple, forward-only check -
1847 // this test is mostly to check that ICU and the Unicode
1848 // data agree with each other.
//
// testFileName, lineNumber: where the test case came from; used in error messages only.
// testString:               the text to break.
// breakPositions:           expected boundaries, compared in order against the
//                           positions the iterator produces.
// bi:                       the iterator under test; its text is replaced by testString.
1850 //--------------------------------------------------------------------------------------------
1851 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1852 const UnicodeString &testString, // Text data to be broken
1853 UVector32 *breakPositions, // Positions where breaks should be found.
1854 RuleBasedBreakIterator *bi) {
1855 int32_t pos; // Break Position in the test string
1856 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1857 int32_t expectedPos; // Expected break position (index into test string)
1859 bi->setText(testString);
// Walk the iterator's break positions and the expected positions in parallel.
1863 while (pos != BreakIterator::DONE) {
// The iterator produced a break after all expected positions were used up.
1864 if (expectedI >= breakPositions->size()) {
1865 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1866 testFileName, lineNumber, pos);
1869 expectedPos = breakPositions->elementAti(expectedI);
// Break found before the next expected position: an extra break.
1870 if (pos < expectedPos) {
1871 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1872 testFileName, lineNumber, pos);
// Break found after the next expected position: the expected one was missed.
1875 if (pos > expectedPos) {
1876 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1877 testFileName, lineNumber, expectedPos);
// The iterator reached DONE while expected positions remain.
1884 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1885 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1886 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1892 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1893 //---------------------------------------------------------------------------------------
1895 // class RBBIMonkeyKind
1897 // Monkey Test for Break Iteration
1898 // Abstract interface class. Concrete derived classes independently
1899 // implement the break rules for different iterator types.
1901 // The Monkey Test itself doesn't know which type of break iterator it is
1902 // testing, but works purely in terms of the interface defined here.
1904 //---------------------------------------------------------------------------------------
1905 class RBBIMonkeyKind {
1907 // Return a UVector of UnicodeSets, representing the character classes used
1908 // for this type of iterator.
1909 virtual UVector *charClasses() = 0;
1911 // Set the test text on which subsequent calls to next() will operate
1912 virtual void setText(const UnicodeString &s) = 0;
1914 // Find the next break position, starting from the prev break position, or from zero.
1915 // Return -1 after reaching end of string.
1916 virtual int32_t next(int32_t i) = 0;
1918 virtual ~RBBIMonkeyKind();
// Error status recorded during construction of a derived class and checked later
// (RBBICharMonkey stores its constructor status here and tests it in next()).
1919 UErrorCode deferredStatus;
// Base-class constructor: starts with no deferred error.
1928 RBBIMonkeyKind::RBBIMonkeyKind() {
1929 deferredStatus = U_ZERO_ERROR;
// Out-of-line definition of the virtual destructor declared in the class.
1932 RBBIMonkeyKind::~RBBIMonkeyKind() {
1936 //----------------------------------------------------------------------------------------
1938 // Random Numbers. Similar to standard lib rand() and srand()
1939 // Not using library to
1940 // 1. Get same results on all platforms.
1941 // 2. Get access to current seed, to more easily reproduce failures.
1943 //---------------------------------------------------------------------------------------
// Current generator state. Setting it to a known value reproduces a sequence.
1944 static uint32_t m_seed = 1;
// Advance the generator one step and return the next pseudo-random value.
// The state update is m_seed * 1103515245 + 12345 in uint32_t arithmetic (wrapping
// modulo 2^32); the result is (m_seed / 65536) % 32768, i.e. a value in 0..32767
// taken from the upper bits of the state.
1946 static uint32_t m_rand()
1948 m_seed = m_seed * 1103515245 + 12345;
1949 return (uint32_t)(m_seed/65536) % 32768;
1954 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
// The string is UnicodeSet pattern text made of escaped code points and ranges.
// The RBBICharMonkey constructor builds fExtendedPictSet from it.
1956 static const char *gExtended_Pict = "["
1957 "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
1958 "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
1959 "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
1960 "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
1961 "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
1962 "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
1963 "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
1964 "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
1965 "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
1966 "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
1967 "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
1968 "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
1969 "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
1970 "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
1971 "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
1972 "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
1973 "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
1974 "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
1975 "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
1978 //------------------------------------------------------------------------------------------
1980 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1981 // of RBBIMonkeyKind.
1983 //------------------------------------------------------------------------------------------
1984 class RBBICharMonkey: public RBBIMonkeyKind {
// Overrides of the RBBIMonkeyKind interface.
1987 virtual ~RBBICharMonkey();
1988 virtual UVector *charClasses();
1989 virtual void setText(const UnicodeString &s);
1990 virtual int32_t next(int32_t i);
// Character classes used by next(). Each set is allocated in the constructor, most of
// them from a Grapheme_Cluster_Break property pattern.
1994 UnicodeSet *fCRLFSet;
1995 UnicodeSet *fControlSet;
1996 UnicodeSet *fExtendSet;
1997 UnicodeSet *fZWJSet;
1998 UnicodeSet *fRegionalIndicatorSet;
1999 UnicodeSet *fPrependSet;
2000 UnicodeSet *fSpacingSet;
2005 UnicodeSet *fLVTSet;
// Union of the L, V, T, LV and LVT sets.
2006 UnicodeSet *fHangulSet;
2007 UnicodeSet *fEmojiBaseSet;
2008 UnicodeSet *fEmojiModifierSet;
// Built from the gExtended_Pict pattern text.
2009 UnicodeSet *fExtendedPictSet;
2010 UnicodeSet *fEBGSet;
2011 UnicodeSet *fEmojiNRKSet;
// All code points, U+0000..U+10FFFF.
2012 UnicodeSet *fAnySet;
// The text being scanned by next(); a pointer to a string owned elsewhere
// (presumably set by setText() - confirm).
2014 const UnicodeString *fText;
// Constructor: builds every character-class set used by the grapheme cluster rules and
// registers the classes returned by charClasses() in fSets. A single status variable is
// threaded through all the ICU calls and checked once at the end; a failure is stored in
// deferredStatus rather than reported here.
2018 RBBICharMonkey::RBBICharMonkey() {
2019 UErrorCode status = U_ZERO_ERROR;
// Sets defined directly by Grapheme_Cluster_Break property values (plus CR/LF).
2023 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2024 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2025 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2026 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2027 fRegionalIndicatorSet =
2028 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2029 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2030 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
// Hangul syllable component classes.
2031 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2032 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2033 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2034 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2035 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
// fHangulSet is the union of the five Hangul classes; it is the one registered in fSets.
2036 fHangulSet = new UnicodeSet();
2037 fHangulSet->addAll(*fLSet);
2038 fHangulSet->addAll(*fVSet);
2039 fHangulSet->addAll(*fTSet);
2040 fHangulSet->addAll(*fLVSet);
2041 fHangulSet->addAll(*fLVTSet);
// Emoji-related classes. fEmojiBaseSet is the EB property plus explicitly listed code points.
2043 fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2044 fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2045 fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2046 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
// Emoji minus regional indicators, '*', '#', the digits 0-9 and a few listed symbols.
2047 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2048 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2049 fAnySet = new UnicodeSet(0, 0x10ffff);
// Register the character classes. The individual L/V/T/LV/LVT sets are not added,
// only their union fHangulSet.
2051 fSets = new UVector(status);
2052 fSets->addElement(fCRLFSet, status);
2053 fSets->addElement(fControlSet, status);
2054 fSets->addElement(fExtendSet, status);
2055 fSets->addElement(fRegionalIndicatorSet, status);
// The Prepend class is registered only if it has members.
2056 if (!fPrependSet->isEmpty()) {
2057 fSets->addElement(fPrependSet, status);
2059 fSets->addElement(fSpacingSet, status);
2060 fSets->addElement(fHangulSet, status);
2061 fSets->addElement(fAnySet, status);
2062 fSets->addElement(fEmojiBaseSet, status);
2063 fSets->addElement(fEmojiModifierSet, status);
2064 fSets->addElement(fZWJSet, status);
2065 fSets->addElement(fExtendedPictSet, status);
2066 fSets->addElement(fEBGSet, status);
2067 fSets->addElement(fEmojiNRKSet,status);
// Remember any failure so that next() can check it.
2068 if (U_FAILURE(status)) {
2069 deferredStatus = status;
// Set the text on which subsequent next() calls operate.
// (Presumably stores the address of s in fText, which next() reads, so s would have to
// outlive its use by next() - confirm against the body.)
2074 void RBBICharMonkey::setText(const UnicodeString &s) {
// Reference implementation of grapheme cluster boundary detection for the monkey test.
// Starting from prevPos, a window of code points (c0..c3 at indices p0..p3) is slid
// forward through fText and the grapheme cluster break rules (GB3 .. GB999, as labelled
// in the comments below) are tested in order at the candidate position before p2.
// prevPos: the previous break position (or zero).
// Per the RBBIMonkeyKind interface comment, the result is the next break position, or
// -1 once the end of the string has been reached.
2080 int32_t RBBICharMonkey::next(int32_t prevPos) {
2081 int p0, p1, p2, p3; // Indices of the significant code points around the
2082 // break position being tested. The candidate break
2083 // location is before p2.
2087 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2088 UChar32 cBase; // for (X Extend*) patterns, the X character.
// Checks the status saved by the constructor.
2090 if (U_FAILURE(deferredStatus)) {
2094 // Previous break at end of string. return DONE.
2095 if (prevPos >= fText->length()) {
// Prime the window: all four positions start at prevPos; only c3 holds a real character.
2098 p0 = p1 = p2 = p3 = prevPos;
2099 c3 = fText->char32At(prevPos);
2100 c0 = c1 = c2 = cBase = 0;
2101 (void)p0; // suppress set but not used warning.
2104 // Loop runs once per "significant" character position in the input text.
2106 // Move all of the positions forward in the input string.
2111 // Advance p3 by one codepoint
2112 p3 = fText->moveIndex32(p3, 1);
2113 c3 = fText->char32At(p3);
2116 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2119 if (p2 == fText->length()) {
2120 // Reached end of string. Always a break position.
2125 // No Extend or Format characters may appear between the CR and LF,
2126 // which requires the additional check for p2 immediately following p1.
2128 if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2132 // Rule (GB4). ( Control | CR | LF ) <break>
2133 if (fControlSet->contains(c1) ||
2139 // Rule (GB5) <break> ( Control | CR | LF )
2141 if (fControlSet->contains(c2) ||
2148 // Rule (GB6) L x ( L | V | LV | LVT )
2149 if (fLSet->contains(c1) &&
2150 (fLSet->contains(c2) ||
2151 fVSet->contains(c2) ||
2152 fLVSet->contains(c2) ||
2153 fLVTSet->contains(c2))) {
2157 // Rule (GB7) ( LV | V ) x ( V | T )
2158 if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2159 (fVSet->contains(c2) || fTSet->contains(c2))) {
2163 // Rule (GB8) ( LVT | T) x T
2164 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2165 fTSet->contains(c2)) {
2169 // Rule (GB9) x (Extend | ZWJ)
// When c1 is not itself an Extend character it is the base of the sequence
// (presumably recorded in cBase here for the GB10 check - confirm).
2170 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
2171 if (!fExtendSet->contains(c1)) {
2177 // Rule (GB9a) x SpacingMark
2178 if (fSpacingSet->contains(c2)) {
2182 // Rule (GB9b) Prepend x
2183 if (fPrependSet->contains(c1)) {
2187 // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier
// First form: the base is immediately followed by the modifier.
2188 if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
// Second form: base (cBase), then Extend character(s) ending at c1, then the modifier.
2191 if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
2192 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2196 // Rule (GB11) (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
2197 if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
2198 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2202 // Rule (GB12-13) Regional_Indicator x Regional_Indicator
2203 // Note: The first if condition is a little tricky. We only need to force
2204 // a break if there are three or more contiguous RIs. If there are
2205 // only two, a break following will occur via other rules, and will include
2206 // any trailing extend characters, which is needed behavior.
2207 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2208 && fRegionalIndicatorSet->contains(c2)) {
2211 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2215 // Rule (GB999) Any <break> Any
// Returns the character-class vector for the grapheme-cluster monkey test.
// NOTE(review): only the signature line is visible in this listing; the body
// (presumably a plain return of the fSets member) is not shown -- confirm.
2225 UVector *RBBICharMonkey::charClasses() {
// Destructor: deletes the heap-allocated member sets of the grapheme-cluster monkey.
// NOTE(review): only some of the delete statements are visible in this listing.
2230 RBBICharMonkey::~RBBICharMonkey() {
2235 delete fRegionalIndicatorSet;
2245 delete fEmojiBaseSet;
2246 delete fEmojiModifierSet;
2248 delete fExtendedPictSet;
2250 delete fEmojiNRKSet;
2253 //------------------------------------------------------------------------------------------
2255 // class RBBIWordMonkey Word Break specific implementation
2256 // of RBBIMonkeyKind.
2258 //------------------------------------------------------------------------------------------
// Word-break flavor of the monkey-test reference implementation.
// Holds one UnicodeSet per character class consulted by the word-break rules
// in next(); the sets are created in the constructor.
// NOTE(review): this listing omits some lines of the declaration (the
// constructor uses members such as fSets, fCRSet and fLFSet that are not
// declared in the visible lines), so the member list below is partial.
2259 class RBBIWordMonkey: public RBBIMonkeyKind {
2262 virtual ~RBBIWordMonkey();
2263 virtual UVector *charClasses();
2264 virtual void setText(const UnicodeString &s);
// Takes the previous break position; see RBBIWordMonkey::next().
2265 virtual int32_t next(int32_t i);
// Character classes, one per Word_Break property value unless noted otherwise.
2271 UnicodeSet *fNewlineSet;
2272 UnicodeSet *fRegionalIndicatorSet;
2273 UnicodeSet *fKatakanaSet;
2274 UnicodeSet *fHebrew_LetterSet;
// ALetter, minus the dictionary characters (removed in the constructor).
2275 UnicodeSet *fALetterSet;
2276 UnicodeSet *fSingle_QuoteSet;
2277 UnicodeSet *fDouble_QuoteSet;
2278 UnicodeSet *fMidNumLetSet;
2279 UnicodeSet *fMidLetterSet;
2280 UnicodeSet *fMidNumSet;
2281 UnicodeSet *fNumericSet;
2282 UnicodeSet *fFormatSet;
// Complement of the named classes; computed in the constructor.
2283 UnicodeSet *fOtherSet;
2284 UnicodeSet *fExtendSet;
2285 UnicodeSet *fExtendNumLetSet;
// Hangul syllables, Han, Hiragana, Katakana and Line_Break=Complex_Context;
// these characters are kept out of the test data.
2286 UnicodeSet *fDictionarySet;
2287 UnicodeSet *fEBaseSet;
2288 UnicodeSet *fEBGSet;
2289 UnicodeSet *fEModifierSet;
2290 UnicodeSet *fZWJSet;
// Built from the gExtended_Pict pattern.
2291 UnicodeSet *fExtendedPictSet;
// \p{Emoji} minus Regional_Indicator and a short list of excluded characters.
2292 UnicodeSet *fEmojiNRKSet;
// Text being scanned; read by next().
2294 const UnicodeString *fText;
// Constructor: builds every character-class set used by the word-break monkey
// test from UnicodeSet patterns, derives the dictionary and "other" sets, and
// registers the classes in fSets. A failing UErrorCode is recorded in
// deferredStatus rather than being reported here.
// NOTE(review): braces and short statements are missing from this listing.
2298 RBBIWordMonkey::RBBIWordMonkey()
2300 UErrorCode status = U_ZERO_ERROR;
2302 fSets = new UVector(status);
// One set per Word_Break property value.
2304 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2305 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2306 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2307 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2308 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2309 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2310 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2311 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2312 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2313 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2314 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2315 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2316 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2317 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2318 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2319 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
// Emoji-related classes. E_Base is Word_Break=EB plus a few explicitly listed
// code points.
2321 fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2322 "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2323 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"), status);
2324 fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"), status);
2325 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"), status);
2326 fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
// Emoji, excluding regional indicators, '*', '#', the digits 0-9 and a few symbols.
2327 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2328 "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
// Dictionary characters: U+AC00..U+D7A3, Han, Hiragana, plus all Katakana and
// everything with Line_Break=Complex_Context.
2330 fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
2331 fDictionarySet->addAll(*fKatakanaSet);
2332 fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
// Dictionary characters are not treated as ALetter by this test.
2334 fALetterSet->removeAll(*fDictionarySet);
2336 fOtherSet = new UnicodeSet();
// Bail-out check after set construction: remember the failure for later.
2337 if(U_FAILURE(status)) {
2338 deferredStatus = status;
// "Other" starts as the complement of the empty set (i.e. everything) and has
// each named class removed from it.
// NOTE(review): fMidNumLetSet is not removed in the lines below, although
// fMidLetterSet and fMidNumSet are -- confirm whether that is intentional.
2342 fOtherSet->complement();
2343 fOtherSet->removeAll(*fCRSet);
2344 fOtherSet->removeAll(*fLFSet);
2345 fOtherSet->removeAll(*fNewlineSet);
2346 fOtherSet->removeAll(*fKatakanaSet);
2347 fOtherSet->removeAll(*fHebrew_LetterSet);
2348 fOtherSet->removeAll(*fALetterSet);
2349 fOtherSet->removeAll(*fSingle_QuoteSet);
2350 fOtherSet->removeAll(*fDouble_QuoteSet);
2351 fOtherSet->removeAll(*fMidLetterSet);
2352 fOtherSet->removeAll(*fMidNumSet);
2353 fOtherSet->removeAll(*fNumericSet);
2354 fOtherSet->removeAll(*fExtendNumLetSet);
2355 fOtherSet->removeAll(*fFormatSet);
2356 fOtherSet->removeAll(*fExtendSet);
2357 fOtherSet->removeAll(*fRegionalIndicatorSet);
2358 fOtherSet->removeAll(*fEBaseSet);
2359 fOtherSet->removeAll(*fEBGSet);
2360 fOtherSet->removeAll(*fEModifierSet);
2361 fOtherSet->removeAll(*fZWJSet);
2362 fOtherSet->removeAll(*fExtendedPictSet);
2363 fOtherSet->removeAll(*fEmojiNRKSet);
2365 // Inhibit dictionary characters from being tested at all.
2366 fOtherSet->removeAll(*fDictionarySet);
// Register the classes in fSets. fDictionarySet is not registered, and
// fKatakanaSet is deliberately left out (see the commented-out line below).
2368 fSets->addElement(fCRSet, status);
2369 fSets->addElement(fLFSet, status);
2370 fSets->addElement(fNewlineSet, status);
2371 fSets->addElement(fRegionalIndicatorSet, status);
2372 fSets->addElement(fHebrew_LetterSet, status);
2373 fSets->addElement(fALetterSet, status);
2374 fSets->addElement(fSingle_QuoteSet, status);
2375 fSets->addElement(fDouble_QuoteSet, status);
2376 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
2377 // from the test data. They are all in the dictionary set,
2378 // which this (old, to be retired) monkey test cannot handle.
2379 fSets->addElement(fMidLetterSet, status);
2380 fSets->addElement(fMidNumLetSet, status);
2381 fSets->addElement(fMidNumSet, status);
2382 fSets->addElement(fNumericSet, status);
2383 fSets->addElement(fFormatSet, status);
2384 fSets->addElement(fExtendSet, status);
2385 fSets->addElement(fOtherSet, status);
2386 fSets->addElement(fExtendNumLetSet, status);
2388 fSets->addElement(fEBaseSet, status);
2389 fSets->addElement(fEBGSet, status);
2390 fSets->addElement(fEModifierSet, status);
2391 fSets->addElement(fZWJSet, status);
2392 fSets->addElement(fExtendedPictSet, status);
2393 fSets->addElement(fEmojiNRKSet, status);
// Record any failure from the addElement calls as well.
2395 if (U_FAILURE(status)) {
2396 deferredStatus = status;
// Sets the text that next() will scan.
// NOTE(review): only the signature line is visible in this listing; the body
// (presumably storing &s in fText) is not shown -- confirm.
2400 void RBBIWordMonkey::setText(const UnicodeString &s) {
// Reference implementation of the word-break rules.
// Starting from the previous break position prevPos, slides a window of four
// "significant" code points (c0..c3 at indices p0..p3) over fText and tests
// the rule conditions below against it; the candidate break is before p2.
// NOTE(review): the loop header, the statements executed inside each rule's
// if-body (break/continue/return) and the closing braces are missing from
// this listing, so the comments describe only the visible conditions.
2405 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2406 int p0, p1, p2, p3; // Indices of the significant code points around the
2407 // break position being tested. The candidate break
2408 // location is before p2.
2412 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
// A failure recorded by the constructor short-circuits the scan.
2414 if (U_FAILURE(deferredStatus)) {
2418 // Prev break at end of string. return DONE.
2419 if (prevPos >= fText->length()) {
// Prime the window: every index starts at prevPos, c3 is the code point there.
2422 p0 = p1 = p2 = p3 = prevPos;
2423 c3 = fText->char32At(prevPos);
2425 (void)p0; // Suppress set but not used warning.
2427 // Loop runs once per "significant" character position in the input text.
2429 // Move all of the positions forward in the input string.
2434 // Advance p3 by X(Extend | Format)* Rule 4
2435 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2437 p3 = fText->moveIndex32(p3, 1);
2438 c3 = fText->char32At(p3);
2439 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
// Keep advancing while c3 is Format, Extend or ZWJ.
2443 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2447 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2450 if (p2 == fText->length()) {
2451 // Reached end of string. Always a break position.
2456 // No Extend or Format characters may appear between the CR and LF,
2457 // which requires the additional check for p2 immediately following p1.
// NOTE(review): the condition below contains no explicit p1/p2 adjacency
// check, unlike what the comment above describes -- confirm this is intended.
2459 if (c1==0x0D && c2==0x0A) {
2463 // Rule (3a) Break before and after newlines (including CR and LF)
2465 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2468 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2472 // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK).
2473 // Not ignoring extend chars, so peek into input text to
2474 // get the potential ZWJ, the character immediately preceding c2.
2475 // Sloppy UChar32 indexing: p2-1 may reference trail half
2476 // but char32At will get the full code point.
2477 if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2481 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2482 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2483 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2487 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2489 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2490 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2491 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2495 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter)
2496 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2497 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2498 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2502 // Rule (7a) Hebrew_Letter x Single_Quote
2503 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2507 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter
2508 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2512 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter
2513 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2517 // Rule (8) Numeric x Numeric
2518 if (fNumericSet->contains(c1) &&
2519 fNumericSet->contains(c2)) {
2523 // Rule (9) (ALetter | Hebrew_Letter) x Numeric
2524 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2525 fNumericSet->contains(c2)) {
2529 // Rule (10) Numeric x (ALetter | Hebrew_Letter)
2530 if (fNumericSet->contains(c1) &&
2531 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2535 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric
2536 if (fNumericSet->contains(c0) &&
2537 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2538 fNumericSet->contains(c2)) {
2542 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2543 if (fNumericSet->contains(c1) &&
2544 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2545 fNumericSet->contains(c3)) {
2549 // Rule (13) Katakana x Katakana
2550 // Note: matches UAX 29 rules, but doesn't come into play for ICU because
2551 // all Katakana are handled by the dictionary breaker.
2552 if (fKatakanaSet->contains(c1) &&
2553 fKatakanaSet->contains(c2)) {
2557 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2558 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2559 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2560 fExtendNumLetSet->contains(c2)) {
2564 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2565 if (fExtendNumLetSet->contains(c1) &&
2566 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2567 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) {
2571 // WB 14 (E_Base | EBG) x E_Modifier
2572 if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
2576 // Rule 15 - 17 Group pairs of Regional Indicators.
// Two conditions: RI RI already behind the candidate position (c0, c1), then
// RI on both sides of it (c1, c2).
2577 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2580 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2584 // Rule 999. Break found here.
// Returns the character-class vector for the word-break monkey test.
// NOTE(review): only the signature line is visible in this listing; the body
// (presumably a plain return of fSets) is not shown -- confirm.
2593 UVector *RBBIWordMonkey::charClasses() {
// Destructor: deletes the sets allocated by the constructor.
// NOTE(review): this listing shows only part of the delete statements, so it
// cannot be checked here that every allocated set is released.
2598 RBBIWordMonkey::~RBBIWordMonkey() {
2603 delete fKatakanaSet;
2604 delete fHebrew_LetterSet;
2606 delete fSingle_QuoteSet;
2607 delete fDouble_QuoteSet;
2608 delete fMidNumLetSet;
2609 delete fMidLetterSet;
2614 delete fExtendNumLetSet;
2615 delete fRegionalIndicatorSet;
2616 delete fDictionarySet;
2620 delete fEModifierSet;
2622 delete fExtendedPictSet;
2623 delete fEmojiNRKSet;
2629 //------------------------------------------------------------------------------------------
2631 // class RBBISentMonkey Sentence Break specific implementation
2632 // of RBBIMonkeyKind.
2634 //------------------------------------------------------------------------------------------
// Sentence-break flavor of the monkey-test reference implementation.
// Holds one UnicodeSet per Sentence_Break class used by the rules in next().
// NOTE(review): this listing omits some lines of the declaration (the
// constructor uses fSets and fSpSet, which are not declared in the visible
// lines), so the member list below is partial.
2635 class RBBISentMonkey: public RBBIMonkeyKind {
2638 virtual ~RBBISentMonkey();
2639 virtual UVector *charClasses();
2640 virtual void setText(const UnicodeString &s);
// Takes the previous break position; see RBBISentMonkey::next().
2641 virtual int32_t next(int32_t i);
// Helpers that step to the neighbouring "significant" code point, skipping
// Extend and Format characters.
2643 int moveBack(int posFrom);
2644 int moveForward(int posFrom);
// Range-checked fetch of the code point at pos.
2645 UChar32 cAt(int pos);
// Sentence_Break=Sep plus U+000A and U+000D (see the constructor).
2649 UnicodeSet *fSepSet;
2650 UnicodeSet *fFormatSet;
2652 UnicodeSet *fLowerSet;
2653 UnicodeSet *fUpperSet;
2654 UnicodeSet *fOLetterSet;
2655 UnicodeSet *fNumericSet;
2656 UnicodeSet *fATermSet;
2657 UnicodeSet *fSContinueSet;
2658 UnicodeSet *fSTermSet;
2659 UnicodeSet *fCloseSet;
// Complement of the named classes; computed in the constructor.
2660 UnicodeSet *fOtherSet;
2661 UnicodeSet *fExtendSet;
// Text being scanned; read by next() and the helpers.
2663 const UnicodeString *fText;
// Constructor: builds the Sentence_Break character-class sets, derives the
// "other" set as the complement of all of them, and registers every class in
// fSets. A failing UErrorCode is recorded in deferredStatus.
// NOTE(review): braces and short statements are missing from this listing.
2667 RBBISentMonkey::RBBISentMonkey()
2669 UErrorCode status = U_ZERO_ERROR;
2671 fSets = new UVector(status);
2673 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2674 // set and made into character classes of their own. For the monkey impl,
2675 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2676 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2677 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2678 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2679 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2680 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2681 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2682 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2683 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2684 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2685 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2686 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2687 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2688 fOtherSet = new UnicodeSet();
// Bail-out check after set construction: remember the failure for later.
2690 if(U_FAILURE(status)) {
2691 deferredStatus = status;
// "Other" starts as everything and has each named class removed from it.
2695 fOtherSet->complement();
2696 fOtherSet->removeAll(*fSepSet);
2697 fOtherSet->removeAll(*fFormatSet);
2698 fOtherSet->removeAll(*fSpSet);
2699 fOtherSet->removeAll(*fLowerSet);
2700 fOtherSet->removeAll(*fUpperSet);
2701 fOtherSet->removeAll(*fOLetterSet);
2702 fOtherSet->removeAll(*fNumericSet);
2703 fOtherSet->removeAll(*fATermSet);
2704 fOtherSet->removeAll(*fSContinueSet);
2705 fOtherSet->removeAll(*fSTermSet);
2706 fOtherSet->removeAll(*fCloseSet);
2707 fOtherSet->removeAll(*fExtendSet);
// Register all thirteen classes in fSets.
2709 fSets->addElement(fSepSet, status);
2710 fSets->addElement(fFormatSet, status);
2711 fSets->addElement(fSpSet, status);
2712 fSets->addElement(fLowerSet, status);
2713 fSets->addElement(fUpperSet, status);
2714 fSets->addElement(fOLetterSet, status);
2715 fSets->addElement(fNumericSet, status);
2716 fSets->addElement(fATermSet, status);
2717 fSets->addElement(fSContinueSet, status);
2718 fSets->addElement(fSTermSet, status);
2719 fSets->addElement(fCloseSet, status);
2720 fSets->addElement(fOtherSet, status);
2721 fSets->addElement(fExtendSet, status);
// Record any failure from the addElement calls as well.
2723 if (U_FAILURE(status)) {
2724 deferredStatus = status;
// Sets the text that next() will scan.
// NOTE(review): only the signature line is visible in this listing; the body
// (presumably storing &s in fText) is not shown -- confirm.
2730 void RBBISentMonkey::setText(const UnicodeString &s) {
// Returns the character-class vector for the sentence-break monkey test.
// NOTE(review): only the signature line is visible in this listing; the body
// (presumably a plain return of fSets) is not shown -- confirm.
2734 UVector *RBBISentMonkey::charClasses() {
2739 // moveBack() Find the "significant" code point preceding the index i.
2740 // Skips over ($Extend | $Format)* .
// NOTE(review): the local declarations, the do-loop header and the return
// statement are missing from this listing.
2742 int RBBISentMonkey::moveBack(int i) {
// Step back one code point and fetch it.
2749 j = fText->moveIndex32(j, -1);
2750 c = fText->char32At(j);
// Keep stepping while not yet at index 0 and the code point is Format or Extend.
2752 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
// moveForward() Find the "significant" code point following the index i,
// stepping over Format and Extend characters.
// An index at or past the end of the text yields the text length.
// NOTE(review): the local declarations, the do-loop header, the fetch of c and
// the final return are missing from this listing.
2758 int RBBISentMonkey::moveForward(int i) {
2759 if (i>=fText->length()) {
2760 return fText->length();
// Step forward one code point.
2765 j = fText->moveIndex32(j, 1);
// Keep stepping while the code point is Format or Extend.
2768 while (fFormatSet->contains(c) || fExtendSet->contains(c));
// cAt() Range-checked access to the text: returns the code point at pos when
// pos lies inside the text.
// NOTE(review): the value returned for an out-of-range pos is on a line
// missing from this listing; next() tests a fetched code point against -1,
// so it is presumably -1 -- confirm.
2772 UChar32 RBBISentMonkey::cAt(int pos) {
2773 if (pos<0 || pos>=fText->length()) {
2776 return fText->char32At(pos);
// Reference implementation of the sentence-break rules.
// Starting from the previous break position prevPos, slides a window of four
// "significant" code points (c0..c3 at indices p0..p3) over fText and tests
// the rule conditions below; the candidate break is before p2. Several rules
// walk backwards or forwards from the window with moveBack()/moveForward()
// and fetch code points with cAt().
// NOTE(review): the loop header, local declarations (p8, p9, p10, p11, c),
// the statements inside each rule's if-body and the closing braces are
// missing from this listing, so the comments describe only the visible
// conditions.
2780 int32_t RBBISentMonkey::next(int32_t prevPos) {
2781 int p0, p1, p2, p3; // Indices of the significant code points around the
2782 // break position being tested. The candidate break
2783 // location is before p2.
2787 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
// A failure recorded by the constructor short-circuits the scan.
2790 if (U_FAILURE(deferredStatus)) {
2794 // Prev break at end of string. return DONE.
2795 if (prevPos >= fText->length()) {
// Prime the window: every index starts at prevPos, c3 is the code point there.
2798 p0 = p1 = p2 = p3 = prevPos;
2799 c3 = fText->char32At(prevPos);
2801 (void)p0; // Suppress set but not used warning.
2803 // Loop runs once per "significant" character position in the input text.
2805 // Move all of the positions forward in the input string.
2810 // Advance p3 by X(Extend | Format)* Rule 4
2811 p3 = moveForward(p3);
// CR directly followed by LF; p2==p1+1 ensures nothing lies between them.
2815 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2819 // Rule (4). Sep <break>
2820 if (fSepSet->contains(c1)) {
2821 p2 = p1+1; // Separators don't combine with Extend or Format.
2825 if (p2 >= fText->length()) {
2826 // Reached end of string. Always a break position.
2830 if (p2 == prevPos) {
2831 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2835 // Rule (6). ATerm x Numeric
2836 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2840 // Rule (7). (Upper | Lower) ATerm x Upper
2841 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2842 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2846 // Rule (8) ATerm Close* Sp* x (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2847 // Note: STerm | ATerm are added to the negated part of the expression by a
2848 // note to the Unicode 5.0 documents.
// Look back over Sp*, then Close*, for an ATerm.
2850 while (fSpSet->contains(cAt(p8))) {
2853 while (fCloseSet->contains(cAt(p8))) {
2856 if (fATermSet->contains(cAt(p8))) {
// Forward scan: the condition below matches end of text (-1) or any character
// from the negated group of the rule.
2860 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2861 fLowerSet->contains(c) || fSepSet->contains(c) ||
2862 fATermSet->contains(c) || fSTermSet->contains(c)) {
2865 p8 = moveForward(p8);
2867 if (fLowerSet->contains(cAt(p8))) {
2872 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2873 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2875 while (fSpSet->contains(cAt(p8))) {
2878 while (fCloseSet->contains(cAt(p8))) {
2882 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2887 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2889 while (fCloseSet->contains(cAt(p9))) {
2893 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2894 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2899 // Rule (10) (STerm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2901 while (fSpSet->contains(cAt(p10))) {
2902 p10 = moveBack(p10);
2904 while (fCloseSet->contains(cAt(p10))) {
2905 p10 = moveBack(p10);
2907 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2908 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2913 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
// Look back over an optional Sep, then Sp*, then Close*, for STerm or ATerm.
2915 if (fSepSet->contains(cAt(p11))) {
2916 p11 = moveBack(p11);
2918 while (fSpSet->contains(cAt(p11))) {
2919 p11 = moveBack(p11);
2921 while (fCloseSet->contains(cAt(p11))) {
2922 p11 = moveBack(p11);
2924 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2928 // Rule (12) Any x Any
// Destructor: deletes the sets allocated by the constructor.
// NOTE(review): only one of the delete statements is visible in this listing.
2935 RBBISentMonkey::~RBBISentMonkey() {
2945 delete fSContinueSet;
2954 //-------------------------------------------------------------------------------------------
2958 //-------------------------------------------------------------------------------------------
// Line-break flavor of the monkey-test reference implementation.
// NOTE(review): most of the member declarations (the per-class UnicodeSet
// pointers such as fBK, fCR, fLF, ... used by the constructor) are missing
// from this listing.
2960 class RBBILineMonkey: public RBBIMonkeyKind {
2963 virtual ~RBBILineMonkey();
2964 virtual UVector *charClasses();
2965 virtual void setText(const UnicodeString &s);
2966 virtual int32_t next(int32_t i);
// Applies the LB 9 / LB 10 combining-sequence adjustment around position pos;
// see the definition for the in/out parameters.
2967 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
// Built from the gExtended_Pict pattern.
3013 UnicodeSet *fExtendedPict;
// \p{Emoji} minus Line_Break=RI and a short list of excluded characters.
3014 UnicodeSet *fEmojiNRK;
// Character break iterator; created in the constructor, re-targeted in setText().
3016 BreakIterator *fCharBI;
3017 const UnicodeString *fText;
// Regex matcher for number sequences; created in the constructor, reset in setText().
3018 RegexMatcher *fNumberMatcher;
// Constructor: builds one UnicodeSet per Line_Break class, folds a few classes
// into others (XX, AI, SG into AL; CJ into NS; ZWJ into CM), registers the
// classes in fSets, compiles the number-sequence regex and creates a character
// break iterator. A failing UErrorCode is recorded in deferredStatus.
// NOTE(review): parts of the initializer list, braces and short statements are
// missing from this listing.
3021 RBBILineMonkey::RBBILineMonkey() :
3027 fNumberMatcher(NULL)
3030 if (U_FAILURE(deferredStatus)) {
3034 UErrorCode status = U_ZERO_ERROR;
3036 fSets = new UVector(status);
// One set per Line_Break property value.
3038 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3039 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3040 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3041 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3042 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3043 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3044 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3045 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3046 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3047 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3048 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3049 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3050 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3051 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3052 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3053 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3054 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3055 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3056 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3057 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3058 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3059 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3060 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3061 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3062 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3063 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3064 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3065 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3066 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3067 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3068 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3069 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3070 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3071 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3072 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3073 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3074 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
// SG is given as the explicit surrogate range U+D800..U+DFFF rather than by property.
3075 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3076 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
// EB is Line_break=EB plus a few explicitly listed code points.
3077 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE(
3078 "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
3079 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3080 fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3081 fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3082 fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
// Bail-out check after set construction: remember the failure for later.
3084 if (U_FAILURE(status)) {
3085 deferredStatus = status;
3089 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3090 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3091 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3093 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3094 fCM->addAll(*fZJ); // ZWJ behaves as a CM.
// Register the classes in fSets. fXX and fCJ are not registered separately
// (their characters were merged into fAL and fNS above).
// NOTE(review): fWJ is added twice below (after fNL and again after fID) --
// confirm whether the duplicate entry is intentional.
3096 fSets->addElement(fBK, status);
3097 fSets->addElement(fCR, status);
3098 fSets->addElement(fLF, status);
3099 fSets->addElement(fCM, status);
3100 fSets->addElement(fNL, status);
3101 fSets->addElement(fWJ, status);
3102 fSets->addElement(fZW, status);
3103 fSets->addElement(fGL, status);
3104 fSets->addElement(fCB, status);
3105 fSets->addElement(fSP, status);
3106 fSets->addElement(fB2, status);
3107 fSets->addElement(fBA, status);
3108 fSets->addElement(fBB, status);
3109 fSets->addElement(fHY, status);
3110 fSets->addElement(fH2, status);
3111 fSets->addElement(fH3, status);
3112 fSets->addElement(fCL, status);
3113 fSets->addElement(fCP, status);
3114 fSets->addElement(fEX, status);
3115 fSets->addElement(fIN, status);
3116 fSets->addElement(fJL, status);
3117 fSets->addElement(fJT, status);
3118 fSets->addElement(fJV, status);
3119 fSets->addElement(fNS, status);
3120 fSets->addElement(fOP, status);
3121 fSets->addElement(fQU, status);
3122 fSets->addElement(fIS, status);
3123 fSets->addElement(fNU, status);
3124 fSets->addElement(fPO, status);
3125 fSets->addElement(fPR, status);
3126 fSets->addElement(fSY, status);
3127 fSets->addElement(fAI, status);
3128 fSets->addElement(fAL, status);
3129 fSets->addElement(fHL, status);
3130 fSets->addElement(fID, status);
3131 fSets->addElement(fWJ, status);
3132 fSets->addElement(fRI, status);
3133 fSets->addElement(fSG, status);
3134 fSets->addElement(fEB, status);
3135 fSets->addElement(fEM, status);
3136 fSets->addElement(fZJ, status);
3137 fSets->addElement(fExtendedPict, status);
3138 fSets->addElement(fEmojiNRK, status);
// Number-sequence pattern: optional (PR|PO), optional (OP|HY), one NU, any run
// of (NU|IS|SY), optional (CL|CP), optional (PR|PO); every element may be
// followed by any number of CM or U+200D characters.
3142 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3143 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3144 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3145 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3146 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3147 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3149 fNumberMatcher = new RegexMatcher(
3150 UnicodeString(rules, -1, US_INV), 0, status);
// Character (grapheme cluster) break iterator for the English locale.
3152 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
// Record any failure from the steps above.
3154 if (U_FAILURE(status)) {
3155 deferredStatus = status;
// Sets the text to be scanned and points both the character break iterator
// and the number matcher at it.
// NOTE(review): a line between the signature and the calls below is missing
// from this listing (presumably the assignment of fText) -- confirm.
3160 void RBBILineMonkey::setText(const UnicodeString &s) {
3162 fCharBI->setText(s);
3163 fNumberMatcher->reset(s);
3168 // Line Break TR rules 9 and 10 implementation.
3169 // This deals with combining marks and other sequences
3170 // that must be treated as if they were something other than what they actually are.
3172 // This is factored out into a separate function because it must be applied twice for
3173 // each potential break, once to the chars before the position being checked, then
3174 // again to the text following the possible break.
//
// Parameters (as used by the visible code):
//   pos       index of the character being adjusted.
//   posChar   in/out: the character at pos; replaced by 'A' (0x41) when it is
//             itself a CM, per LB 10.
//   nextPos   in/out: index of the following character; read on entry.
//   nextChar  out: the character at the (possibly advanced) next position.
// NOTE(review): the loop header, the early return for an invalid pos and the
// write-back to *nextPos are missing from this listing.
3176 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3178 // Invalid initial position. Happens during the warmup iteration of the
3179 // main loop in next().
3183 int32_t nPos = *nextPos;
3185 // LB 9 Keep combining sequences together.
3186 // advance over any CM class chars. Note that Line Break CM is different
3187 // from the normal Grapheme Extend property.
// Combining marks do not attach to SP, BK, CR, LF, NL or ZW.
3188 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3189 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3191 *nextChar = fText->char32At(nPos);
3192 if (!fCM->contains(*nextChar)) {
3195 nPos = fText->moveIndex32(nPos, 1);
3200 // LB 9 Treat X CM* as if it were x.
3201 // No explicit action required.
3203 // LB 10 Treat any remaining combining mark as AL
3204 if (fCM->contains(*posChar)) {
3205 *posChar = 0x41; // thisChar = 'A';
3208 // Push the updated nextPos and nextChar back to our caller.
3209 // This only makes a difference if posChar got bigger by consuming a
3210 // combining sequence.
3212 *nextChar = fText->char32At(nPos);
// Reference ("monkey") implementation of line breaking. The tests in this file
// call next() repeatedly to build the list of expected break positions, which is
// then compared with the results of a real BreakIterator.
//   startPos - index in fText at which to start looking for the next break.
// The rules are tried in the order of the "LB n" comments below, using the
// character-class set members of this object (fBK, fSP, fCM, ...).
// NOTE(review): several lines of this function, including its return statements
// and some initializations (e.g. of tPos and c), are not visible in this excerpt.
// Callers in this file treat -1 as "no more breaks" -- confirm against the
// complete source.
3217 int32_t RBBILineMonkey::next(int32_t startPos) {
3218 UErrorCode status = U_ZERO_ERROR;
3219 int32_t pos; // Index of the char following a potential break position
3220 UChar32 thisChar; // Character at above position "pos"
3222 int32_t prevPos; // Index of the char preceding a potential break position
3223 UChar32 prevChar; // Character at above position. Note that prevChar
3224 // and thisChar may not be adjacent because combining
3225 // characters between them will be ignored.
3227 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3230 int32_t nextPos; // Index of the next character following pos.
3231 // Usually skips over combining marks.
3232 int32_t nextCPPos; // Index of the code point following "pos."
3233 // May point to a combining mark.
3234 int32_t tPos; // temp value.
// Check for a failure recorded earlier in deferredStatus (handling not shown here).
3237 if (U_FAILURE(deferredStatus)) {
// startPos at or beyond the end of the text: nothing is left to scan.
3241 if (startPos >= fText->length()) {
3246 // Initial values for loop. Loop will run the first time without finding breaks,
3247 // while the invalid values shift out and the "this" and
3248 // "prev" positions are filled in with good values.
3249 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3250 thisChar = prevChar = prevCharX2 = 0;
3251 nextPos = nextCPPos = startPos;
3254 // Loop runs once per position in the test text, until a break position
// Slide the context window forward: prev -> prevX2, this -> prev.
3257 prevPosX2 = prevPos;
3258 prevCharX2 = prevChar;
3261 prevChar = thisChar;
// Load the character at the new position and the index of the code point after it.
3264 thisChar = fText->char32At(pos);
3266 nextCPPos = fText->moveIndex32(pos, 1);
3267 nextPos = nextCPPos;
3269 // Rule LB2 - Break at end of text.
3270 if (pos >= fText->length()) {
3274 // Rule LB 9 - adjust for combining sequences.
3275 // We do this one out-of-order because the adjustment does not change anything
3276 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3278 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
// Apply the same adjustment to the character that follows (c at nextPos).
3279 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3280 c = fText->char32At(nextPos);
3281 rule9Adjust(pos, &thisChar, &nextPos, &c);
3283 // If the loop is still warming up - if we haven't shifted the initial
3284 // -1 positions out of prevPos yet - loop back to advance the
3285 // position in the input without any further looking for breaks.
3286 if (prevPos == -1) {
3290 // LB 4 Always break after hard line breaks,
3291 if (fBK->contains(prevChar)) {
3295 // LB 5 Break after CR, LF, NL, but not inside CR LF
3296 if (prevChar == 0x0d && thisChar == 0x0a) {
3299 if (prevChar == 0x0d ||
3305 // LB 6 Don't break before hard line breaks
3306 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3307 fBK->contains(thisChar)) {
3312 // LB 7 Don't break before spaces or zero-width space.
3313 if (fSP->contains(thisChar)) {
3317 if (fZW->contains(thisChar)) {
3321 // LB 8 Break after zero width space
3322 if (fZW->contains(prevChar)) {
3326 // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3327 // The monkey test's way of ignoring combining characters doesn't work
3328 // for this rule. ZJ is also a CM. Need to get the actual character
3329 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3331 int32_t prevIdx = fText->moveIndex32(pos, -1);
3332 UChar32 prevC = fText->char32At(prevIdx);
3333 if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
3338 // LB 9, 10 Already done, at top of loop.
3342 // LB 11 Do not break before or after WORD JOINER and related characters.
3346 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
// NOTE(review): the rule-number comments for the next two tests are not visible
// in this excerpt; presumably LB 12 (after a char in fGL) and LB 12a (before a
// char in fGL unless preceded by fSP/fBA/fHY) of UAX #14 -- confirm.
3352 if (fGL->contains(prevChar)) {
3358 if (!(fSP->contains(prevChar) ||
3359 fBA->contains(prevChar) ||
3360 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3366 // LB 13 Don't break before closings.
3367 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3368 // fall into LB 17 and the more general number regular expression.
3370 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3371 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3372 fEX->contains(thisChar) ||
3373 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3374 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3378 // LB 14 Don't break after OP SP*
3379 // Scan backwards, checking for this sequence.
3380 // The OP char could include combining marks, so we actually check for
3382 // Another Twist: The Rule 67 fixes may have changed a SP CM
3383 // sequence into a ID char, so before scanning back through spaces,
3384 // verify that prevChar is indeed a space. The prevChar variable
3385 // may differ from fText[prevPos]
// Walk tPos back over spaces, then over combining marks, and test what is left for OP.
3387 if (fSP->contains(prevChar)) {
3388 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3389 tPos=fText->moveIndex32(tPos, -1);
3392 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3393 tPos=fText->moveIndex32(tPos, -1);
3395 if (fOP->contains(fText->char32At(tPos))) {
3400 // LB 15 QU SP* x OP
3401 if (fOP->contains(thisChar)) {
3402 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3404 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3405 tPos = fText->moveIndex32(tPos, -1);
3407 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3408 tPos = fText->moveIndex32(tPos, -1);
3410 if (fQU->contains(fText->char32At(tPos))) {
3417 // LB 16 (CL | CP) SP* x NS
3418 // Scan backwards for SP* CM* (CL | CP)
3419 if (fNS->contains(thisChar)) {
3421 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3422 tPos = fText->moveIndex32(tPos, -1);
3424 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3425 tPos = fText->moveIndex32(tPos, -1);
3427 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3433 // LB 17 B2 SP* x B2
3434 if (fB2->contains(thisChar)) {
3435 // Scan backwards, checking for the B2 CM* SP* sequence.
3437 if (fSP->contains(prevChar)) {
3438 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3439 tPos=fText->moveIndex32(tPos, -1);
3442 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3443 tPos=fText->moveIndex32(tPos, -1);
3445 if (fB2->contains(fText->char32At(tPos))) {
3451 // LB 18 break after space
3452 if (fSP->contains(prevChar)) {
// Either side is in fQU. NOTE(review): presumably LB 19; its comment is not visible here.
3459 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3463 // LB 20 Break around a CB
3464 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
// NOTE(review): presumably LB 21 (before BA/HY/NS, after BB); comment not visible here.
3469 if (fBA->contains(thisChar) ||
3470 fHY->contains(thisChar) ||
3471 fNS->contains(thisChar) ||
3472 fBB->contains(prevChar) ) {
// Uses the second-previous character; the prevPosX2 declaration above names
// this wider context as being for LB21a.
3478 if (fHL->contains(prevCharX2) &&
3479 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
// NOTE(review): presumably LB 21b (SY x HL); comment not visible here.
3485 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
// Every alternative below requires thisChar to be in fIN.
// NOTE(review): presumably LB 22; comment not visible here.
3490 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3491 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3492 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3493 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3494 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3495 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3500 // LB 23 (AL | HL) x NU
3502 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
// The mirror image of the test above: NU followed by (AL | HL).
3505 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3509 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3510 // PR x (ID | EB | EM)
3511 // (ID | EB | EM) x PO
3512 if (fPR->contains(prevChar) &&
3513 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3516 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3517 fPO->contains(thisChar)) {
3521 // LB 24 Do not break between prefix and letters or ideographs.
3522 // (PR | PO) x (AL | HL)
3523 // (AL | HL) x (PR | PO)
3524 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3525 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3528 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3529 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
// Try to match a number beginning at prevPos with fNumberMatcher.
// NOTE(review): presumably LB 25 (numbers); its comment is not visible here.
3536 if (fNumberMatcher->lookingAt(prevPos, status)) {
3537 if (U_FAILURE(status)) {
3540 // Matched a number. But could have been just a single digit, which would
3541 // not represent a "no break here" between prevChar and thisChar
3542 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3543 if (numEndIdx > pos) {
3544 // Number match includes at least our two chars being checked
3545 if (numEndIdx > nextPos) {
3546 // Number match includes additional chars. Update pos and nextPos
3547 // so that next loop iteration will continue at the end of the number,
3548 // checking for breaks between last char in number & whatever follows.
3549 pos = nextPos = numEndIdx;
// Step pos backwards until thisChar is no longer a member of fCM.
3551 pos = fText->moveIndex32(pos, -1);
3552 thisChar = fText->char32At(pos);
3553 } while (fCM->contains(thisChar));
3560 // LB 26 Do not break a Korean syllable.
3561 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3562 fJV->contains(thisChar) ||
3563 fH2->contains(thisChar) ||
3564 fH3->contains(thisChar))) {
3568 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3569 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3573 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3574 fJT->contains(thisChar)) {
3578 // LB 27 Treat a Korean Syllable Block the same as ID.
3579 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3580 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3581 fIN->contains(thisChar)) {
3584 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3585 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3586 fPO->contains(thisChar)) {
3589 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3590 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3596 // LB 28 Do not break between alphabetics ("at").
3597 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3601 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3602 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3606 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3609 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3612 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3616 // LB30a RI RI <break> RI
// The three-character test comes first, so a third consecutive RI is handled
// before the plain pair test that follows.
3618 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3621 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3625 // LB30b Emoji Base x Emoji Modifier
3626 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3630 // LB 31 Break everywhere else
// Accessor for the line-break monkey's list of character classes.
// RunMonkey() in this file calls charClasses()->size() and casts each element
// to UnicodeSet* when generating random test text.
// NOTE(review): the body is not visible in this excerpt; presumably it returns
// a member vector that remains owned by this object -- confirm.
3639 UVector *RBBILineMonkey::charClasses() {
// Destructor: deletes objects held through raw pointers by this monkey.
// Only the deletes of fExtendedPict and fNumberMatcher are visible in this
// excerpt; NOTE(review): the remaining members are presumably deleted on the
// lines not shown -- confirm.
3644 RBBILineMonkey::~RBBILineMonkey() {
3689 delete fExtendedPict;
3693 delete fNumberMatcher;
3697 //-------------------------------------------------------------------------------------------
3702 // seed=nnnnn Random number starting seed.
3703 // Setting the seed allows errors to be reproduced.
3704 // loop=nnn Looping count. Controls running time.
3706 // 0 or greater: run length.
3708 // type = char | word | line | sent | title
3711 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3713 //-------------------------------------------------------------------------------------------
// Extract an integer option of the form "name = value" from a parameter string.
//   name       - option name; a regex suffix matching " *= *(-?\d+)" is appended to it.
//   params     - the option string; the matched option text is removed from it
//                (the body assigns the result of replaceFirst("") back to params).
//   defaultVal - initial value of the result.
// NOTE(review): the second parameter appears as "¶ms" in this text, which looks
// like a mis-decoded "&params" (HTML entity "&para;"); the body refers to it as
// "params". Confirm against the original source.
// NOTE(review): the test that guards the conversion and the return statement are
// not visible in this excerpt.
3715 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3716 int32_t val = defaultVal;
3717 name.append(" *= *(-?\\d+)");
3718 UErrorCode status = U_ZERO_ERROR;
3719 RegexMatcher m(name, params, 0, status);
3721 // The param exists. Convert the string to an int.
3722 char valString[100];
// Length of capture group 1 (the digits), clamped so it fits in valString.
3723 int32_t paramLength = m.end(1, status) - m.start(1, status);
3724 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3725 paramLength = (int32_t)(sizeof(valString)-2);
3727 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3728 val = strtol(valString, NULL, 10);
3730 // Delete this parameter from the params string.
3732 params = m.replaceFirst("", status);
3734 U_ASSERT(U_SUCCESS(status));
3739 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Check a break iterator against a list of expected boundaries in four ways:
// forward iteration (first/next), isBoundary(), backward iteration
// (last/previous) and preceding(). Failures are reported through test->errln().
//   test - the test object used for error reporting.
//   ustr - the text under test (used for diagnostics and index arithmetic).
// NOTE(review): the rest of the parameter list is not visible in this excerpt;
// the body uses bi (the iterator), expected (expected boundary indexes) and
// expectedcount (number of entries in expected).
3740 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
// Forward iteration: each boundary returned must equal the next expected value.
3749 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3751 if (count < expectedcount && expected[count] != i) {
3752 test->errln("break forward test failed: expected %d but got %d",
3753 expected[count], i);
3758 if (count != expectedcount) {
3759 printStringBreaks(ustr, expected, expectedcount);
3760 test->errln("break forward test failed: missed %d match",
3761 expectedcount - count);
3764 // testing boundaries
// Each expected[i-1] must be a boundary; positions strictly between two
// consecutive expected boundaries must not be.
3765 for (i = 1; i < expectedcount; i ++) {
3766 int j = expected[i - 1];
3767 if (!bi->isBoundary(j)) {
3768 printStringBreaks(ustr, expected, expectedcount);
3769 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3772 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3773 if (bi->isBoundary(j)) {
3774 printStringBreaks(ustr, expected, expectedcount);
3775 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
// Backward iteration: values from previous() are compared with forward[].
3781 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3783 if (forward[count] != i) {
3784 printStringBreaks(ustr, expected, expectedcount);
3785 test->errln("happy break test previous() failed: expected %d but got %d",
3791 printStringBreaks(ustr, expected, expectedcount);
3792 test->errln("break test previous() failed: missed a match");
3796 // testing preceding
// For every index j after expected[i] (by one code point) up to and including
// expected[i+1], preceding(j) must return expected[i].
3797 for (i = 0; i < expectedcount - 1; i ++) {
3798 // int j = expected[i] + 1;
3799 int j = ustr.moveIndex32(expected[i], 1);
3800 for (; j <= expected[i + 1]; j ++) {
3801 if (bi->preceding(j) != expected[i]) {
3802 printStringBreaks(ustr, expected, expectedcount);
3803 test->errln("preceding(): Not expecting boundary at position %d", j);
// For each escaped test string in strlist: compute the expected word boundaries
// with RBBIWordMonkey, then check the "en" word break iterator against them
// using testBreakBoundPreceding().
// NOTE(review): declarations of loop, i and expected, and the cleanup of bi, are
// not visible in this excerpt.
3811 void RBBITest::TestWordBreaks(void)
3813 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3815 Locale locale("en");
3816 UErrorCode status = U_ZERO_ERROR;
3817 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3818 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3819 // Replaced any C+J characters in a row with a random sequence of characters
3820 // of the same length to make our C+J segmentation not get in the way.
// The strings hold doubly-escaped sequences; they are converted with
// CharsToUnicodeString() in the loop below.
3821 static const char *strlist[] =
3823 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3824 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3825 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3826 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3827 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3828 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3829 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3830 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3831 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3832 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3833 "\\u2027\\U000e0067\\u0a47\\u00b7",
3834 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3835 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3836 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3837 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3838 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3839 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the second escape in the next entry has seven hex digits, where
// the otherwise identical entry two lines below has eight (ending 007d).
// Possibly a deliberate malformed-input case -- confirm before changing.
3840 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3841 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3842 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3843 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3844 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3845 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3846 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3847 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3848 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3849 "\\u18f4\\U000e0049\\u20e7\\u2027",
3850 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3851 "\\ua183\\u102d\\u0bec\\u003a",
3852 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3853 "\\u003a\\u0e57\\u0fad\\u002e",
3854 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3855 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3856 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3857 "\\u003a\\u0664\\u00b7\\u1fba",
3858 "\\u003b\\u0027\\u00b7\\u47a3",
3859 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3860 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3861 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
// Report a failure to create the break iterator.
3864 if (U_FAILURE(status)) {
3865 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3868 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3869 // printf("looping %d\n", loop);
3870 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3871 // RBBICharMonkey monkey;
3872 RBBIWordMonkey monkey;
3875 int expectedcount = 0;
3877 monkey.setText(ustr);
// Collect the monkey's boundaries, starting with 0, until it returns DONE.
3879 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3880 expected[expectedcount ++] = i;
3883 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
// Self-consistency check of the "en" word break iterator: for each test string,
// every position returned by first()/next() must satisfy isBoundary(), and every
// position strictly between two consecutive returned boundaries must not.
// NOTE(review): declarations of str, forward, count, prev, i, j and loop are not
// visible in this excerpt.
3889 void RBBITest::TestWordBoundary(void)
3891 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3892 Locale locale("en");
3893 UErrorCode status = U_ZERO_ERROR;
3894 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3895 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3897 static const char *strlist[] =
3899 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3900 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3901 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3902 "\\u2027\\U000e0067\\u0a47\\u00b7",
3903 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3904 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3905 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3906 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3907 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3908 "\\u0027\\u11af\\U000e0057\\u0602",
// NOTE(review): the second escape in the next entry has seven hex digits, where
// the otherwise identical entry two lines below has eight (ending 007d) -- confirm intent.
3909 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3910 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3911 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3912 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3913 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3914 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3915 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3916 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3917 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3918 "\\u58f4\\U000e0049\\u20e7\\u2027",
3919 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3920 "\\ua183\\u102d\\u0bec\\u003a",
3921 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3922 "\\u003a\\u0e57\\u0fad\\u002e",
3923 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3924 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3925 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3926 "\\u003a\\u0664\\u00b7\\u1fba",
3927 "\\u003b\\u0027\\u00b7\\u47a3",
3930 if (U_FAILURE(status)) {
3931 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3934 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3935 // printf("looping %d\n", loop);
// Unescape into str with a literal capacity of 20 UChars.
// NOTE(review): the declaration of str is not visible; confirm that it holds at
// least 20 elements and that every test string fits with its terminator.
3936 u_unescape(strlist[loop], str, 20);
3937 UnicodeString ustr(str);
// Walk the boundaries forward, recording each one in forward[].
3944 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3945 forward[count ++] = i;
// Positions strictly between the previous boundary and this one must not be boundaries.
3948 for (j = prev + 1; j < i; j ++) {
3949 if (bi->isBoundary(j)) {
3950 printStringBreaks(ustr, forward, count);
3951 errln("happy boundary test failed: expected %d not a boundary",
// The position just returned by the iterator must itself test as a boundary.
3957 if (!bi->isBoundary(i)) {
3958 printStringBreaks(ustr, forward, count);
3959 errln("happy boundary test failed: expected %d a boundary",
// For each escaped test string in strlist: compute the expected line boundaries
// with RBBILineMonkey, then check the "en" line break iterator against them
// using testBreakBoundPreceding().
// NOTE(review): declarations of str, loop and i, the handling of the u_unescape
// result t, and the cleanup of bi are not visible in this excerpt.
3969 void RBBITest::TestLineBreaks(void)
3971 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3972 Locale locale("en");
3973 UErrorCode status = U_ZERO_ERROR;
3974 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
// Capacity passed to u_unescape() for each test string.
3975 const int32_t STRSIZE = 50;
// Adjacent literals without a comma between them are concatenated by the
// compiler, so an entry split across two source lines is one test string.
3977 static const char *strlist[] =
3979 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3980 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3981 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3982 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3983 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3984 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3985 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3986 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3987 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3988 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3989 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3990 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3991 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3992 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3993 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3994 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3995 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3996 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3997 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3998 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3999 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4000 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4001 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4002 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4003 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4004 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4005 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4006 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4007 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4008 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4009 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4010 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4011 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4012 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4013 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4014 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4015 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4016 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4017 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4018 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
// Report a failure to create the break iterator, then test the status again.
4021 TEST_ASSERT_SUCCESS(status);
4022 if (U_FAILURE(status)) {
4025 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4026 // printf("looping %d\n", loop);
// t receives the value returned by u_unescape(); its use is not visible here.
4027 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4034 UnicodeString ustr(str);
4035 RBBILineMonkey monkey;
// Check for an error recorded while constructing the monkey.
4036 if (U_FAILURE(monkey.deferredStatus)) {
4040 const int EXPECTEDSIZE = 50;
4041 int expected[EXPECTEDSIZE];
4042 int expectedcount = 0;
4044 monkey.setText(ustr);
// Collect the monkey's boundaries, starting with 0, until it returns DONE;
// the count is checked against EXPECTEDSIZE before each store.
4046 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4047 if (expectedcount >= EXPECTEDSIZE) {
4048 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4051 expected[expectedcount ++] = i;
4054 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
// For each test string in strlist: compute the expected sentence boundaries with
// RBBISentMonkey, then check the "en" sentence break iterator against them using
// testBreakBoundPreceding().
// NOTE(review): declarations of str, loop and i, and the cleanup of bi, are not
// visible in this excerpt.
4060 void RBBITest::TestSentBreaks(void)
4062 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4063 Locale locale("en");
4064 UErrorCode status = U_ZERO_ERROR;
4065 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
// The entries mix ordinary C escapes (e.g. "\r", "\n") with doubly-escaped
// sequences that u_unescape() expands in the loop below. Adjacent literals
// without a comma between them are concatenated into a single test string.
4067 static const char *strlist[] =
4069 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4071 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4072 "\"Sentence ending with a quote.\" Bye.",
4073 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4074 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4075 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4076 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4077 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4078 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4079 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4080 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4081 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4082 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4083 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4084 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4085 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4086 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4087 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4088 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
// Report a failure to create the break iterator.
4091 if (U_FAILURE(status)) {
4092 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4095 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
// Unescape using the full declared size of str as the capacity.
4096 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4097 UnicodeString ustr(str);
4099 RBBISentMonkey monkey;
// Check for an error recorded while constructing the monkey.
4100 if (U_FAILURE(monkey.deferredStatus)) {
4104 const int EXPECTEDSIZE = 50;
4105 int expected[EXPECTEDSIZE];
4106 int expectedcount = 0;
4108 monkey.setText(ustr);
// Collect the monkey's boundaries, starting with 0, until it returns DONE;
// the count is checked against EXPECTEDSIZE before each store.
4110 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4111 if (expectedcount >= EXPECTEDSIZE) {
4112 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4115 expected[expectedcount ++] = i;
4118 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
// Entry point for the randomized ("monkey") break iterator tests. Parses the
// optional test parameters (loop, seed, type, utext) out of fTestParams, then
// runs RunMonkey() for the selected break type(s): character, word, line and
// sentence, with "all" as the default selection.
// NOTE(review): the declaration of seed, the per-type monkey objects passed to
// RunMonkey() as m, and the cleanup of each bi are not visible in this excerpt.
4124 void RBBITest::TestMonkey() {
4125 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4127 UErrorCode status = U_ZERO_ERROR;
4128 int32_t loopCount = 500;
4130 UnicodeString breakType = "all";
4131 Locale locale("en");
4132 UBool useUText = FALSE;
4134 if (quick == FALSE) {
// Work on a copy of the parameter string; each recognized option is removed
// from p as it is processed.
4139 UnicodeString p(fTestParams);
4140 loopCount = getIntParam("loop", p, loopCount);
4141 seed = getIntParam("seed", p, seed);
// NOTE(review): "title" is accepted by this pattern, but no title branch appears
// in the visible part of this function -- confirm.
4143 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4145 breakType = m.group(1, status);
4147 p = m.replaceFirst("", status);
4150 RegexMatcher u(" *utext", p, 0, status);
4154 p = u.replaceFirst("", status);
// Any non-whitespace left in p is an option that was not recognized.
4159 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4160 // Each option is stripped out of the option string as it is processed.
4161 // All options have been checked. The option string should have been completely emptied.
4163 p.extract(buf, sizeof(buf), NULL, status);
4164 buf[sizeof(buf)-1] = 0;
4165 errln("Unrecognized or extra parameter: %s\n", buf);
4171 if (breakType == "char" || breakType == "all") {
4173 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4174 if (U_SUCCESS(status)) {
4175 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4176 if (breakType == "all" && useUText==FALSE) {
4177 // Also run a quick test with UText when "all" is specified
4178 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4182 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4187 if (breakType == "word" || breakType == "all") {
4188 logln("Word Break Monkey Test");
4190 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4191 if (U_SUCCESS(status)) {
4192 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4195 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4200 if (breakType == "line" || breakType == "all") {
4201 logln("Line Break Monkey Test");
4203 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
// loopCount is modified in place, so with type "all" this reduction also
// carries over into the sentence test below.
4204 if (loopCount >= 10) {
4205 loopCount = loopCount / 5; // Line break runs slower than the others.
4207 if (U_SUCCESS(status)) {
4208 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4211 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4216 if (breakType == "sent" || breakType == "all" ) {
4217 logln("Sentence Break Monkey Test");
4219 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4220 if (loopCount >= 10) {
4221 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4223 if (U_SUCCESS(status)) {
4224 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
// NOTE(review): this message says "line break iterator" although this branch
// creates the sentence iterator; looks like a copy/paste leftover -- confirm.
4227 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4236 // Run a RBBI monkey test. Common routine, for all break iterator types.
4238 // bi - the break iterator to use
4239 // mk - MonkeyKind, abstraction for obtaining expected results
4240 // name - Name of test (char, word, etc.) for use in error messages
4241 // seed - Seed for starting random number generator (parameter from user)
4244 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4245 int32_t numIterations, UBool useUText) {
4247 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4249 const int32_t TESTSTRINGLEN = 500;
4250 UnicodeString testText;
4251 int32_t numCharClasses;
4253 int expected[TESTSTRINGLEN*2 + 1];
4254 int expectedCount = 0;
4255 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4256 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4257 char reverseBreaks[TESTSTRINGLEN*2+1];
4258 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4259 char followingBreaks[TESTSTRINGLEN*2+1];
4260 char precedingBreaks[TESTSTRINGLEN*2+1];
4266 numCharClasses = mk.charClasses()->size();
4267 chClasses = mk.charClasses();
4269 // Check for errors that occurred during the construction of the MonkeyKind object.
4270 // Can't report them where they occurred because errln() is a method coming from intlTest,
4271 // and is not visible outside of RBBITest :-(
4272 if (U_FAILURE(mk.deferredStatus)) {
4273 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4277 // Verify that the character classes all have at least one member.
4278 for (i=0; i<numCharClasses; i++) {
4279 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4280 if (s == NULL || s->size() == 0) {
4281 errln("Character Class #%d is null or of zero size.", i);
4286 while (loopCount < numIterations || numIterations == -1) {
4287 if (numIterations == -1 && loopCount % 10 == 0) {
4288 // If test is running in an infinite loop, display a periodic tic so
4289 // we can tell that it is making progress.
4290 fprintf(stderr, ".");
4292 // Save current random number seed, so that we can recreate the random numbers
4293 // for this loop iteration in event of an error.
4296 // Populate a test string with data.
4297 testText.truncate(0);
4298 for (i=0; i<TESTSTRINGLEN; i++) {
4299 int32_t aClassNum = m_rand() % numCharClasses;
4300 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4301 int32_t charIdx = m_rand() % classSet->size();
4302 UChar32 c = classSet->charAt(charIdx);
4303 if (c < 0) { // TODO: deal with sets containing strings.
4304 errln("%s:%d c < 0", __FILE__, __LINE__);
4307 // Do not assemble a supplementary character from randomly generated separate surrogates.
4308 // (It could be a dictionary character)
4309 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4316 // Calculate the expected results for this test string.
4317 mk.setText(testText);
4318 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4319 expectedBreaks[0] = 1;
4320 int32_t breakPos = 0;
4323 breakPos = mk.next(breakPos);
4324 if (breakPos == -1) {
4327 if (breakPos > testText.length()) {
4328 errln("breakPos > testText.length()");
4330 expectedBreaks[breakPos] = 1;
4331 U_ASSERT(expectedCount<testText.length());
4332 expected[expectedCount ++] = breakPos;
4333 (void)expected; // Set but not used warning.
4334 // TODO (andy): check it out.
4337 // Find the break positions using forward iteration
4338 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4340 UErrorCode status = U_ZERO_ERROR;
4341 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4342 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4343 bi->setText(testUText, status);
4344 TEST_ASSERT_SUCCESS(status);
4345 utext_close(testUText); // The break iterator does a shallow clone of the UText
4346 // This UText can be closed immediately, so long as the
4347 // testText string continues to exist.
4349 bi->setText(testText);
4352 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4353 if (i < 0 || i > testText.length()) {
4354 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4357 forwardBreaks[i] = 1;
4360 // Find the break positions using reverse iteration
4361 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4362 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4363 if (i < 0 || i > testText.length()) {
4364 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4367 reverseBreaks[i] = 1;
4370 // Find the break positions using isBoundary() tests.
4371 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4372 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4373 for (i=0; i<=testText.length(); i++) {
4374 isBoundaryBreaks[i] = bi->isBoundary(i);
4378 // Find the break positions using the following() function.
4380 memset(followingBreaks, 0, sizeof(followingBreaks));
4381 int32_t lastBreakPos = 0;
4382 followingBreaks[0] = 1;
4383 for (i=0; i<testText.length(); i++) {
4384 breakPos = bi->following(i);
4385 if (breakPos <= i ||
4386 breakPos < lastBreakPos ||
4387 breakPos > testText.length() ||
4388 (breakPos > lastBreakPos && lastBreakPos > i)) {
4389 errln("%s break monkey test: "
4390 "Out of range value returned by BreakIterator::following().\n"
4391 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4392 name, seed, i, breakPos, lastBreakPos);
4395 followingBreaks[breakPos] = 1;
4396 lastBreakPos = breakPos;
4399 // Find the break positions using the preceding() function.
4400 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4401 lastBreakPos = testText.length();
4402 precedingBreaks[testText.length()] = 1;
4403 for (i=testText.length(); i>0; i--) {
4404 breakPos = bi->preceding(i);
4405 if (breakPos >= i ||
4406 breakPos > lastBreakPos ||
4407 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4408 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4409 errln("%s break monkey test: "
4410 "Out of range value returned by BreakIterator::preceding().\n"
4411 "index=%d; prev returned %d; lastBreak=%d" ,
4412 name, i, breakPos, lastBreakPos);
4413 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4414 precedingBreaks[i] = 2; // Forces an error.
4417 if (breakPos >= 0) {
4418 precedingBreaks[breakPos] = 1;
4420 lastBreakPos = breakPos;
4424 // Compare the expected and actual results.
4425 for (i=0; i<=testText.length(); i++) {
4426 const char *errorType = NULL;
4427 if (forwardBreaks[i] != expectedBreaks[i]) {
4428 errorType = "next()";
4429 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4430 errorType = "previous()";
4431 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4432 errorType = "isBoundary()";
4433 } else if (followingBreaks[i] != expectedBreaks[i]) {
4434 errorType = "following()";
4435 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4436 errorType = "preceding()";
4440 if (errorType != NULL) {
4441 // Format a range of the test text that includes the failure as
4442 // a data item that can be included in the rbbi test data file.
4444 // Start of the range is the last point where expected and actual results
4445 // both agreed that there was a break position.
4446 int startContext = i;
4449 if (startContext==0) { break; }
4451 if (expectedBreaks[startContext] != 0) {
4452 if (count == 2) break;
4457 // End of range is two expected breaks past the start position.
4458 int endContext = i + 1;
4460 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4462 if (endContext >= testText.length()) {break;}
4463 if (expectedBreaks[endContext-1] != 0) {
4464 if (count == 0) break;
4471 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4472 UnicodeString errorText = "<data>";
4473 /***if (strcmp(errorType, "next()") == 0) {
4475 endContext = testText.length();
4477 printStringBreaks(testText, expected, expectedCount);
4480 for (ci=startContext; ci<endContext;) {
4481 UnicodeString hexChars("0123456789abcdef");
4484 c = testText.char32At(ci);
4486 // This is the location of the error.
4487 errorText.append("<?>");
4488 } else if (expectedBreaks[ci] != 0) {
4489 // This is a non-error expected break position.
4490 errorText.append("\\");
4493 errorText.append("\\u");
4494 for (bn=12; bn>=0; bn-=4) {
4495 errorText.append(hexChars.charAt((c>>bn)&0xf));
4498 errorText.append("\\U");
4499 for (bn=28; bn>=0; bn-=4) {
4500 errorText.append(hexChars.charAt((c>>bn)&0xf));
4503 ci = testText.moveIndex32(ci, 1);
4505 errorText.append("\\");
4506 errorText.append("</data>\n");
4509 char charErrorTxt[500];
4510 UErrorCode status = U_ZERO_ERROR;
4511 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4512 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4513 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4515 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4516 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4517 errorType, seed, i, charErrorTxt);
4528 // Bug 5532. UTF-8 based UText fails in dictionary code.
4529 // This test checks the initial patch,
4530 // which is to just keep it from crashing. Correct word boundaries
4531 // await a proper fix to the dictionary code.
4533 void RBBITest::TestBug5532(void) {
4534 // Text includes a mixture of Thai and Latin.
4535 const unsigned char utf8Data[] = {
4536 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4537 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4538 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4539 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4540 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4541 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4542 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4543 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4544 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4545 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4546 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4548 UErrorCode status = U_ZERO_ERROR;
4549 UText utext=UTEXT_INITIALIZER;
4550 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4551 TEST_ASSERT_SUCCESS(status);
4553 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4554 TEST_ASSERT_SUCCESS(status);
4555 if (U_SUCCESS(status)) {
4556 bi->setText(&utext, status);
4557 TEST_ASSERT_SUCCESS(status);
4559 int32_t breakCount = 0;
4560 int32_t previousBreak = -1;
4561 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4562 // For now, just make sure that the break iterator doesn't hang.
4563 TEST_ASSERT(previousBreak < bi->current());
4564 previousBreak = bi->current();
4566 TEST_ASSERT(breakCount > 0);
4569 utext_close(&utext);
// Bug 9983 test: iterates backwards with previous() over a short text, once with
// the word break iterator for the root locale and once with the one for
// en_US_POSIX, and asserts an iteration count of 6 for each.
4573 void RBBITest::TestBug9983(void) {
// The text is spelled with escaped code points and converted by unescape().
4574 UnicodeString text = UnicodeString("\\u002A" // * Other
4576 "\\u309C" // Katakana
4580 "\\u0000").unescape();
4582 UErrorCode status = U_ZERO_ERROR;
// createWordInstance() results are cast to RuleBasedBreakIterator and held in
// LocalPointers; both creations share the same status variable.
4583 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4584 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4585 TEST_ASSERT_SUCCESS(status);
4586 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4587 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4588 TEST_ASSERT_SUCCESS(status);
// Nothing below is meaningful if either iterator could not be created.
4589 if (U_FAILURE(status)) {
4592 int32_t offset, rstatus, iterationCount;
// Pass 1: root-locale iterator.
4594 brkiter->setText(text);
4597 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
// getRuleStatus() is called at every position; its value is not checked.
4599 rstatus = brkiter->getRuleStatus();
4600 (void)rstatus; // Suppress set but not used warning.
// Upper bound of 10 is above the expected count of 6; presumably a guard
// against a runaway loop -- TODO confirm.
4601 if (iterationCount >= 10) {
4605 TEST_ASSERT(iterationCount == 6);
// Pass 2: same walk with the en_US_POSIX iterator, starting from last().
4607 brkiterPOSIX->setText(text);
4608 brkiterPOSIX->last();
4610 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4612 rstatus = brkiterPOSIX->getRuleStatus();
4613 (void)rstatus; // Suppress set but not used warning.
4614 if (iterationCount >= 10) {
4618 TEST_ASSERT(iterationCount == 6);
4621 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4623 void RBBITest::TestBug7547() {
4624 UnicodeString rules;
4625 UErrorCode status = U_ZERO_ERROR;
4626 UParseError parseError;
4627 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4628 if (status != U_BRK_RULE_SYNTAX) {
4629 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4631 if (parseError.line != 1 || parseError.offset != 0) {
4632 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4637 void RBBITest::TestBug12797() {
4638 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4639 UErrorCode status = U_ZERO_ERROR;
4640 UParseError parseError;
4641 RuleBasedBreakIterator bi(rules, parseError, status);
4642 if (U_FAILURE(status)) {
4643 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4646 UnicodeString text = "abc";
4649 int32_t boundary = bi.next();
4650 if (boundary != 3) {
4651 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4657 // TestDebug - A place-holder test for debugging purposes.
4658 // For putting in fragments of other tests that can be invoked
4659 // for tracing without a lot of unwanted extra stuff happening.
// NOTE(review): presumably meant to hold only temporary code while a problem is
// being traced, with no permanent checks of its own -- confirm before adding any.
4661 void RBBITest::TestDebug(void) {
4665 void RBBITest::TestProperties() {
4666 UErrorCode errorCode = U_ZERO_ERROR;
4667 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4668 if (!prependSet.isEmpty()) {
4670 "[:GCB=Prepend:] is not empty any more. "
4671 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4672 "change this test to the opposite condition.");
4676 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */