+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2016, International Business Machines Corporation and
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
if (exec) logln("TestSuite RuleBasedBreakIterator: ");
+ fTestParams = params; // keep the parameter string; TestMonkey() reads its "loop" and "seed" options from it
- switch (index) {
+ TESTCASE_AUTO_BEGIN; // tests are registered by name below instead of by hand-numbered case labels
#if !UCONFIG_NO_FILE_IO
- case 0: name = "TestBug4153072";
- if(exec) TestBug4153072(); break;
-#else
- case 0: name = "skip";
- break;
+ TESTCASE_AUTO(TestBug4153072);
#endif
-
- case 1: name = "skip";
- break;
- case 2: name = "TestStatusReturn";
- if(exec) TestStatusReturn(); break;
-
+ TESTCASE_AUTO(TestStatusReturn);
#if !UCONFIG_NO_FILE_IO
- case 3: name = "TestUnicodeFiles";
- if(exec) TestUnicodeFiles(); break;
- case 4: name = "TestEmptyString";
- if(exec) TestEmptyString(); break;
-#else
- case 3: case 4: name = "skip";
- break;
+ TESTCASE_AUTO(TestUnicodeFiles);
+ TESTCASE_AUTO(TestEmptyString);
#endif
-
- case 5: name = "TestGetAvailableLocales";
- if(exec) TestGetAvailableLocales(); break;
-
- case 6: name = "TestGetDisplayName";
- if(exec) TestGetDisplayName(); break;
-
+ TESTCASE_AUTO(TestGetAvailableLocales);
+ TESTCASE_AUTO(TestGetDisplayName);
#if !UCONFIG_NO_FILE_IO
- case 7: name = "TestEndBehaviour";
- if(exec) TestEndBehaviour(); break;
- case 8: case 9: case 10: name = "skip";
- break;
- case 11: name = "TestWordBreaks";
- if(exec) TestWordBreaks(); break;
- case 12: name = "TestWordBoundary";
- if(exec) TestWordBoundary(); break;
- case 13: name = "TestLineBreaks";
- if(exec) TestLineBreaks(); break;
- case 14: name = "TestSentBreaks";
- if(exec) TestSentBreaks(); break;
- case 15: name = "TestExtended";
- if(exec) TestExtended(); break;
-#else
- case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
- break;
+ TESTCASE_AUTO(TestEndBehaviour);
+ TESTCASE_AUTO(TestWordBreaks);
+ TESTCASE_AUTO(TestWordBoundary);
+ TESTCASE_AUTO(TestLineBreaks);
+ TESTCASE_AUTO(TestSentBreaks);
+ TESTCASE_AUTO(TestExtended);
#endif
-
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
- case 16:
- name = "TestMonkey"; if(exec) TestMonkey(params); break;
-#else
- case 16:
- name = "skip"; break;
+ TESTCASE_AUTO(TestMonkey); // TestMonkey() no longer takes an argument; it uses fTestParams set above
#endif
-
#if !UCONFIG_NO_FILE_IO
- case 17: name = "TestBug3818";
- if(exec) TestBug3818(); break;
-#else
- case 17: name = "skip";
- break;
+ TESTCASE_AUTO(TestBug3818);
#endif
-
- case 18: name = "skip";
- break;
- case 19: name = "TestDebug";
- if(exec) TestDebug(); break;
- case 20: name = "skip";
- break;
-
+ TESTCASE_AUTO(TestDebug);
#if !UCONFIG_NO_FILE_IO
- case 21: name = "TestBug5775";
- if (exec) TestBug5775(); break;
-#else
- case 21: name = "skip";
- break;
+ TESTCASE_AUTO(TestBug5775);
#endif
-
- case 22: name = "TestBug9983";
- if (exec) TestBug9983(); break;
- case 23: name = "TestDictRules";
- if (exec) TestDictRules(); break;
- case 24: name = "TestBug5532";
- if (exec) TestBug5532(); break;
- default: name = ""; break; //needed to end loop
- }
+ TESTCASE_AUTO(TestBug9983);
+ TESTCASE_AUTO(TestDictRules);
+ TESTCASE_AUTO(TestBug5532);
+ TESTCASE_AUTO(TestBug7547); // newly registered: building from empty rules must report an error
+ TESTCASE_AUTO(TestBug12797); // newly registered: chained rules with a rule given by a variable reference
+ TESTCASE_AUTO_END;
}
//--------------------------------------------------------------------------------------
RBBITest::RBBITest() {
+ fTestParams = NULL; // no parameter string yet; runIndexedTest() assigns it on each call
}
}
-int32_t TestParams::getSrcLine(int bp) {
+int32_t TestParams::getSrcLine(int32_t bp) {
if (bp >= textMap->size()) {
bp = textMap->size() - 1;
}
}
-int32_t TestParams::getExpectedBreak(int bp) {
+int32_t TestParams::getExpectedBreak(int32_t bp) {
if (bp >= textMap->size()) {
return 0;
}
}
-int32_t TestParams::getSrcCol(int bp) {
+int32_t TestParams::getSrcCol(int32_t bp) {
if (bp >= textMap->size()) {
bp = textMap->size() - 1;
}
expectedTagVal = 0;
}
int32_t line = t->getSrcLine(bp);
- int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
+ int32_t rs = t->bi->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
}
+//
+// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+//
+static const char *gExtended_Pict = "["
+ "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
+ "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
+ "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
+ "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
+ "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
+ "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
+ "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
+ "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
+ "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
+ "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
+ "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
+ "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
+ "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
+ "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
+ "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
+ "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
+ "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
+ "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
+ "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
+ "]";
+
//------------------------------------------------------------------------------------------
//
// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
UnicodeSet *fCRLFSet;
UnicodeSet *fControlSet;
UnicodeSet *fExtendSet;
+ UnicodeSet *fZWJSet;
UnicodeSet *fRegionalIndicatorSet;
UnicodeSet *fPrependSet;
UnicodeSet *fSpacingSet;
UnicodeSet *fLVSet;
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
- UnicodeSet *fAnySet;
- UnicodeSet *fEmojiModifierSet;
UnicodeSet *fEmojiBaseSet;
- UnicodeSet *fZWJSet;
- UnicodeSet *fGAZSet;
+ UnicodeSet *fEmojiModifierSet;
+ UnicodeSet *fExtendedPictSet;
+ UnicodeSet *fEBGSet;
+ UnicodeSet *fEmojiNRKSet;
+ UnicodeSet *fAnySet;
const UnicodeString *fText;
};
fText = NULL;
fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
- fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
- fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
- fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
+ fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
+ fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
+ fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
+ fRegionalIndicatorSet =
+ new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
fHangulSet->addAll(*fTSet);
fHangulSet->addAll(*fLVSet);
fHangulSet->addAll(*fLVTSet);
- fAnySet = new UnicodeSet(0, 0x10ffff);
-
+ fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+ fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
+ fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
+ fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
+ fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+ fAnySet = new UnicodeSet(0, 0x10ffff);
- fEmojiBaseSet = new UnicodeSet(UnicodeString(
- "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
- "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
- "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
- "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
-
- fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
- fZWJSet = new UnicodeSet(0x200D, 0x200D);
- fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
-
- fSets = new UVector(status);
+ fSets = new UVector(status);
fSets->addElement(fCRLFSet, status);
fSets->addElement(fControlSet, status);
fSets->addElement(fExtendSet, status);
fSets->addElement(fEmojiBaseSet, status);
fSets->addElement(fEmojiModifierSet, status);
fSets->addElement(fZWJSet, status);
- fSets->addElement(fGAZSet, status);
+ fSets->addElement(fExtendedPictSet, status);
+ fSets->addElement(fEBGSet, status);
+ fSets->addElement(fEmojiNRKSet,status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
int breakPos = -1;
UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
+ UChar32 cBase; // for (X Extend*) patterns, the X character.
if (U_FAILURE(deferredStatus)) {
return -1;
}
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
- c0 = c1 = c2 = 0;
+ c0 = c1 = c2 = cBase = 0;
(void)p0; // suppress set but not used warning.
(void)c0;
continue;
}
- // Rule (GB8a) Regional_Indicator x Regional_Indicator
- // Note: The first if condition is a little tricky. We only need to force
- // a break if there are three or more contiguous RIs. If there are
- // only two, a break following will occur via other rules, and will include
- // any trailing extend characters, which is needed behavior.
- if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
- && fRegionalIndicatorSet->contains(c2)) {
- break;
- }
- if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
- continue;
- }
-
- // Rule (GB9) x Extend
+ // Rule (GB9) x (Extend | ZWJ)
if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) {
+ if (!fExtendSet->contains(c1)) {
+ cBase = c1;
+ }
continue;
}
continue;
}
- // Rule (GB9c) Emoji_Base x Emoji_Modifier
- if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
+ // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier
+ if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
+ continue;
+ }
+ if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
+ fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
+ continue;
+ }
+
+ // Rule (GB11) (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
+ if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
+ (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
continue;
}
- // Rule (GB9d) ZWJ x Glue_After_Zwj
- if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
+ // Rule (GB12-13) Regional_Indicator x Regional_Indicator
+ // Note: The first if condition is a little tricky. We only need to force
+ // a break if there are three or more contiguous RIs. If there are
+ // only two, a break following will occur via other rules, and will include
+ // any trailing extend characters, which is needed behavior.
+ if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
+ && fRegionalIndicatorSet->contains(c2)) {
+ break;
+ }
+ if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
continue;
}
- // Rule (GB10) Any <break> Any
+ // Rule (GB999) Any <break> Any
break;
}
delete fEmojiBaseSet;
delete fEmojiModifierSet;
delete fZWJSet;
- delete fGAZSet;
+ delete fExtendedPictSet;
+ delete fEBGSet;
+ delete fEmojiNRKSet;
}
//------------------------------------------------------------------------------------------
UnicodeSet *fKatakanaSet;
UnicodeSet *fHebrew_LetterSet;
UnicodeSet *fALetterSet;
- // TODO(jungshik): Do we still need this change?
- // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
UnicodeSet *fSingle_QuoteSet;
UnicodeSet *fDouble_QuoteSet;
UnicodeSet *fMidNumLetSet;
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
- UnicodeSet *fDictionaryCjkSet;
+ UnicodeSet *fDictionarySet;
UnicodeSet *fEBaseSet;
+ UnicodeSet *fEBGSet;
UnicodeSet *fEModifierSet;
- UnicodeSet *fZWSSet;
- UnicodeSet *fGAZSet;
+ UnicodeSet *fZWJSet;
+ UnicodeSet *fExtendedPictSet;
+ UnicodeSet *fEmojiNRKSet;
const UnicodeString *fText;
};
fSets = new UVector(status);
- fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
- fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
- fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
- fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
- // Exclude Hangul syllables from ALetterSet during testing.
- // Leave CJK dictionary characters out from the monkey tests!
-#if 0
- fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
- "[\\p{Line_Break = Complex_Context}"
- "-\\p{Grapheme_Cluster_Break = Extend}"
- "-\\p{Grapheme_Cluster_Break = Control}"
- "]]",
- status);
-#endif
- fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
+ fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
+ fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
+ fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
+ fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
- fALetterSet->removeAll(*fDictionaryCjkSet);
fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
- // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
- // we should figure out why
fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
- fEBaseSet = new UnicodeSet(UnicodeString(
- "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
- "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
- "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
- "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
+ fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+ fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"), status);
+ fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"), status);
+ fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"), status);
+ fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
+ fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
- fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
- fZWSSet = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
- fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
- fExtendSet->removeAll(*fZWSSet);
+ fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
+ fDictionarySet->addAll(*fKatakanaSet);
+ fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
+ fALetterSet->removeAll(*fDictionarySet);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
fOtherSet->removeAll(*fExtendSet);
fOtherSet->removeAll(*fRegionalIndicatorSet);
fOtherSet->removeAll(*fEBaseSet);
+ fOtherSet->removeAll(*fEBGSet);
fOtherSet->removeAll(*fEModifierSet);
- fOtherSet->removeAll(*fZWSSet);
- fOtherSet->removeAll(*fGAZSet);
-
+ fOtherSet->removeAll(*fZWJSet);
+ fOtherSet->removeAll(*fExtendedPictSet);
+ fOtherSet->removeAll(*fEmojiNRKSet);
+
// Inhibit dictionary characters from being tested at all.
- fOtherSet->removeAll(*fDictionaryCjkSet);
- fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
+ fOtherSet->removeAll(*fDictionarySet);
fSets->addElement(fCRSet, status);
fSets->addElement(fLFSet, status);
fSets->addElement(fALetterSet, status);
fSets->addElement(fSingle_QuoteSet, status);
fSets->addElement(fDouble_QuoteSet, status);
- //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
+ //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
+ // from the test data. They are all in the dictionary set,
+ // which this (old, to be retired) monkey test cannot handle.
fSets->addElement(fMidLetterSet, status);
fSets->addElement(fMidNumLetSet, status);
fSets->addElement(fMidNumSet, status);
fSets->addElement(fExtendNumLetSet, status);
fSets->addElement(fEBaseSet, status);
+ fSets->addElement(fEBGSet, status);
fSets->addElement(fEModifierSet, status);
- fSets->addElement(fZWSSet, status);
- fSets->addElement(fGAZSet, status);
+ fSets->addElement(fZWJSet, status);
+ fSets->addElement(fExtendedPictSet, status);
+ fSets->addElement(fEmojiNRKSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
break;
};
}
- while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
+ while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
if (p1 == p2) {
break;
};
- // Rule (3c) ZWJ x GAZ (Glue after ZWJ).
+ // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK).
// Not ignoring extend chars, so peek into input text to
// get the potential ZWJ, the character immediately preceding c2.
// Sloppy UChar32 indexing: p2-1 may reference trail half
// but char32At will get the full code point.
- if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
+ if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
continue;
}
}
// Rule (13) Katakana x Katakana
+ // Note: matches UAX 29 rules, but doesn't come into play for ICU because
+ // all Katakana are handled by the dictionary breaker.
if (fKatakanaSet->contains(c1) &&
fKatakanaSet->contains(c2)) {
continue;
continue;
}
- // Rule 13c
+ // WB 14 (E_Base | EBG) x E_Modifier
+ if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
+ continue;
+ }
+
+ // Rule 15 - 17 Group pairs of Regional Indicators.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
break;
}
continue;
}
- // Rule 13d
- if ((fEBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
- continue;
- }
-
- // Rule 14. Break found here.
+ // Rule 999. Break found here.
break;
}
delete fExtendSet;
delete fExtendNumLetSet;
delete fRegionalIndicatorSet;
- delete fDictionaryCjkSet;
+ delete fDictionarySet;
delete fOtherSet;
delete fEBaseSet;
+ delete fEBGSet;
delete fEModifierSet;
- delete fZWSSet;
- delete fGAZSet;
+ delete fZWJSet;
+ delete fExtendedPictSet;
+ delete fEmojiNRKSet;
}
UnicodeSet *fEB;
UnicodeSet *fEM;
UnicodeSet *fZJ;
+ UnicodeSet *fExtendedPict;
+ UnicodeSet *fEmojiNRK;
BreakIterator *fCharBI;
const UnicodeString *fText;
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
- fEB = new UnicodeSet(UnicodeString(
- "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
- "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
- "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
- "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
- fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
- fZJ = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
+ fEB = new UnicodeSet(UNICODE_STRING_SIMPLE(
+ "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+ fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
+ fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+ fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+ fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
if (U_FAILURE(status)) {
deferredStatus = status;
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
-
- fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
- fID->addAll(*fEM);
- fAL->removeAll(*fEM);
-
-
- fAL->remove((UChar32)0x2764); // Emoji Proposal: move u2764 from Al to Id
- fID->add((UChar32)0x2764);
+ fCM->addAll(*fZJ); // ZWJ behaves as a CM.
fSets->addElement(fBK, status);
fSets->addElement(fCR, status);
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
fSets->addElement(fZJ, status);
+ fSets->addElement(fExtendedPict, status);
+ fSets->addElement(fEmojiNRK, status);
+
const char *rules =
- "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
- "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
- "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
- "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
- "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
- "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
+ "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
+ "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
+ "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
+ "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
fNumberMatcher = new RegexMatcher(
UnicodeString(rules, -1, US_INV), 0, status);
break;
}
- // LB 8a ZJ x ID
+ // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
- if (fZJ->contains(prevC) && fID->contains(thisChar)) {
+ if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
continue;
}
}
if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
(fEX->contains(prevChar) && fIN->contains(thisChar)) ||
(fHL->contains(prevChar) && fIN->contains(thisChar)) ||
- (fID->contains(prevChar) && fIN->contains(thisChar)) ||
+ ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
(fIN->contains(prevChar) && fIN->contains(thisChar)) ||
(fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
continue;
}
- // LB 23 ID x PO
- // AL x NU
- // HL x NU
- // NU x AL
- if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
- (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
- (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
- (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
- (fNU->contains(prevChar) && fHL->contains(thisChar)) ) {
+ // LB 23 (AL | HL) x NU
+ // NU x (AL | HL)
+ if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
+ continue;
+ }
+ if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ continue;
+ }
+
+ // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
+ // PR x (ID | EB | EM)
+ // (ID | EB | EM) x PO
+ if (fPR->contains(prevChar) &&
+ (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
+ continue;
+ }
+ if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
+ fPO->contains(thisChar)) {
continue;
}
// LB 24 Do not break between prefix and letters or ideographs.
- // PR x ID
- // PR x (AL | HL)
- // PO x (AL | HL)
- if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
- (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
- (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar)))) {
+ // (PR | PO) x (AL | HL)
+ // (AL | HL) x (PR | PO)
+ if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
+ (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+ continue;
+ }
+ if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
+ (fPR->contains(thisChar) || fPO->contains(thisChar))) {
continue;
}
delete fEB;
delete fEM;
delete fZJ;
+ delete fExtendedPict;
+ delete fEmojiNRK;
delete fCharBI;
delete fNumberMatcher;
#endif
}
-void RBBITest::TestMonkey(char *params) {
+void RBBITest::TestMonkey() {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UErrorCode status = U_ZERO_ERROR;
loopCount = 10000;
}
- if (params) {
- UnicodeString p(params);
+ if (fTestParams) {
+ UnicodeString p(fTestParams);
loopCount = getIntParam("loop", p, loopCount);
seed = getIntParam("seed", p, seed);
TEST_ASSERT(iterationCount == 6);
}
+// Bug 7547 - verify that building a break iterator from empty rules produces an error.
+//
+void RBBITest::TestBug7547() {
+ UnicodeString rules; // default-constructed: the rule source is intentionally empty
+ UErrorCode status = U_ZERO_ERROR;
+ UParseError parseError;
+ RuleBasedBreakIterator breakIterator(rules, parseError, status);
+ if (status != U_BRK_RULE_SYNTAX) { // construction must be rejected as a rule syntax error
+ errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
+ }
+ if (parseError.line != 1 || parseError.offset != 0) { // the error position is expected at line 1, offset 0
+ errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
+ }
+}
+
+
+// Bug 12797 - build a break iterator from chained (!!chain) forward rules in which one
+// rule consists solely of a variable reference ($v, defined as "b c"), then
+// check that the first boundary found in the text "abc" is at offset 3.
+//
+void RBBITest::TestBug12797() {
+ UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
+ UErrorCode status = U_ZERO_ERROR;
+ UParseError parseError;
+ RuleBasedBreakIterator bi(rules, parseError, status);
+ if (U_FAILURE(status)) {
+ // __LINE__ is an integer, so it must be formatted with %d; passing it to %s is undefined behavior.
+ errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
+ return; // the iterator is unusable if the rules failed to build
+ }
+ UnicodeString text = "abc";
+ bi.setText(text);
+ bi.first();
+ int32_t boundary = bi.next();
+ if (boundary != 3) {
+ errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
+ }
+}
+
//
// TestDebug - A place-holder test for debugging purposes.
// for tracing without a lot of unwanted extra stuff happening.
//
void RBBITest::TestDebug(void) {
-#if 0
- UErrorCode status = U_ZERO_ERROR;
- int pos = 0;
- int ruleStatus = 0;
-
- RuleBasedBreakIterator* bi =
- // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
- // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
- (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
- UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
- // UnicodeString s("Aaa. Bcd");
- s = s.unescape();
- bi->setText(s);
- UBool r = bi->isBoundary(8);
- printf("%s", r?"true":"false");
- return;
- pos = bi->last();
- do {
- // ruleStatus = bi->getRuleStatus();
- printf("%d\t%d\n", pos, ruleStatus);
- pos = bi->previous();
- } while (pos != BreakIterator::DONE);
-#endif
+
}
void RBBITest::TestProperties() {