Imported Upstream version 58.1

[platform/upstream/icu.git] / source / test / intltest / rbbitst.cpp
diff --git a/source/test/intltest/rbbitst.cpp b/source/test/intltest/rbbitst.cpp

index 045cea0..a3102c1 100644 (file)
--- a/source/test/intltest/rbbitst.cpp
+++ b/source/test/intltest/rbbitst.cpp
@@ -1,3 +1,5 @@
+// Copyright (C) 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /********************************************************************
   * COPYRIGHT:
   * Copyright (c) 1999-2016, International Business Machines Corporation and
@@ -65,96 +67,43 @@
  void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  {
      if (exec) logln("TestSuite RuleBasedBreakIterator: ");
+    fTestParams = params;
  
-    switch (index) {
+    TESTCASE_AUTO_BEGIN;
  #if !UCONFIG_NO_FILE_IO
-        case 0: name = "TestBug4153072";
-            if(exec) TestBug4153072();                         break;
-#else
-        case 0: name = "skip";
-            break;
+    TESTCASE_AUTO(TestBug4153072);
  #endif
-
-        case 1: name = "skip";
-            break;
-        case 2: name = "TestStatusReturn";
-            if(exec) TestStatusReturn();                       break;
-
+    TESTCASE_AUTO(TestStatusReturn);
  #if !UCONFIG_NO_FILE_IO
-        case 3: name = "TestUnicodeFiles";
-            if(exec) TestUnicodeFiles();                       break;
-        case 4: name = "TestEmptyString";
-            if(exec) TestEmptyString();                        break;
-#else
-        case 3: case 4: name = "skip";
-            break;
+    TESTCASE_AUTO(TestUnicodeFiles);
+    TESTCASE_AUTO(TestEmptyString);
  #endif
-
-        case 5: name = "TestGetAvailableLocales";
-            if(exec) TestGetAvailableLocales();                break;
-
-        case 6: name = "TestGetDisplayName";
-            if(exec) TestGetDisplayName();                     break;
-
+    TESTCASE_AUTO(TestGetAvailableLocales);
+    TESTCASE_AUTO(TestGetDisplayName);
  #if !UCONFIG_NO_FILE_IO
-        case 7: name = "TestEndBehaviour";
-            if(exec) TestEndBehaviour();                       break;
-        case 8: case 9: case 10: name = "skip";
-             break;
-        case 11: name = "TestWordBreaks";
-             if(exec) TestWordBreaks();                        break;
-        case 12: name = "TestWordBoundary";
-             if(exec) TestWordBoundary();                      break;
-        case 13: name = "TestLineBreaks";
-             if(exec) TestLineBreaks();                        break;
-        case 14: name = "TestSentBreaks";
-             if(exec) TestSentBreaks();                        break;
-        case 15: name = "TestExtended";
-             if(exec) TestExtended();                          break;
-#else
-        case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
-             break;
+    TESTCASE_AUTO(TestEndBehaviour);
+    TESTCASE_AUTO(TestWordBreaks);
+    TESTCASE_AUTO(TestWordBoundary);
+    TESTCASE_AUTO(TestLineBreaks);
+    TESTCASE_AUTO(TestSentBreaks);
+    TESTCASE_AUTO(TestExtended);
  #endif
-
  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
-        case 16:
-            name = "TestMonkey"; if(exec)  TestMonkey(params); break;
-#else
-        case 16:
-             name = "skip";                                    break;
+    TESTCASE_AUTO(TestMonkey);
  #endif
-
  #if !UCONFIG_NO_FILE_IO
-        case 17: name = "TestBug3818";
-            if(exec) TestBug3818();                            break;
-#else
-        case 17: name = "skip";
-            break;
+    TESTCASE_AUTO(TestBug3818);
  #endif
-
-        case 18: name = "skip";
-            break;
-        case 19: name = "TestDebug";
-            if(exec) TestDebug();                              break;
-        case 20: name = "skip";
-            break;
-
+    TESTCASE_AUTO(TestDebug);
  #if !UCONFIG_NO_FILE_IO
-        case 21: name = "TestBug5775";
-            if (exec) TestBug5775();                           break;
-#else
-        case 21: name = "skip";
-            break;
+    TESTCASE_AUTO(TestBug5775);
  #endif
-
-        case 22: name = "TestBug9983";
-            if (exec) TestBug9983();                           break;
-        case 23: name = "TestDictRules";
-            if (exec) TestDictRules();                         break;
-        case 24: name = "TestBug5532";
-            if (exec) TestBug5532();                           break;
-        default: name = ""; break; //needed to end loop
-    }
+    TESTCASE_AUTO(TestBug9983);
+    TESTCASE_AUTO(TestDictRules);
+    TESTCASE_AUTO(TestBug5532);
+    TESTCASE_AUTO(TestBug7547);
+    TESTCASE_AUTO(TestBug12797);
+    TESTCASE_AUTO_END;
  }
  
  
@@ -306,6 +255,7 @@ void BITestData::clearResults() {
  //--------------------------------------------------------------------------------------
  
  RBBITest::RBBITest() {
+    fTestParams = NULL;
  }
  
  
@@ -946,7 +896,7 @@ void TestParams::setUTF8(UErrorCode &status) {
  }
  
  
-int32_t TestParams::getSrcLine(int bp) {
+int32_t TestParams::getSrcLine(int32_t bp) {
      if (bp >= textMap->size()) {
          bp = textMap->size() - 1;
      }
@@ -962,7 +912,7 @@ int32_t TestParams::getSrcLine(int bp) {
  }
  
  
-int32_t TestParams::getExpectedBreak(int bp) {
+int32_t TestParams::getExpectedBreak(int32_t bp) {
      if (bp >= textMap->size()) {
          return 0;
      }
@@ -975,7 +925,7 @@ int32_t TestParams::getExpectedBreak(int bp) {
  }
  
  
-int32_t TestParams::getSrcCol(int bp) {
+int32_t TestParams::getSrcCol(int32_t bp) {
      if (bp >= textMap->size()) {
          bp = textMap->size() - 1;
      }
@@ -1043,7 +993,7 @@ void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
                  expectedTagVal = 0;
              }
              int32_t line = t->getSrcLine(bp);
-            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
+            int32_t rs = t->bi->getRuleStatus();
              if (rs != expectedTagVal) {
                  errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
                        "          Actual, Expected status = %4d, %4d",
@@ -2000,6 +1950,31 @@ static uint32_t m_rand()
  }
  
  
+//
+// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
+//
+static const char *gExtended_Pict = "["
+    "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
+    "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
+    "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
+    "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
+    "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
+    "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
+    "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
+    "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
+    "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
+    "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
+    "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
+    "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
+    "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
+    "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
+    "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
+    "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
+    "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
+    "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
+    "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
+    "]";
+
  //------------------------------------------------------------------------------------------
  //
  //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
@@ -2019,6 +1994,7 @@ private:
      UnicodeSet  *fCRLFSet;
      UnicodeSet  *fControlSet;
      UnicodeSet  *fExtendSet;
+    UnicodeSet  *fZWJSet;
      UnicodeSet  *fRegionalIndicatorSet;
      UnicodeSet  *fPrependSet;
      UnicodeSet  *fSpacingSet;
@@ -2028,11 +2004,12 @@ private:
      UnicodeSet  *fLVSet;
      UnicodeSet  *fLVTSet;
      UnicodeSet  *fHangulSet;
-    UnicodeSet  *fAnySet;
-    UnicodeSet  *fEmojiModifierSet;
      UnicodeSet  *fEmojiBaseSet;
-    UnicodeSet  *fZWJSet;
-    UnicodeSet  *fGAZSet;
+    UnicodeSet  *fEmojiModifierSet;
+    UnicodeSet  *fExtendedPictSet;
+    UnicodeSet  *fEBGSet;
+    UnicodeSet  *fEmojiNRKSet;
+    UnicodeSet  *fAnySet;
  
      const UnicodeString *fText;
  };
@@ -2044,9 +2021,11 @@ RBBICharMonkey::RBBICharMonkey() {
      fText = NULL;
  
      fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
-    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
-    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
-    fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
+    fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
+    fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
+    fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
+    fRegionalIndicatorSet =
+                  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
      fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
      fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
      fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
@@ -2060,21 +2039,16 @@ RBBICharMonkey::RBBICharMonkey() {
      fHangulSet->addAll(*fTSet);
      fHangulSet->addAll(*fLVSet);
      fHangulSet->addAll(*fLVTSet);
-    fAnySet     = new UnicodeSet(0, 0x10ffff);
-
  
+    fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+    fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
+    fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
+    fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
+    fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
+                "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+    fAnySet           = new UnicodeSet(0, 0x10ffff);
  
-    fEmojiBaseSet = new UnicodeSet(UnicodeString(
-                "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
-                "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
-                "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
-                "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
-
-    fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
-    fZWJSet           = new UnicodeSet(0x200D, 0x200D);
-    fGAZSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
-
-    fSets       = new UVector(status);
+    fSets             = new UVector(status);
      fSets->addElement(fCRLFSet,    status);
      fSets->addElement(fControlSet, status);
      fSets->addElement(fExtendSet,  status);
@@ -2088,7 +2062,9 @@ RBBICharMonkey::RBBICharMonkey() {
      fSets->addElement(fEmojiBaseSet, status);
      fSets->addElement(fEmojiModifierSet, status);
      fSets->addElement(fZWJSet,     status);
-    fSets->addElement(fGAZSet,     status);
+    fSets->addElement(fExtendedPictSet, status);
+    fSets->addElement(fEBGSet,     status);
+    fSets->addElement(fEmojiNRKSet,status);
      if (U_FAILURE(status)) {
          deferredStatus = status;
      }
@@ -2109,6 +2085,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
      int     breakPos = -1;
  
      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
+    UChar32 cBase;            // for (X Extend*) patterns, the X character.
  
      if (U_FAILURE(deferredStatus)) {
          return -1;
@@ -2120,7 +2097,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
      }
      p0 = p1 = p2 = p3 = prevPos;
      c3 =  fText->char32At(prevPos);
-    c0 = c1 = c2 = 0;
+    c0 = c1 = c2 = cBase = 0;
      (void)p0;   // suppress set but not used warning.
      (void)c0;
  
@@ -2189,21 +2166,11 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
              continue;
          }
  
-        // Rule (GB8a)    Regional_Indicator x Regional_Indicator
-        //                Note: The first if condition is a little tricky. We only need to force
-        //                      a break if there are three or more contiguous RIs. If there are
-        //                      only two, a break following will occur via other rules, and will include
-        //                      any trailing extend characters, which is needed behavior.
-        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) 
-                && fRegionalIndicatorSet->contains(c2)) {
-            break;
-        }
-        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
-            continue;
-        }
-
-        // Rule (GB9)    x Extend
+        // Rule (GB9)    x (Extend | ZWJ)
          if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
+            if (!fExtendSet->contains(c1)) {
+                cBase = c1;
+            }
              continue;
          }
  
@@ -2217,17 +2184,35 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
              continue;
          }
  
-        // Rule (GB9c)   Emoji_Base x Emoji_Modifier
-        if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
+        // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
+        if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
+            continue;
+        }
+        if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
+                fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
+            continue;
+        }
+
+        // Rule (GB11)   (Extended_Pict | EmojiNRK) ZWJ x (Extended_Pict | EmojiNRK)
+        if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
+                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
              continue;
          }
  
-        // Rule (GB9d)   ZWJ x Glue_After_Zwj
-        if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
+        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
+        //                   Note: The first if condition is a little tricky. We only need to force
+        //                      a break if there are three or more contiguous RIs. If there are
+        //                      only two, a break following will occur via other rules, and will include
+        //                      any trailing extend characters, which is needed behavior.
+        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) 
+                && fRegionalIndicatorSet->contains(c2)) {
+            break;
+        }
+        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
              continue;
          }
  
-        // Rule (GB10)  Any  <break>  Any
+        // Rule (GB999)  Any  <break>  Any
          break;
      }
  
@@ -2260,7 +2245,9 @@ RBBICharMonkey::~RBBICharMonkey() {
      delete fEmojiBaseSet;
      delete fEmojiModifierSet;
      delete fZWJSet;
-    delete fGAZSet;
+    delete fExtendedPictSet;
+    delete fEBGSet;
+    delete fEmojiNRKSet;
  }
  
  //------------------------------------------------------------------------------------------
@@ -2286,8 +2273,6 @@ private:
      UnicodeSet  *fKatakanaSet;
      UnicodeSet  *fHebrew_LetterSet;
      UnicodeSet  *fALetterSet;
-    // TODO(jungshik): Do we still need this change?
-    // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
      UnicodeSet  *fSingle_QuoteSet;
      UnicodeSet  *fDouble_QuoteSet;
      UnicodeSet  *fMidNumLetSet;
@@ -2298,11 +2283,13 @@ private:
      UnicodeSet  *fOtherSet;
      UnicodeSet  *fExtendSet;
      UnicodeSet  *fExtendNumLetSet;
-    UnicodeSet  *fDictionaryCjkSet;
+    UnicodeSet  *fDictionarySet;
      UnicodeSet  *fEBaseSet;
+    UnicodeSet  *fEBGSet;
      UnicodeSet  *fEModifierSet;
-    UnicodeSet  *fZWSSet;
-    UnicodeSet  *fGAZSet;
+    UnicodeSet  *fZWJSet;
+    UnicodeSet  *fExtendedPictSet;
+    UnicodeSet  *fEmojiNRKSet;
  
      const UnicodeString  *fText;
  };
@@ -2314,48 +2301,37 @@ RBBIWordMonkey::RBBIWordMonkey()
  
      fSets            = new UVector(status);
  
-    fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
-    fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
-    fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
-    fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
-    // Exclude Hangul syllables from ALetterSet during testing.
-    // Leave CJK dictionary characters out from the monkey tests!
-#if 0
-    fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
-                                      "[\\p{Line_Break = Complex_Context}"
-                                      "-\\p{Grapheme_Cluster_Break = Extend}"
-                                      "-\\p{Grapheme_Cluster_Break = Control}"
-                                      "]]",
-                                      status);
-#endif
-    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
+    fCRSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
+    fLFSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
+    fNewlineSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
      fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
+    fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
      fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
      fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
-    fALetterSet->removeAll(*fDictionaryCjkSet);
      fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
      fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
      fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
      fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
      fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
-    // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
-    // we should figure out why
      fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
      fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
      fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
      fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
  
-    fEBaseSet         = new UnicodeSet(UnicodeString(
-                "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
-                "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
-                "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
-                "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
+    fEBaseSet         = new UnicodeSet(UNICODE_STRING_SIMPLE(
+            "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+    fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"),          status);
+    fEModifierSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"),           status);
+    fZWJSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"),          status);
+    fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
+    fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
+            "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
  
-    fEModifierSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
-    fZWSSet          = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
-    fGAZSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8\\u2764]"), status);
-    fExtendSet->removeAll(*fZWSSet);
+    fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
+    fDictionarySet->addAll(*fKatakanaSet);
+    fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
  
+    fALetterSet->removeAll(*fDictionarySet);
  
      fOtherSet        = new UnicodeSet();
      if(U_FAILURE(status)) {
@@ -2380,13 +2356,14 @@ RBBIWordMonkey::RBBIWordMonkey()
      fOtherSet->removeAll(*fExtendSet);
      fOtherSet->removeAll(*fRegionalIndicatorSet);
      fOtherSet->removeAll(*fEBaseSet);
+    fOtherSet->removeAll(*fEBGSet);
      fOtherSet->removeAll(*fEModifierSet);
-    fOtherSet->removeAll(*fZWSSet);
-    fOtherSet->removeAll(*fGAZSet);
-    
+    fOtherSet->removeAll(*fZWJSet);
+    fOtherSet->removeAll(*fExtendedPictSet);
+    fOtherSet->removeAll(*fEmojiNRKSet);
+
      // Inhibit dictionary characters from being tested at all.
-    fOtherSet->removeAll(*fDictionaryCjkSet);
-    fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
+    fOtherSet->removeAll(*fDictionarySet);
  
      fSets->addElement(fCRSet,                status);
      fSets->addElement(fLFSet,                status);
@@ -2396,7 +2373,9 @@ RBBIWordMonkey::RBBIWordMonkey()
      fSets->addElement(fALetterSet,           status);
      fSets->addElement(fSingle_QuoteSet,      status);
      fSets->addElement(fDouble_QuoteSet,      status);
-    //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
+    //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
+                                                        // from the test data. They are all in the dictionary set,
+                                                        // which this (old, to be retired) monkey test cannot handle.
      fSets->addElement(fMidLetterSet,         status);
      fSets->addElement(fMidNumLetSet,         status);
      fSets->addElement(fMidNumSet,            status);
@@ -2407,9 +2386,11 @@ RBBIWordMonkey::RBBIWordMonkey()
      fSets->addElement(fExtendNumLetSet,      status);
  
      fSets->addElement(fEBaseSet,             status);
+    fSets->addElement(fEBGSet,               status);
      fSets->addElement(fEModifierSet,         status);
-    fSets->addElement(fZWSSet,               status);
-    fSets->addElement(fGAZSet,               status);
+    fSets->addElement(fZWJSet,               status);
+    fSets->addElement(fExtendedPictSet,      status);
+    fSets->addElement(fEmojiNRKSet,          status);
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -2459,7 +2440,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
                 break;
              };
          }
-        while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
+        while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
  
  
          if (p1 == p2) {
@@ -2488,12 +2469,12 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
              break;
          };
  
-        // Rule (3c)    ZWJ x GAZ (Glue after ZWJ).
+        // Rule (3c)    ZWJ x (Extended_Pict | EmojiNRK).
          //              Not ignoring extend chars, so peek into input text to
          //              get the potential ZWJ, the character immediately preceding c2.
          //              Sloppy UChar32 indexing: p2-1 may reference trail half
          //              but char32At will get the full code point.
-        if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
+        if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
              continue;
          }
  
@@ -2566,6 +2547,8 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
          }
  
          // Rule (13)  Katakana x Katakana
+        //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
+        //                  all Katakana are handled by the dictionary breaker.
          if (fKatakanaSet->contains(c1) &&
              fKatakanaSet->contains(c2))  {
              continue;
@@ -2585,7 +2568,12 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
              continue;
          }
  
-        // Rule 13c
+        // WB 14  (E_Base | EBG) x E_Modifier
+        if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
+            continue;
+        }
+
+        // Rule 15 - 17   Group pairs of Regional Indicators.
          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
              break;
          }
@@ -2593,12 +2581,7 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
              continue;
          }
  
-        // Rule 13d
-        if ((fEBaseSet->contains(c1)  || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
-            continue;
-        }
-
-        // Rule 14.  Break found here.
+        // Rule 999.  Break found here.
          break;
      }
  
@@ -2630,12 +2613,14 @@ RBBIWordMonkey::~RBBIWordMonkey() {
      delete fExtendSet;
      delete fExtendNumLetSet;
      delete fRegionalIndicatorSet;
-    delete fDictionaryCjkSet;
+    delete fDictionarySet;
      delete fOtherSet;
      delete fEBaseSet;
+    delete fEBGSet;
      delete fEModifierSet;
-    delete fZWSSet;
-    delete fGAZSet;
+    delete fZWJSet;
+    delete fExtendedPictSet;
+    delete fEmojiNRKSet;
  }
  
  
@@ -3025,6 +3010,8 @@ private:
      UnicodeSet  *fEB;
      UnicodeSet  *fEM;
      UnicodeSet  *fZJ;
+    UnicodeSet  *fExtendedPict;
+    UnicodeSet  *fEmojiNRK;
  
      BreakIterator        *fCharBI;
      const UnicodeString  *fText;
@@ -3087,13 +3074,12 @@ RBBILineMonkey::RBBILineMonkey() :
      fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
      fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
      fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
-    fEB    = new UnicodeSet(UnicodeString(
-                "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
-                "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
-                "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
-                "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
-    fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
-    fZJ    = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
+    fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE(
+            "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
+    fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
+    fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
+    fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
+    fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
  
      if (U_FAILURE(status)) {
          deferredStatus = status;
@@ -3105,14 +3091,7 @@ RBBILineMonkey::RBBILineMonkey() :
      fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
  
      fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
-
-    fID->addAll(*fEB);     // Emoji Base and Emoji Modifier behave as ID.
-    fID->addAll(*fEM);
-    fAL->removeAll(*fEM);
-
-
-    fAL->remove((UChar32)0x2764);   // Emoji Proposal: move u2764 from Al to Id
-    fID->add((UChar32)0x2764);
+    fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
  
      fSets->addElement(fBK, status);
      fSets->addElement(fCR, status);
@@ -3155,14 +3134,17 @@ RBBILineMonkey::RBBILineMonkey() :
      fSets->addElement(fEB, status);
      fSets->addElement(fEM, status);
      fSets->addElement(fZJ, status);
+    fSets->addElement(fExtendedPict, status);
+    fSets->addElement(fEmojiNRK, status);
+
  
      const char *rules =
-            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
-            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
-            "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
-            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
-            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
-            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
+            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
+            "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
+            "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
+            "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
+            "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
+            "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
  
      fNumberMatcher = new RegexMatcher(
          UnicodeString(rules, -1, US_INV), 0, status);
@@ -3341,14 +3323,14 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
              break;
          }
  
-        // LB 8a ZJ x ID
+        // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
          //       The monkey test's way of ignoring combining characters doesn't work
          //       for this rule. ZJ is also a CM. Need to get the actual character
          //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
          {
              int32_t prevIdx = fText->moveIndex32(pos, -1);
              UChar32 prevC = fText->char32At(prevIdx);
-            if (fZJ->contains(prevC) && fID->contains(thisChar)) {
+            if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
                  continue;
              }
          }
@@ -3508,32 +3490,43 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
          if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
              (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
              (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
-            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
+            ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
              (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
              (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
              continue;
          }
  
  
-        // LB 23    ID x PO
-        //          AL x NU
-        //          HL x NU
-        //          NU x AL
-        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
-            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
-            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
-            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
-            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
+        // LB 23    (AL | HL) x NU
+        //          NU x (AL | HL)
+        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
+            continue;
+        }
+        if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+            continue;
+        }
+
+        // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
+        //      PR x (ID | EB | EM)
+        //     (ID | EB | EM) x PO
+        if (fPR->contains(prevChar) && 
+                (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
+            continue;
+        }
+        if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && 
+                fPO->contains(thisChar)) {
              continue;
          }
  
          // LB 24  Do not break between prefix and letters or ideographs.
-        //        PR x ID
-        //        PR x (AL | HL)
-        //        PO x (AL | HL)
-        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
-            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
-            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
+        //         (PR | PO) x (AL | HL)
+        //         (AL | HL) x (PR | PO)
+        if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
+                (fAL->contains(thisChar) || fHL->contains(thisChar))) {
+            continue;
+        }
+        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
+                (fPR->contains(thisChar) || fPO->contains(thisChar))) {
              continue;
          }
  
@@ -3693,6 +3686,8 @@ RBBILineMonkey::~RBBILineMonkey() {
      delete fEB;
      delete fEM;
      delete fZJ;
+    delete fExtendedPict;
+    delete fEmojiNRK;
  
      delete fCharBI;
      delete fNumberMatcher;
@@ -4126,7 +4121,7 @@ void RBBITest::TestSentBreaks(void)
  #endif
  }
  
-void RBBITest::TestMonkey(char *params) {
+void RBBITest::TestMonkey() {
  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  
      UErrorCode     status    = U_ZERO_ERROR;
@@ -4140,8 +4135,8 @@ void RBBITest::TestMonkey(char *params) {
          loopCount = 10000;
      }
  
-    if (params) {
-        UnicodeString p(params);
+    if (fTestParams) {
+        UnicodeString p(fTestParams);
          loopCount = getIntParam("loop", p, loopCount);
          seed      = getIntParam("seed", p, seed);
  
@@ -4623,6 +4618,40 @@ void RBBITest::TestBug9983(void)  {
      TEST_ASSERT(iterationCount == 6);
  }
  
+// Bug 7547 - verify that building a break iterator from empty rules produces an error.
+//            Expects status U_BRK_RULE_SYNTAX and a parse error position of (line 1, offset 0).
+void RBBITest::TestBug7547() {
+    UnicodeString rules;
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+    RuleBasedBreakIterator breakIterator(rules, parseError, status);
+    if (status != U_BRK_RULE_SYNTAX) {
+        errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
+    }
+    if (parseError.line != 1 || parseError.offset != 0) {
+        errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
+    }
+}
+
+// Bug 12797 - with chained rules using a variable ($v = b c), the first boundary in "abc" must be 3.
+void RBBITest::TestBug12797() {
+    UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+    RuleBasedBreakIterator bi(rules, parseError, status);
+    if (U_FAILURE(status)) {
+        errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
+        return;
+    }
+    UnicodeString text = "abc";
+    bi.setText(text);
+    bi.first();
+    int32_t boundary = bi.next();
+    if (boundary != 3) {
+        errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
+    }
+}
+
  
  //
  //  TestDebug    -  A place-holder test for debugging purposes.
@@ -4630,29 +4659,7 @@ void RBBITest::TestBug9983(void)  {
  //                  for tracing  without a lot of unwanted extra stuff happening.
  //
  void RBBITest::TestDebug(void) {
-#if 0
-    UErrorCode   status = U_ZERO_ERROR;
-    int pos = 0;
-    int ruleStatus = 0;
-
-    RuleBasedBreakIterator* bi =
-       // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
-       // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
-       (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
-    UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
-    // UnicodeString s("Aaa.  Bcd");
-    s = s.unescape();
-    bi->setText(s);
-    UBool r = bi->isBoundary(8);
-    printf("%s", r?"true":"false");
-    return;
-    pos = bi->last();
-    do {
-        // ruleStatus = bi->getRuleStatus();
-        printf("%d\t%d\n", pos, ruleStatus);
-        pos = bi->previous();
-    } while (pos != BreakIterator::DONE);
-#endif
+    // Intentionally empty: place-holder body for ad-hoc debugging code.
  }
  
  void RBBITest::TestProperties() {