source/common/dictbe.cpp

   1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /**
   4  *******************************************************************************
   5  * Copyright (C) 2006-2016, International Business Machines Corporation
   6  * and others. All Rights Reserved.
   7  *******************************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_BREAK_ITERATION
  13
  14 #include "brkeng.h"
  15 #include "dictbe.h"
  16 #include "unicode/uniset.h"
  17 #include "unicode/chariter.h"
  18 #include "unicode/ubrk.h"
  19 #include "uvectr32.h"
  20 #include "uvector.h"
  21 #include "uassert.h"
  22 #include "unicode/normlzr.h"
  23 #include "cmemory.h"
  24 #include "dictionarydata.h"
  25
  26 U_NAMESPACE_BEGIN
  27
  28 /*
  29  ******************************************************************
  30  */
  31
  32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
  33     fTypes = breakTypes;
  34 }
  35
  36 DictionaryBreakEngine::~DictionaryBreakEngine() {
  37 }
  38
  39 UBool
  40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
  41     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
  42             && fSet.contains(c));
  43 }
  44
  45 int32_t
  46 DictionaryBreakEngine::findBreaks( UText *text,
  47                                  int32_t startPos,
  48                                  int32_t endPos,
  49                                  UBool reverse,
  50                                  int32_t breakType,
  51                                  UStack &foundBreaks ) const {
  52     int32_t result = 0;
  53
  54     // Find the span of characters included in the set.
  55     //   The span to break begins at the current position in the text, and
  56     //   extends towards the start or end of the text, depending on 'reverse'.
  57
  58     int32_t start = (int32_t)utext_getNativeIndex(text);
  59     int32_t current;
  60     int32_t rangeStart;
  61     int32_t rangeEnd;
  62     UChar32 c = utext_current32(text);
  63     if (reverse) {
  64         UBool   isDict = fSet.contains(c);
  65         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
  66             c = utext_previous32(text);
  67             isDict = fSet.contains(c);
  68         }
  69         if (current < startPos) {
  70             rangeStart = startPos;
  71         } else {
  72             rangeStart = current;
  73             if (!isDict) {
  74                 utext_next32(text);
  75                 rangeStart = (int32_t)utext_getNativeIndex(text);
  76             }
  77         }
  78         // rangeEnd = start + 1;
  79         utext_setNativeIndex(text, start);
  80         utext_next32(text);
  81         rangeEnd = (int32_t)utext_getNativeIndex(text);
  82     }
  83     else {
  84         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
  85             utext_next32(text);         // TODO:  recast loop for postincrement
  86             c = utext_current32(text);
  87         }
  88         rangeStart = start;
  89         rangeEnd = current;
  90     }
  91     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
  92         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
  93         utext_setNativeIndex(text, current);
  94     }
  95
  96     return result;
  97 }
  98
  99 void
 100 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
 101     fSet = set;
 102     // Compact for caching
 103     fSet.compact();
 104 }
 105
 106 /*
 107  ******************************************************************
 108  * PossibleWord
 109  */
 110
 111 // Helper class for improving readability of the Thai/Lao/Khmer word break
 112 // algorithm. The implementation is completely inline.
 113
 114 // List size, limited by the maximum number of words in the dictionary
 115 // that form a nested sequence.
 116 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
 117
 118 class PossibleWord {
 119 private:
 120     // list of word candidate lengths, in increasing length order
 121     // TODO: bytes would be sufficient for word lengths.
 122     int32_t   count;      // Count of candidates
 123     int32_t   prefix;     // The longest match with a dictionary word
 124     int32_t   offset;     // Offset in the text of these candidates
 125     int32_t   mark;       // The preferred candidate's offset
 126     int32_t   current;    // The candidate we're currently looking at
 127     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
 128     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
 129
 130 public:
 131     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
 132     ~PossibleWord() {};
 133
 134     // Fill the list of candidates if needed, select the longest, and return the number found
 135     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
 136
 137     // Select the currently marked candidate, point after it in the text, and invalidate self
 138     int32_t   acceptMarked( UText *text );
 139
 140     // Back up from the current candidate to the next shorter one; return TRUE if that exists
 141     // and point the text after it
 142     UBool     backUp( UText *text );
 143
 144     // Return the longest prefix this candidate location shares with a dictionary word
 145     // Return value is in code points.
 146     int32_t   longestPrefix() { return prefix; };
 147
 148     // Mark the current candidate as the one we like
 149     void      markCurrent() { mark = current; };
 150
 151     // Get length in code points of the marked word.
 152     int32_t   markedCPLength() { return cpLengths[mark]; };
 153 };
 154
 155
 156 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
 157     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
 158     int32_t start = (int32_t)utext_getNativeIndex(text);
 159     if (start != offset) {
 160         offset = start;
 161         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
 162         // Dictionary leaves text after longest prefix, not longest word. Back up.
 163         if (count <= 0) {
 164             utext_setNativeIndex(text, start);
 165         }
 166     }
 167     if (count > 0) {
 168         utext_setNativeIndex(text, start+cuLengths[count-1]);
 169     }
 170     current = count-1;
 171     mark = current;
 172     return count;
 173 }
 174
 175 int32_t
 176 PossibleWord::acceptMarked( UText *text ) {
 177     utext_setNativeIndex(text, offset + cuLengths[mark]);
 178     return cuLengths[mark];
 179 }
 180
 181
 182 UBool
 183 PossibleWord::backUp( UText *text ) {
 184     if (current > 0) {
 185         utext_setNativeIndex(text, offset + cuLengths[--current]);
 186         return TRUE;
 187     }
 188     return FALSE;
 189 }
 190
 191 /*
 192  ******************************************************************
 193  * ThaiBreakEngine
 194  */
 195
 196 // How many words in a row are "good enough"?
 197 static const int32_t THAI_LOOKAHEAD = 3;
 198
 199 // Will not combine a non-word with a preceding dictionary word longer than this
 200 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
 201
 202 // Will not combine a non-word that shares at least this much prefix with a
 203 // dictionary word, with a preceding word
 204 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
 205
 206 // Ellision character
 207 static const int32_t THAI_PAIYANNOI = 0x0E2F;
 208
 209 // Repeat character
 210 static const int32_t THAI_MAIYAMOK = 0x0E46;
 211
 212 // Minimum word size
 213 static const int32_t THAI_MIN_WORD = 2;
 214
 215 // Minimum number of characters for two words
 216 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
 217
 218 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 219     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 220       fDictionary(adoptDictionary)
 221 {
 222     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
 223     if (U_SUCCESS(status)) {
 224         setCharacters(fThaiWordSet);
 225     }
 226     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
 227     fMarkSet.add(0x0020);
 228     fEndWordSet = fThaiWordSet;
 229     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 230     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 231     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 232     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 233     fSuffixSet.add(THAI_PAIYANNOI);
 234     fSuffixSet.add(THAI_MAIYAMOK);
 235
 236     // Compact for caching.
 237     fMarkSet.compact();
 238     fEndWordSet.compact();
 239     fBeginWordSet.compact();
 240     fSuffixSet.compact();
 241 }
 242
 243 ThaiBreakEngine::~ThaiBreakEngine() {
 244     delete fDictionary;
 245 }
 246
 247 int32_t
 248 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
 249                                                 int32_t rangeStart,
 250                                                 int32_t rangeEnd,
 251                                                 UStack &foundBreaks ) const {
 252     utext_setNativeIndex(text, rangeStart);
 253     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
 254     if (utext_getNativeIndex(text) >= rangeEnd) {
 255         return 0;       // Not enough characters for two words
 256     }
 257     utext_setNativeIndex(text, rangeStart);
 258
 259
 260     uint32_t wordsFound = 0;
 261     int32_t cpWordLength = 0;    // Word Length in Code Points.
 262     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
 263     int32_t current;
 264     UErrorCode status = U_ZERO_ERROR;
 265     PossibleWord words[THAI_LOOKAHEAD];
 266
 267     utext_setNativeIndex(text, rangeStart);
 268
 269     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 270         cpWordLength = 0;
 271         cuWordLength = 0;
 272
 273         // Look for candidate words at the current position
 274         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 275
 276         // If we found exactly one, use that
 277         if (candidates == 1) {
 278             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 279             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 280             wordsFound += 1;
 281         }
 282         // If there was more than one, see which one can take us forward the most words
 283         else if (candidates > 1) {
 284             // If we're already at the end of the range, we're done
 285             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 286                 goto foundBest;
 287             }
 288             do {
 289                 int32_t wordsMatched = 1;
 290                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 291                     if (wordsMatched < 2) {
 292                         // Followed by another dictionary word; mark first word as a good candidate
 293                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
 294                         wordsMatched = 2;
 295                     }
 296
 297                     // If we're already at the end of the range, we're done
 298                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 299                         goto foundBest;
 300                     }
 301
 302                     // See if any of the possible second words is followed by a third word
 303                     do {
 304                         // If we find a third word, stop right away
 305                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 306                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
 307                             goto foundBest;
 308                         }
 309                     }
 310                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
 311                 }
 312             }
 313             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
 314 foundBest:
 315             // Set UText position to after the accepted word.
 316             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
 317             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
 318             wordsFound += 1;
 319         }
 320
 321         // We come here after having either found a word or not. We look ahead to the
 322         // next word. If it's not a dictionary word, we will combine it with the word we
 323         // just found (if there is one), but only if the preceding word does not exceed
 324         // the threshold.
 325         // The text iterator should now be positioned at the end of the word we found.
 326
 327         UChar32 uc = 0;
 328         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
 329             // if it is a dictionary word, do nothing. If it isn't, then if there is
 330             // no preceding word, or the non-word shares less than the minimum threshold
 331             // of characters with a dictionary word, then scan to resynchronize
 332             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 333                   && (cuWordLength == 0
 334                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
 335                 // Look for a plausible word boundary
 336                 int32_t remaining = rangeEnd - (current+cuWordLength);
 337                 UChar32 pc;
 338                 int32_t chars = 0;
 339                 for (;;) {
 340                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 341                     pc = utext_next32(text);
 342                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 343                     chars += pcSize;
 344                     remaining -= pcSize;
 345                     if (remaining <= 0) {
 346                         break;
 347                     }
 348                     uc = utext_current32(text);
 349                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 350                         // Maybe. See if it's in the dictionary.
 351                         // NOTE: In the original Apple code, checked that the next
 352                         // two characters after uc were not 0x0E4C THANTHAKHAT before
 353                         // checking the dictionary. That is just a performance filter,
 354                         // but it's not clear it's faster than checking the trie.
 355                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 356                         utext_setNativeIndex(text, current + cuWordLength + chars);
 357                         if (candidates > 0) {
 358                             break;
 359                         }
 360                     }
 361                 }
 362
 363                 // Bump the word count if there wasn't already one
 364                 if (cuWordLength <= 0) {
 365                     wordsFound += 1;
 366                 }
 367
 368                 // Update the length with the passed-over characters
 369                 cuWordLength += chars;
 370             }
 371             else {
 372                 // Back up to where we were for next iteration
 373                 utext_setNativeIndex(text, current+cuWordLength);
 374             }
 375         }
 376
 377         // Never stop before a combining mark.
 378         int32_t currPos;
 379         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 380             utext_next32(text);
 381             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 382         }
 383
 384         // Look ahead for possible suffixes if a dictionary word does not follow.
 385         // We do this in code rather than using a rule so that the heuristic
 386         // resynch continues to function. For example, one of the suffix characters
 387         // could be a typo in the middle of a word.
 388         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
 389             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 390                 && fSuffixSet.contains(uc = utext_current32(text))) {
 391                 if (uc == THAI_PAIYANNOI) {
 392                     if (!fSuffixSet.contains(utext_previous32(text))) {
 393                         // Skip over previous end and PAIYANNOI
 394                         utext_next32(text);
 395                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
 396                         utext_next32(text);
 397                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
 398                         uc = utext_current32(text);     // Fetch next character
 399                     }
 400                     else {
 401                         // Restore prior position
 402                         utext_next32(text);
 403                     }
 404                 }
 405                 if (uc == THAI_MAIYAMOK) {
 406                     if (utext_previous32(text) != THAI_MAIYAMOK) {
 407                         // Skip over previous end and MAIYAMOK
 408                         utext_next32(text);
 409                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
 410                         utext_next32(text);
 411                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
 412                     }
 413                     else {
 414                         // Restore prior position
 415                         utext_next32(text);
 416                     }
 417                 }
 418             }
 419             else {
 420                 utext_setNativeIndex(text, current+cuWordLength);
 421             }
 422         }
 423
 424         // Did we find a word on this iteration? If so, push it on the break stack
 425         if (cuWordLength > 0) {
 426             foundBreaks.push((current+cuWordLength), status);
 427         }
 428     }
 429
 430     // Don't return a break for the end of the dictionary range if there is one there.
 431     if (foundBreaks.peeki() >= rangeEnd) {
 432         (void) foundBreaks.popi();
 433         wordsFound -= 1;
 434     }
 435
 436     return wordsFound;
 437 }
 438
 439 /*
 440  ******************************************************************
 441  * LaoBreakEngine
 442  */
 443
 444 // How many words in a row are "good enough"?
 445 static const int32_t LAO_LOOKAHEAD = 3;
 446
 447 // Will not combine a non-word with a preceding dictionary word longer than this
 448 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
 449
 450 // Will not combine a non-word that shares at least this much prefix with a
 451 // dictionary word, with a preceding word
 452 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
 453
 454 // Minimum word size
 455 static const int32_t LAO_MIN_WORD = 2;
 456
 457 // Minimum number of characters for two words
 458 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
 459
 460 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 461     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 462       fDictionary(adoptDictionary)
 463 {
 464     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
 465     if (U_SUCCESS(status)) {
 466         setCharacters(fLaoWordSet);
 467     }
 468     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
 469     fMarkSet.add(0x0020);
 470     fEndWordSet = fLaoWordSet;
 471     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
 472     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
 473     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
 474     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
 475
 476     // Compact for caching.
 477     fMarkSet.compact();
 478     fEndWordSet.compact();
 479     fBeginWordSet.compact();
 480 }
 481
 482 LaoBreakEngine::~LaoBreakEngine() {
 483     delete fDictionary;
 484 }
 485
 486 int32_t
 487 LaoBreakEngine::divideUpDictionaryRange( UText *text,
 488                                                 int32_t rangeStart,
 489                                                 int32_t rangeEnd,
 490                                                 UStack &foundBreaks ) const {
 491     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
 492         return 0;       // Not enough characters for two words
 493     }
 494
 495     uint32_t wordsFound = 0;
 496     int32_t cpWordLength = 0;
 497     int32_t cuWordLength = 0;
 498     int32_t current;
 499     UErrorCode status = U_ZERO_ERROR;
 500     PossibleWord words[LAO_LOOKAHEAD];
 501
 502     utext_setNativeIndex(text, rangeStart);
 503
 504     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 505         cuWordLength = 0;
 506         cpWordLength = 0;
 507
 508         // Look for candidate words at the current position
 509         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 510
 511         // If we found exactly one, use that
 512         if (candidates == 1) {
 513             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 514             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 515             wordsFound += 1;
 516         }
 517         // If there was more than one, see which one can take us forward the most words
 518         else if (candidates > 1) {
 519             // If we're already at the end of the range, we're done
 520             if (utext_getNativeIndex(text) >= rangeEnd) {
 521                 goto foundBest;
 522             }
 523             do {
 524                 int32_t wordsMatched = 1;
 525                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 526                     if (wordsMatched < 2) {
 527                         // Followed by another dictionary word; mark first word as a good candidate
 528                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
 529                         wordsMatched = 2;
 530                     }
 531
 532                     // If we're already at the end of the range, we're done
 533                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 534                         goto foundBest;
 535                     }
 536
 537                     // See if any of the possible second words is followed by a third word
 538                     do {
 539                         // If we find a third word, stop right away
 540                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 541                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
 542                             goto foundBest;
 543                         }
 544                     }
 545                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
 546                 }
 547             }
 548             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
 549 foundBest:
 550             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
 551             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
 552             wordsFound += 1;
 553         }
 554
 555         // We come here after having either found a word or not. We look ahead to the
 556         // next word. If it's not a dictionary word, we will combine it withe the word we
 557         // just found (if there is one), but only if the preceding word does not exceed
 558         // the threshold.
 559         // The text iterator should now be positioned at the end of the word we found.
 560         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
 561             // if it is a dictionary word, do nothing. If it isn't, then if there is
 562             // no preceding word, or the non-word shares less than the minimum threshold
 563             // of characters with a dictionary word, then scan to resynchronize
 564             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 565                   && (cuWordLength == 0
 566                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
 567                 // Look for a plausible word boundary
 568                 int32_t remaining = rangeEnd - (current + cuWordLength);
 569                 UChar32 pc;
 570                 UChar32 uc;
 571                 int32_t chars = 0;
 572                 for (;;) {
 573                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 574                     pc = utext_next32(text);
 575                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 576                     chars += pcSize;
 577                     remaining -= pcSize;
 578                     if (remaining <= 0) {
 579                         break;
 580                     }
 581                     uc = utext_current32(text);
 582                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 583                         // Maybe. See if it's in the dictionary.
 584                         // TODO: this looks iffy; compare with old code.
 585                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 586                         utext_setNativeIndex(text, current + cuWordLength + chars);
 587                         if (candidates > 0) {
 588                             break;
 589                         }
 590                     }
 591                 }
 592
 593                 // Bump the word count if there wasn't already one
 594                 if (cuWordLength <= 0) {
 595                     wordsFound += 1;
 596                 }
 597
 598                 // Update the length with the passed-over characters
 599                 cuWordLength += chars;
 600             }
 601             else {
 602                 // Back up to where we were for next iteration
 603                 utext_setNativeIndex(text, current + cuWordLength);
 604             }
 605         }
 606
 607         // Never stop before a combining mark.
 608         int32_t currPos;
 609         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 610             utext_next32(text);
 611             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 612         }
 613
 614         // Look ahead for possible suffixes if a dictionary word does not follow.
 615         // We do this in code rather than using a rule so that the heuristic
 616         // resynch continues to function. For example, one of the suffix characters
 617         // could be a typo in the middle of a word.
 618         // NOT CURRENTLY APPLICABLE TO LAO
 619
 620         // Did we find a word on this iteration? If so, push it on the break stack
 621         if (cuWordLength > 0) {
 622             foundBreaks.push((current+cuWordLength), status);
 623         }
 624     }
 625
 626     // Don't return a break for the end of the dictionary range if there is one there.
 627     if (foundBreaks.peeki() >= rangeEnd) {
 628         (void) foundBreaks.popi();
 629         wordsFound -= 1;
 630     }
 631
 632     return wordsFound;
 633 }
 634
 635 /*
 636  ******************************************************************
 637  * BurmeseBreakEngine
 638  */
 639
 640 // How many words in a row are "good enough"?
 641 static const int32_t BURMESE_LOOKAHEAD = 3;
 642
 643 // Will not combine a non-word with a preceding dictionary word longer than this
 644 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
 645
 646 // Will not combine a non-word that shares at least this much prefix with a
 647 // dictionary word, with a preceding word
 648 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
 649
 650 // Minimum word size
 651 static const int32_t BURMESE_MIN_WORD = 2;
 652
 653 // Minimum number of characters for two words
 654 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
 655
 656 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 657     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
 658       fDictionary(adoptDictionary)
 659 {
 660     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
 661     if (U_SUCCESS(status)) {
 662         setCharacters(fBurmeseWordSet);
 663     }
 664     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
 665     fMarkSet.add(0x0020);
 666     fEndWordSet = fBurmeseWordSet;
 667     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
 668
 669     // Compact for caching.
 670     fMarkSet.compact();
 671     fEndWordSet.compact();
 672     fBeginWordSet.compact();
 673 }
 674
 675 BurmeseBreakEngine::~BurmeseBreakEngine() {
 676     delete fDictionary;
 677 }
 678
 679 int32_t
 680 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
 681                                                 int32_t rangeStart,
 682                                                 int32_t rangeEnd,
 683                                                 UStack &foundBreaks ) const {
 684     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
 685         return 0;       // Not enough characters for two words
 686     }
 687
 688     uint32_t wordsFound = 0;
 689     int32_t cpWordLength = 0;
 690     int32_t cuWordLength = 0;
 691     int32_t current;
 692     UErrorCode status = U_ZERO_ERROR;
 693     PossibleWord words[BURMESE_LOOKAHEAD];
 694
 695     utext_setNativeIndex(text, rangeStart);
 696
 697     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 698         cuWordLength = 0;
 699         cpWordLength = 0;
 700
 701         // Look for candidate words at the current position
 702         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 703
 704         // If we found exactly one, use that
 705         if (candidates == 1) {
 706             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 707             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 708             wordsFound += 1;
 709         }
 710         // If there was more than one, see which one can take us forward the most words
 711         else if (candidates > 1) {
 712             // If we're already at the end of the range, we're done
 713             if (utext_getNativeIndex(text) >= rangeEnd) {
 714                 goto foundBest;
 715             }
 716             do {
 717                 int32_t wordsMatched = 1;
 718                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 719                     if (wordsMatched < 2) {
 720                         // Followed by another dictionary word; mark first word as a good candidate
 721                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
 722                         wordsMatched = 2;
 723                     }
 724
 725                     // If we're already at the end of the range, we're done
 726                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 727                         goto foundBest;
 728                     }
 729
 730                     // See if any of the possible second words is followed by a third word
 731                     do {
 732                         // If we find a third word, stop right away
 733                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 734                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
 735                             goto foundBest;
 736                         }
 737                     }
 738                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
 739                 }
 740             }
 741             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
 742 foundBest:
 743             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
 744             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
 745             wordsFound += 1;
 746         }
 747
 748         // We come here after having either found a word or not. We look ahead to the
 749         // next word. If it's not a dictionary word, we will combine it withe the word we
 750         // just found (if there is one), but only if the preceding word does not exceed
 751         // the threshold.
 752         // The text iterator should now be positioned at the end of the word we found.
 753         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
 754             // if it is a dictionary word, do nothing. If it isn't, then if there is
 755             // no preceding word, or the non-word shares less than the minimum threshold
 756             // of characters with a dictionary word, then scan to resynchronize
 757             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 758                   && (cuWordLength == 0
 759                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
 760                 // Look for a plausible word boundary
 761                 int32_t remaining = rangeEnd - (current + cuWordLength);
 762                 UChar32 pc;
 763                 UChar32 uc;
 764                 int32_t chars = 0;
 765                 for (;;) {
 766                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 767                     pc = utext_next32(text);
 768                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 769                     chars += pcSize;
 770                     remaining -= pcSize;
 771                     if (remaining <= 0) {
 772                         break;
 773                     }
 774                     uc = utext_current32(text);
 775                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 776                         // Maybe. See if it's in the dictionary.
 777                         // TODO: this looks iffy; compare with old code.
 778                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 779                         utext_setNativeIndex(text, current + cuWordLength + chars);
 780                         if (candidates > 0) {
 781                             break;
 782                         }
 783                     }
 784                 }
 785
 786                 // Bump the word count if there wasn't already one
 787                 if (cuWordLength <= 0) {
 788                     wordsFound += 1;
 789                 }
 790
 791                 // Update the length with the passed-over characters
 792                 cuWordLength += chars;
 793             }
 794             else {
 795                 // Back up to where we were for next iteration
 796                 utext_setNativeIndex(text, current + cuWordLength);
 797             }
 798         }
 799
 800         // Never stop before a combining mark.
 801         int32_t currPos;
 802         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
 803             utext_next32(text);
 804             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
 805         }
 806
 807         // Look ahead for possible suffixes if a dictionary word does not follow.
 808         // We do this in code rather than using a rule so that the heuristic
 809         // resynch continues to function. For example, one of the suffix characters
 810         // could be a typo in the middle of a word.
 811         // NOT CURRENTLY APPLICABLE TO BURMESE
 812
 813         // Did we find a word on this iteration? If so, push it on the break stack
 814         if (cuWordLength > 0) {
 815             foundBreaks.push((current+cuWordLength), status);
 816         }
 817     }
 818
 819     // Don't return a break for the end of the dictionary range if there is one there.
 820     if (foundBreaks.peeki() >= rangeEnd) {
 821         (void) foundBreaks.popi();
 822         wordsFound -= 1;
 823     }
 824
 825     return wordsFound;
 826 }
 827
 828 /*
 829  ******************************************************************
 830  * KhmerBreakEngine
 831  */
 832
 833 // How many words in a row are "good enough"?
 834 static const int32_t KHMER_LOOKAHEAD = 3;
 835
 836 // Will not combine a non-word with a preceding dictionary word longer than this
 837 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
 838
 839 // Will not combine a non-word that shares at least this much prefix with a
 840 // dictionary word, with a preceding word
 841 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
 842
 843 // Minimum word size
 844 static const int32_t KHMER_MIN_WORD = 2;
 845
 846 // Minimum number of characters for two words
 847 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
 848
 849 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
 850     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
 851       fDictionary(adoptDictionary)
 852 {
 853     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
 854     if (U_SUCCESS(status)) {
 855         setCharacters(fKhmerWordSet);
 856     }
 857     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
 858     fMarkSet.add(0x0020);
 859     fEndWordSet = fKhmerWordSet;
 860     fBeginWordSet.add(0x1780, 0x17B3);
 861     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
 862     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
 863     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
 864     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
 865     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
 866 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
 867 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
 868 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
 869 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
 870 //    fSuffixSet.add(THAI_PAIYANNOI);
 871 //    fSuffixSet.add(THAI_MAIYAMOK);
 872
 873     // Compact for caching.
 874     fMarkSet.compact();
 875     fEndWordSet.compact();
 876     fBeginWordSet.compact();
 877 //    fSuffixSet.compact();
 878 }
 879
 880 KhmerBreakEngine::~KhmerBreakEngine() {
 881     delete fDictionary;
 882 }
 883
 884 int32_t
 885 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
 886                                                 int32_t rangeStart,
 887                                                 int32_t rangeEnd,
 888                                                 UStack &foundBreaks ) const {
 889     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
 890         return 0;       // Not enough characters for two words
 891     }
 892
 893     uint32_t wordsFound = 0;
 894     int32_t cpWordLength = 0;
 895     int32_t cuWordLength = 0;
 896     int32_t current;
 897     UErrorCode status = U_ZERO_ERROR;
 898     PossibleWord words[KHMER_LOOKAHEAD];
 899
 900     utext_setNativeIndex(text, rangeStart);
 901
 902     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
 903         cuWordLength = 0;
 904         cpWordLength = 0;
 905
 906         // Look for candidate words at the current position
 907         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 908
 909         // If we found exactly one, use that
 910         if (candidates == 1) {
 911             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 912             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 913             wordsFound += 1;
 914         }
 915
 916         // If there was more than one, see which one can take us forward the most words
 917         else if (candidates > 1) {
 918             // If we're already at the end of the range, we're done
 919             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 920                 goto foundBest;
 921             }
 922             do {
 923                 int32_t wordsMatched = 1;
 924                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
 925                     if (wordsMatched < 2) {
 926                         // Followed by another dictionary word; mark first word as a good candidate
 927                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 928                         wordsMatched = 2;
 929                     }
 930
 931                     // If we're already at the end of the range, we're done
 932                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
 933                         goto foundBest;
 934                     }
 935
 936                     // See if any of the possible second words is followed by a third word
 937                     do {
 938                         // If we find a third word, stop right away
 939                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
 940                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
 941                             goto foundBest;
 942                         }
 943                     }
 944                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
 945                 }
 946             }
 947             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
 948 foundBest:
 949             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
 950             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
 951             wordsFound += 1;
 952         }
 953
 954         // We come here after having either found a word or not. We look ahead to the
 955         // next word. If it's not a dictionary word, we will combine it with the word we
 956         // just found (if there is one), but only if the preceding word does not exceed
 957         // the threshold.
 958         // The text iterator should now be positioned at the end of the word we found.
 959         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
 960             // if it is a dictionary word, do nothing. If it isn't, then if there is
 961             // no preceding word, or the non-word shares less than the minimum threshold
 962             // of characters with a dictionary word, then scan to resynchronize
 963             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
 964                   && (cuWordLength == 0
 965                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
 966                 // Look for a plausible word boundary
 967                 int32_t remaining = rangeEnd - (current+cuWordLength);
 968                 UChar32 pc;
 969                 UChar32 uc;
 970                 int32_t chars = 0;
 971                 for (;;) {
 972                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
 973                     pc = utext_next32(text);
 974                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
 975                     chars += pcSize;
 976                     remaining -= pcSize;
 977                     if (remaining <= 0) {
 978                         break;
 979                     }
 980                     uc = utext_current32(text);
 981                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
 982                         // Maybe. See if it's in the dictionary.
 983                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
 984                         utext_setNativeIndex(text, current+cuWordLength+chars);
 985                         if (candidates > 0) {
 986                             break;
 987                         }
 988                     }
 989                 }
 990
 991                 // Bump the word count if there wasn't already one
 992                 if (cuWordLength <= 0) {
 993                     wordsFound += 1;
 994                 }
 995
 996                 // Update the length with the passed-over characters
 997                 cuWordLength += chars;
 998             }
 999             else {
1000                 // Back up to where we were for next iteration
1001                 utext_setNativeIndex(text, current+cuWordLength);
1002             }
1003         }
1004
1005         // Never stop before a combining mark.
1006         int32_t currPos;
1007         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1008             utext_next32(text);
1009             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
1010         }
1011
1012         // Look ahead for possible suffixes if a dictionary word does not follow.
1013         // We do this in code rather than using a rule so that the heuristic
1014         // resynch continues to function. For example, one of the suffix characters
1015         // could be a typo in the middle of a word.
1016 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1017 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1018 //                && fSuffixSet.contains(uc = utext_current32(text))) {
1019 //                if (uc == KHMER_PAIYANNOI) {
1020 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
1021 //                        // Skip over previous end and PAIYANNOI
1022 //                        utext_next32(text);
1023 //                        utext_next32(text);
1024 //                        wordLength += 1;            // Add PAIYANNOI to word
1025 //                        uc = utext_current32(text);     // Fetch next character
1026 //                    }
1027 //                    else {
1028 //                        // Restore prior position
1029 //                        utext_next32(text);
1030 //                    }
1031 //                }
1032 //                if (uc == KHMER_MAIYAMOK) {
1033 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1034 //                        // Skip over previous end and MAIYAMOK
1035 //                        utext_next32(text);
1036 //                        utext_next32(text);
1037 //                        wordLength += 1;            // Add MAIYAMOK to word
1038 //                    }
1039 //                    else {
1040 //                        // Restore prior position
1041 //                        utext_next32(text);
1042 //                    }
1043 //                }
1044 //            }
1045 //            else {
1046 //                utext_setNativeIndex(text, current+wordLength);
1047 //            }
1048 //        }
1049
1050         // Did we find a word on this iteration? If so, push it on the break stack
1051         if (cuWordLength > 0) {
1052             foundBreaks.push((current+cuWordLength), status);
1053         }
1054     }
1055
1056     // Don't return a break for the end of the dictionary range if there is one there.
1057     if (foundBreaks.peeki() >= rangeEnd) {
1058         (void) foundBreaks.popi();
1059         wordsFound -= 1;
1060     }
1061
1062     return wordsFound;
1063 }
1064
1065 #if !UCONFIG_NO_NORMALIZATION
1066 /*
1067  ******************************************************************
1068  * CjkBreakEngine
1069  */
1070 static const uint32_t kuint32max = 0xFFFFFFFF;
1071 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1072 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1073     // Korean dictionary only includes Hangul syllables
1074     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1075     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1076     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1077     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1078     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1079
1080     if (U_SUCCESS(status)) {
1081         // handle Korean and Japanese/Chinese using different dictionaries
1082         if (type == kKorean) {
1083             setCharacters(fHangulWordSet);
1084         } else { //Chinese and Japanese
1085             UnicodeSet cjSet;
1086             cjSet.addAll(fHanWordSet);
1087             cjSet.addAll(fKatakanaWordSet);
1088             cjSet.addAll(fHiraganaWordSet);
1089             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1090             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1091             setCharacters(cjSet);
1092         }
1093     }
1094 }
1095
1096 CjkBreakEngine::~CjkBreakEngine(){
1097     delete fDictionary;
1098 }
1099
1100 // The katakanaCost values below are based on the length frequencies of all
1101 // katakana phrases in the dictionary
1102 static const int32_t kMaxKatakanaLength = 8;
1103 static const int32_t kMaxKatakanaGroupLength = 20;
1104 static const uint32_t maxSnlp = 255;
1105
1106 static inline uint32_t getKatakanaCost(int32_t wordLength){
1107     //TODO: fill array with actual values from dictionary!
1108     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1109                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1110     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1111 }
1112
1113 static inline bool isKatakana(uint16_t value) {
1114     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1115             (value >= 0xFF66u && value <= 0xFF9fu);
1116 }
1117
1118
1119 // Function for accessing internal utext flags.
1120 //   Replicates an internal UText function.
1121
1122 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1123     return (int32_t)1 << bitIndex;
1124 }
1125
1126
1127 /*
1128  * @param text A UText representing the text
1129  * @param rangeStart The start of the range of dictionary characters
1130  * @param rangeEnd The end of the range of dictionary characters
1131  * @param foundBreaks Output of C array of int32_t break positions, or 0
1132  * @return The number of breaks found
1133  */
1134 int32_t
1135 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1136         int32_t rangeStart,
1137         int32_t rangeEnd,
1138         UStack &foundBreaks ) const {
1139     if (rangeStart >= rangeEnd) {
1140         return 0;
1141     }
1142
1143     // UnicodeString version of input UText, NFKC normalized if necessary.
1144     UnicodeString inString;
1145
1146     // inputMap[inStringIndex] = corresponding native index from UText inText.
1147     // If NULL then mapping is 1:1
1148     LocalPointer<UVector32>     inputMap;
1149
1150     UErrorCode     status      = U_ZERO_ERROR;
1151
1152
1153     // if UText has the input string as one contiguous UTF-16 chunk
1154     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1155          inText->chunkNativeStart <= rangeStart &&
1156          inText->chunkNativeLimit >= rangeEnd   &&
1157          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1158
1159         // Input UText is in one contiguous UTF-16 chunk.
1160         // Use Read-only aliasing UnicodeString.
1161         inString.setTo(FALSE,
1162                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1163                        rangeEnd - rangeStart);
1164     } else {
1165         // Copy the text from the original inText (UText) to inString (UnicodeString).
1166         // Create a map from UnicodeString indices -> UText offsets.
1167         utext_setNativeIndex(inText, rangeStart);
1168         int32_t limit = rangeEnd;
1169         U_ASSERT(limit <= utext_nativeLength(inText));
1170         if (limit > utext_nativeLength(inText)) {
1171             limit = (int32_t)utext_nativeLength(inText);
1172         }
1173         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1174         if (U_FAILURE(status)) {
1175             return 0;
1176         }
1177         while (utext_getNativeIndex(inText) < limit) {
1178             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1179             UChar32 c = utext_next32(inText);
1180             U_ASSERT(c != U_SENTINEL);
1181             inString.append(c);
1182             while (inputMap->size() < inString.length()) {
1183                 inputMap->addElement(nativePosition, status);
1184             }
1185         }
1186         inputMap->addElement(limit, status);
1187     }
1188
1189
1190     if (!nfkcNorm2->isNormalized(inString, status)) {
1191         UnicodeString normalizedInput;
1192         //  normalizedMap[normalizedInput position] ==  original UText position.
1193         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1194         if (U_FAILURE(status)) {
1195             return 0;
1196         }
1197
1198         UnicodeString fragment;
1199         UnicodeString normalizedFragment;
1200         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1201             fragment.remove();
1202             int32_t fragmentStartI = srcI;
1203             UChar32 c = inString.char32At(srcI);
1204             for (;;) {
1205                 fragment.append(c);
1206                 srcI = inString.moveIndex32(srcI, 1);
1207                 if (srcI == inString.length()) {
1208                     break;
1209                 }
1210                 c = inString.char32At(srcI);
1211                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1212                     break;
1213                 }
1214             }
1215             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1216             normalizedInput.append(normalizedFragment);
1217
1218             // Map every position in the normalized chunk to the start of the chunk
1219             //   in the original input.
1220             int32_t fragmentOriginalStart = inputMap.isValid() ?
1221                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1222             while (normalizedMap->size() < normalizedInput.length()) {
1223                 normalizedMap->addElement(fragmentOriginalStart, status);
1224                 if (U_FAILURE(status)) {
1225                     break;
1226                 }
1227             }
1228         }
1229         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1230         int32_t nativeEnd = inputMap.isValid() ?
1231                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1232         normalizedMap->addElement(nativeEnd, status);
1233
1234         inputMap.moveFrom(normalizedMap);
1235         inString.moveFrom(normalizedInput);
1236     }
1237
1238     int32_t numCodePts = inString.countChar32();
1239     if (numCodePts != inString.length()) {
1240         // There are supplementary characters in the input.
1241         // The dictionary will produce boundary positions in terms of code point indexes,
1242         //   not in terms of code unit string indexes.
1243         // Use the inputMap mechanism to take care of this in addition to indexing differences
1244         //    from normalization and/or UTF-8 input.
1245         UBool hadExistingMap = inputMap.isValid();
1246         if (!hadExistingMap) {
1247             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1248             if (U_FAILURE(status)) {
1249                 return 0;
1250             }
1251         }
1252         int32_t cpIdx = 0;
1253         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1254             U_ASSERT(cuIdx >= cpIdx);
1255             if (hadExistingMap) {
1256                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1257             } else {
1258                 inputMap->addElement(cuIdx+rangeStart, status);
1259             }
1260             cpIdx++;
1261             if (cuIdx == inString.length()) {
1262                break;
1263             }
1264         }
1265     }
1266
1267     // bestSnlp[i] is the snlp of the best segmentation of the first i
1268     // code points in the range to be matched.
1269     UVector32 bestSnlp(numCodePts + 1, status);
1270     bestSnlp.addElement(0, status);
1271     for(int32_t i = 1; i <= numCodePts; i++) {
1272         bestSnlp.addElement(kuint32max, status);
1273     }
1274
1275
1276     // prev[i] is the index of the last CJK code point in the previous word in
1277     // the best segmentation of the first i characters.
1278     UVector32 prev(numCodePts + 1, status);
1279     for(int32_t i = 0; i <= numCodePts; i++){
1280         prev.addElement(-1, status);
1281     }
1282
1283     const int32_t maxWordSize = 20;
1284     UVector32 values(numCodePts, status);
1285     values.setSize(numCodePts);
1286     UVector32 lengths(numCodePts, status);
1287     lengths.setSize(numCodePts);
1288
1289     UText fu = UTEXT_INITIALIZER;
1290     utext_openUnicodeString(&fu, &inString, &status);
1291
1292     // Dynamic programming to find the best segmentation.
1293
1294     // In outer loop, i  is the code point index,
1295     //                ix is the corresponding string (code unit) index.
1296     //    They differ when the string contains supplementary characters.
1297     int32_t ix = 0;
1298     bool is_prev_katakana = false;
1299     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1300         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1301             continue;
1302         }
1303
1304         int32_t count;
1305         utext_setNativeIndex(&fu, ix);
1306         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1307                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1308                              // Note: lengths is filled with code point lengths
1309                              //       The NULL parameter is the ignored code unit lengths.
1310
1311         // if there are no single character matches found in the dictionary
1312         // starting with this character, treat character as a 1-character word
1313         // with the highest value possible, i.e. the least likely to occur.
1314         // Exclude Korean characters from this treatment, as they should be left
1315         // together by default.
1316         if ((count == 0 || lengths.elementAti(0) != 1) &&
1317                 !fHangulWordSet.contains(inString.char32At(ix))) {
1318             values.setElementAt(maxSnlp, count);   // 255
1319             lengths.setElementAt(1, count++);
1320         }
1321
1322         for (int32_t j = 0; j < count; j++) {
1323             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1324             int32_t ln_j_i = lengths.elementAti(j) + i;
1325             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1326                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1327                 prev.setElementAt(i, ln_j_i);
1328             }
1329         }
1330
1331         // In Japanese,
1332         // Katakana word in single character is pretty rare. So we apply
1333         // the following heuristic to Katakana: any continuous run of Katakana
1334         // characters is considered a candidate word with a default cost
1335         // specified in the katakanaCost table according to its length.
1336
1337         bool is_katakana = isKatakana(inString.char32At(ix));
1338         int32_t katakanaRunLength = 1;
1339         if (!is_prev_katakana && is_katakana) {
1340             int32_t j = inString.moveIndex32(ix, 1);
1341             // Find the end of the continuous run of Katakana characters
1342             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1343                     isKatakana(inString.char32At(j))) {
1344                 j = inString.moveIndex32(j, 1);
1345                 katakanaRunLength++;
1346             }
1347             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1348                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1349                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1350                     bestSnlp.setElementAt(newSnlp, j);
1351                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1352                 }
1353             }
1354         }
1355         is_prev_katakana = is_katakana;
1356     }
1357     utext_close(&fu);
1358
1359     // Start pushing the optimal offset index into t_boundary (t for tentative).
1360     // prev[numCodePts] is guaranteed to be meaningful.
1361     // We'll first push in the reverse order, i.e.,
1362     // t_boundary[0] = numCodePts, and afterwards do a swap.
1363     UVector32 t_boundary(numCodePts+1, status);
1364
1365     int32_t numBreaks = 0;
1366     // No segmentation found, set boundary to end of range
1367     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1368         t_boundary.addElement(numCodePts, status);
1369         numBreaks++;
1370     } else {
1371         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1372             t_boundary.addElement(i, status);
1373             numBreaks++;
1374         }
1375         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1376     }
1377
1378     // Add a break for the start of the dictionary range if there is not one
1379     // there already.
1380     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1381         t_boundary.addElement(0, status);
1382         numBreaks++;
1383     }
1384
1385     // Now that we're done, convert positions in t_boundary[] (indices in
1386     // the normalized input string) back to indices in the original input UText
1387     // while reversing t_boundary and pushing values to foundBreaks.
1388     for (int32_t i = numBreaks-1; i >= 0; i--) {
1389         int32_t cpPos = t_boundary.elementAti(i);
1390         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1391         // Boundaries are added to foundBreaks output in ascending order.
1392         U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
1393         foundBreaks.push(utextPos, status);
1394     }
1395
1396     // inString goes out of scope
1397     // inputMap goes out of scope
1398     return numBreaks;
1399 }
1400 #endif
1401
1402 U_NAMESPACE_END
1403
1404 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1405