Imported Upstream version 58.1
[platform/upstream/icu.git] / source / common / dictbe.cpp
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2016, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_BREAK_ITERATION
13
14 #include "brkeng.h"
15 #include "dictbe.h"
16 #include "unicode/uniset.h"
17 #include "unicode/chariter.h"
18 #include "unicode/ubrk.h"
19 #include "uvectr32.h"
20 #include "uvector.h"
21 #include "uassert.h"
22 #include "unicode/normlzr.h"
23 #include "cmemory.h"
24 #include "dictionarydata.h"
25
26 U_NAMESPACE_BEGIN
27
28 /*
29  ******************************************************************
30  */
31
32 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
33     fTypes = breakTypes;
34 }
35
36 DictionaryBreakEngine::~DictionaryBreakEngine() {
37 }
38
39 UBool
40 DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
41     return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
42             && fSet.contains(c));
43 }
44
45 int32_t
46 DictionaryBreakEngine::findBreaks( UText *text,
47                                  int32_t startPos,
48                                  int32_t endPos,
49                                  UBool reverse,
50                                  int32_t breakType,
51                                  UStack &foundBreaks ) const {
52     int32_t result = 0;
53
54     // Find the span of characters included in the set.
55     //   The span to break begins at the current position in the text, and
56     //   extends towards the start or end of the text, depending on 'reverse'.
57
58     int32_t start = (int32_t)utext_getNativeIndex(text);
59     int32_t current;
60     int32_t rangeStart;
61     int32_t rangeEnd;
62     UChar32 c = utext_current32(text);
63     if (reverse) {
64         UBool   isDict = fSet.contains(c);
65         while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
66             c = utext_previous32(text);
67             isDict = fSet.contains(c);
68         }
69         if (current < startPos) {
70             rangeStart = startPos;
71         } else {
72             rangeStart = current;
73             if (!isDict) {
74                 utext_next32(text);
75                 rangeStart = (int32_t)utext_getNativeIndex(text);
76             }
77         }
78         // rangeEnd = start + 1;
79         utext_setNativeIndex(text, start);
80         utext_next32(text);
81         rangeEnd = (int32_t)utext_getNativeIndex(text);
82     }
83     else {
84         while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
85             utext_next32(text);         // TODO:  recast loop for postincrement
86             c = utext_current32(text);
87         }
88         rangeStart = start;
89         rangeEnd = current;
90     }
91     if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
92         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
93         utext_setNativeIndex(text, current);
94     }
95     
96     return result;
97 }
98
99 void
100 DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
101     fSet = set;
102     // Compact for caching
103     fSet.compact();
104 }
105
106 /*
107  ******************************************************************
108  * PossibleWord
109  */
110
111 // Helper class for improving readability of the Thai/Lao/Khmer word break
112 // algorithm. The implementation is completely inline.
113
114 // List size, limited by the maximum number of words in the dictionary
115 // that form a nested sequence.
116 static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
117
118 class PossibleWord {
119 private:
120     // list of word candidate lengths, in increasing length order
121     // TODO: bytes would be sufficient for word lengths.
122     int32_t   count;      // Count of candidates
123     int32_t   prefix;     // The longest match with a dictionary word
124     int32_t   offset;     // Offset in the text of these candidates
125     int32_t   mark;       // The preferred candidate's offset
126     int32_t   current;    // The candidate we're currently looking at
127     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
128     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
129
130 public:
131     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
132     ~PossibleWord() {};
133   
134     // Fill the list of candidates if needed, select the longest, and return the number found
135     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
136   
137     // Select the currently marked candidate, point after it in the text, and invalidate self
138     int32_t   acceptMarked( UText *text );
139   
140     // Back up from the current candidate to the next shorter one; return TRUE if that exists
141     // and point the text after it
142     UBool     backUp( UText *text );
143   
144     // Return the longest prefix this candidate location shares with a dictionary word
145     // Return value is in code points.
146     int32_t   longestPrefix() { return prefix; };
147   
148     // Mark the current candidate as the one we like
149     void      markCurrent() { mark = current; };
150     
151     // Get length in code points of the marked word.
152     int32_t   markedCPLength() { return cpLengths[mark]; };
153 };
154
155
156 int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
157     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
158     int32_t start = (int32_t)utext_getNativeIndex(text);
159     if (start != offset) {
160         offset = start;
161         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
162         // Dictionary leaves text after longest prefix, not longest word. Back up.
163         if (count <= 0) {
164             utext_setNativeIndex(text, start);
165         }
166     }
167     if (count > 0) {
168         utext_setNativeIndex(text, start+cuLengths[count-1]);
169     }
170     current = count-1;
171     mark = current;
172     return count;
173 }
174
175 int32_t
176 PossibleWord::acceptMarked( UText *text ) {
177     utext_setNativeIndex(text, offset + cuLengths[mark]);
178     return cuLengths[mark];
179 }
180
181
182 UBool
183 PossibleWord::backUp( UText *text ) {
184     if (current > 0) {
185         utext_setNativeIndex(text, offset + cuLengths[--current]);
186         return TRUE;
187     }
188     return FALSE;
189 }
190
191 /*
192  ******************************************************************
193  * ThaiBreakEngine
194  */
195
196 // How many words in a row are "good enough"?
197 static const int32_t THAI_LOOKAHEAD = 3;
198
199 // Will not combine a non-word with a preceding dictionary word longer than this
200 static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
201
202 // Will not combine a non-word that shares at least this much prefix with a
203 // dictionary word, with a preceding word
204 static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
205
206 // Ellision character
207 static const int32_t THAI_PAIYANNOI = 0x0E2F;
208
209 // Repeat character
210 static const int32_t THAI_MAIYAMOK = 0x0E46;
211
212 // Minimum word size
213 static const int32_t THAI_MIN_WORD = 2;
214
215 // Minimum number of characters for two words
216 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
217
218 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
219     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
220       fDictionary(adoptDictionary)
221 {
222     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
223     if (U_SUCCESS(status)) {
224         setCharacters(fThaiWordSet);
225     }
226     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
227     fMarkSet.add(0x0020);
228     fEndWordSet = fThaiWordSet;
229     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
230     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
231     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
232     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
233     fSuffixSet.add(THAI_PAIYANNOI);
234     fSuffixSet.add(THAI_MAIYAMOK);
235
236     // Compact for caching.
237     fMarkSet.compact();
238     fEndWordSet.compact();
239     fBeginWordSet.compact();
240     fSuffixSet.compact();
241 }
242
243 ThaiBreakEngine::~ThaiBreakEngine() {
244     delete fDictionary;
245 }
246
247 int32_t
248 ThaiBreakEngine::divideUpDictionaryRange( UText *text,
249                                                 int32_t rangeStart,
250                                                 int32_t rangeEnd,
251                                                 UStack &foundBreaks ) const {
252     utext_setNativeIndex(text, rangeStart);
253     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
254     if (utext_getNativeIndex(text) >= rangeEnd) {
255         return 0;       // Not enough characters for two words
256     }
257     utext_setNativeIndex(text, rangeStart);
258
259
260     uint32_t wordsFound = 0;
261     int32_t cpWordLength = 0;    // Word Length in Code Points.
262     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
263     int32_t current;
264     UErrorCode status = U_ZERO_ERROR;
265     PossibleWord words[THAI_LOOKAHEAD];
266     
267     utext_setNativeIndex(text, rangeStart);
268     
269     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
270         cpWordLength = 0;
271         cuWordLength = 0;
272
273         // Look for candidate words at the current position
274         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
275         
276         // If we found exactly one, use that
277         if (candidates == 1) {
278             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
279             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
280             wordsFound += 1;
281         }
282         // If there was more than one, see which one can take us forward the most words
283         else if (candidates > 1) {
284             // If we're already at the end of the range, we're done
285             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
286                 goto foundBest;
287             }
288             do {
289                 int32_t wordsMatched = 1;
290                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
291                     if (wordsMatched < 2) {
292                         // Followed by another dictionary word; mark first word as a good candidate
293                         words[wordsFound%THAI_LOOKAHEAD].markCurrent();
294                         wordsMatched = 2;
295                     }
296                     
297                     // If we're already at the end of the range, we're done
298                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
299                         goto foundBest;
300                     }
301                     
302                     // See if any of the possible second words is followed by a third word
303                     do {
304                         // If we find a third word, stop right away
305                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
306                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();
307                             goto foundBest;
308                         }
309                     }
310                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
311                 }
312             }
313             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
314 foundBest:
315             // Set UText position to after the accepted word.
316             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
317             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
318             wordsFound += 1;
319         }
320         
321         // We come here after having either found a word or not. We look ahead to the
322         // next word. If it's not a dictionary word, we will combine it with the word we
323         // just found (if there is one), but only if the preceding word does not exceed
324         // the threshold.
325         // The text iterator should now be positioned at the end of the word we found.
326         
327         UChar32 uc = 0;
328         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
329             // if it is a dictionary word, do nothing. If it isn't, then if there is
330             // no preceding word, or the non-word shares less than the minimum threshold
331             // of characters with a dictionary word, then scan to resynchronize
332             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
333                   && (cuWordLength == 0
334                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
335                 // Look for a plausible word boundary
336                 int32_t remaining = rangeEnd - (current+cuWordLength);
337                 UChar32 pc;
338                 int32_t chars = 0;
339                 for (;;) {
340                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
341                     pc = utext_next32(text);
342                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
343                     chars += pcSize;
344                     remaining -= pcSize;
345                     if (remaining <= 0) {
346                         break;
347                     }
348                     uc = utext_current32(text);
349                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
350                         // Maybe. See if it's in the dictionary.
351                         // NOTE: In the original Apple code, checked that the next
352                         // two characters after uc were not 0x0E4C THANTHAKHAT before
353                         // checking the dictionary. That is just a performance filter,
354                         // but it's not clear it's faster than checking the trie.
355                         int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
356                         utext_setNativeIndex(text, current + cuWordLength + chars);
357                         if (candidates > 0) {
358                             break;
359                         }
360                     }
361                 }
362                 
363                 // Bump the word count if there wasn't already one
364                 if (cuWordLength <= 0) {
365                     wordsFound += 1;
366                 }
367                 
368                 // Update the length with the passed-over characters
369                 cuWordLength += chars;
370             }
371             else {
372                 // Back up to where we were for next iteration
373                 utext_setNativeIndex(text, current+cuWordLength);
374             }
375         }
376         
377         // Never stop before a combining mark.
378         int32_t currPos;
379         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
380             utext_next32(text);
381             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
382         }
383         
384         // Look ahead for possible suffixes if a dictionary word does not follow.
385         // We do this in code rather than using a rule so that the heuristic
386         // resynch continues to function. For example, one of the suffix characters
387         // could be a typo in the middle of a word.
388         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
389             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
390                 && fSuffixSet.contains(uc = utext_current32(text))) {
391                 if (uc == THAI_PAIYANNOI) {
392                     if (!fSuffixSet.contains(utext_previous32(text))) {
393                         // Skip over previous end and PAIYANNOI
394                         utext_next32(text);
395                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
396                         utext_next32(text);
397                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
398                         uc = utext_current32(text);     // Fetch next character
399                     }
400                     else {
401                         // Restore prior position
402                         utext_next32(text);
403                     }
404                 }
405                 if (uc == THAI_MAIYAMOK) {
406                     if (utext_previous32(text) != THAI_MAIYAMOK) {
407                         // Skip over previous end and MAIYAMOK
408                         utext_next32(text);
409                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
410                         utext_next32(text);
411                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
412                     }
413                     else {
414                         // Restore prior position
415                         utext_next32(text);
416                     }
417                 }
418             }
419             else {
420                 utext_setNativeIndex(text, current+cuWordLength);
421             }
422         }
423
424         // Did we find a word on this iteration? If so, push it on the break stack
425         if (cuWordLength > 0) {
426             foundBreaks.push((current+cuWordLength), status);
427         }
428     }
429
430     // Don't return a break for the end of the dictionary range if there is one there.
431     if (foundBreaks.peeki() >= rangeEnd) {
432         (void) foundBreaks.popi();
433         wordsFound -= 1;
434     }
435
436     return wordsFound;
437 }
438
439 /*
440  ******************************************************************
441  * LaoBreakEngine
442  */
443
444 // How many words in a row are "good enough"?
445 static const int32_t LAO_LOOKAHEAD = 3;
446
447 // Will not combine a non-word with a preceding dictionary word longer than this
448 static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
449
450 // Will not combine a non-word that shares at least this much prefix with a
451 // dictionary word, with a preceding word
452 static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
453
454 // Minimum word size
455 static const int32_t LAO_MIN_WORD = 2;
456
457 // Minimum number of characters for two words
458 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
459
460 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
461     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
462       fDictionary(adoptDictionary)
463 {
464     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
465     if (U_SUCCESS(status)) {
466         setCharacters(fLaoWordSet);
467     }
468     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
469     fMarkSet.add(0x0020);
470     fEndWordSet = fLaoWordSet;
471     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
472     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
473     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
474     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
475
476     // Compact for caching.
477     fMarkSet.compact();
478     fEndWordSet.compact();
479     fBeginWordSet.compact();
480 }
481
482 LaoBreakEngine::~LaoBreakEngine() {
483     delete fDictionary;
484 }
485
486 int32_t
487 LaoBreakEngine::divideUpDictionaryRange( UText *text,
488                                                 int32_t rangeStart,
489                                                 int32_t rangeEnd,
490                                                 UStack &foundBreaks ) const {
491     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
492         return 0;       // Not enough characters for two words
493     }
494
495     uint32_t wordsFound = 0;
496     int32_t cpWordLength = 0;
497     int32_t cuWordLength = 0;
498     int32_t current;
499     UErrorCode status = U_ZERO_ERROR;
500     PossibleWord words[LAO_LOOKAHEAD];
501     
502     utext_setNativeIndex(text, rangeStart);
503     
504     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
505         cuWordLength = 0;
506         cpWordLength = 0;
507
508         // Look for candidate words at the current position
509         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
510         
511         // If we found exactly one, use that
512         if (candidates == 1) {
513             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
514             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
515             wordsFound += 1;
516         }
517         // If there was more than one, see which one can take us forward the most words
518         else if (candidates > 1) {
519             // If we're already at the end of the range, we're done
520             if (utext_getNativeIndex(text) >= rangeEnd) {
521                 goto foundBest;
522             }
523             do {
524                 int32_t wordsMatched = 1;
525                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
526                     if (wordsMatched < 2) {
527                         // Followed by another dictionary word; mark first word as a good candidate
528                         words[wordsFound%LAO_LOOKAHEAD].markCurrent();
529                         wordsMatched = 2;
530                     }
531                     
532                     // If we're already at the end of the range, we're done
533                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
534                         goto foundBest;
535                     }
536                     
537                     // See if any of the possible second words is followed by a third word
538                     do {
539                         // If we find a third word, stop right away
540                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
541                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();
542                             goto foundBest;
543                         }
544                     }
545                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
546                 }
547             }
548             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
549 foundBest:
550             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
551             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
552             wordsFound += 1;
553         }
554         
555         // We come here after having either found a word or not. We look ahead to the
556         // next word. If it's not a dictionary word, we will combine it withe the word we
557         // just found (if there is one), but only if the preceding word does not exceed
558         // the threshold.
559         // The text iterator should now be positioned at the end of the word we found.
560         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
561             // if it is a dictionary word, do nothing. If it isn't, then if there is
562             // no preceding word, or the non-word shares less than the minimum threshold
563             // of characters with a dictionary word, then scan to resynchronize
564             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
565                   && (cuWordLength == 0
566                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
567                 // Look for a plausible word boundary
568                 int32_t remaining = rangeEnd - (current + cuWordLength);
569                 UChar32 pc;
570                 UChar32 uc;
571                 int32_t chars = 0;
572                 for (;;) {
573                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
574                     pc = utext_next32(text);
575                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
576                     chars += pcSize;
577                     remaining -= pcSize;
578                     if (remaining <= 0) {
579                         break;
580                     }
581                     uc = utext_current32(text);
582                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
583                         // Maybe. See if it's in the dictionary.
584                         // TODO: this looks iffy; compare with old code.
585                         int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
586                         utext_setNativeIndex(text, current + cuWordLength + chars);
587                         if (candidates > 0) {
588                             break;
589                         }
590                     }
591                 }
592                 
593                 // Bump the word count if there wasn't already one
594                 if (cuWordLength <= 0) {
595                     wordsFound += 1;
596                 }
597                 
598                 // Update the length with the passed-over characters
599                 cuWordLength += chars;
600             }
601             else {
602                 // Back up to where we were for next iteration
603                 utext_setNativeIndex(text, current + cuWordLength);
604             }
605         }
606         
607         // Never stop before a combining mark.
608         int32_t currPos;
609         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
610             utext_next32(text);
611             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
612         }
613         
614         // Look ahead for possible suffixes if a dictionary word does not follow.
615         // We do this in code rather than using a rule so that the heuristic
616         // resynch continues to function. For example, one of the suffix characters
617         // could be a typo in the middle of a word.
618         // NOT CURRENTLY APPLICABLE TO LAO
619
620         // Did we find a word on this iteration? If so, push it on the break stack
621         if (cuWordLength > 0) {
622             foundBreaks.push((current+cuWordLength), status);
623         }
624     }
625
626     // Don't return a break for the end of the dictionary range if there is one there.
627     if (foundBreaks.peeki() >= rangeEnd) {
628         (void) foundBreaks.popi();
629         wordsFound -= 1;
630     }
631
632     return wordsFound;
633 }
634
635 /*
636  ******************************************************************
637  * BurmeseBreakEngine
638  */
639
640 // How many words in a row are "good enough"?
641 static const int32_t BURMESE_LOOKAHEAD = 3;
642
643 // Will not combine a non-word with a preceding dictionary word longer than this
644 static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
645
646 // Will not combine a non-word that shares at least this much prefix with a
647 // dictionary word, with a preceding word
648 static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
649
650 // Minimum word size
651 static const int32_t BURMESE_MIN_WORD = 2;
652
653 // Minimum number of characters for two words
654 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
655
656 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
657     : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
658       fDictionary(adoptDictionary)
659 {
660     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
661     if (U_SUCCESS(status)) {
662         setCharacters(fBurmeseWordSet);
663     }
664     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
665     fMarkSet.add(0x0020);
666     fEndWordSet = fBurmeseWordSet;
667     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
668
669     // Compact for caching.
670     fMarkSet.compact();
671     fEndWordSet.compact();
672     fBeginWordSet.compact();
673 }
674
675 BurmeseBreakEngine::~BurmeseBreakEngine() {
676     delete fDictionary;
677 }
678
679 int32_t
680 BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
681                                                 int32_t rangeStart,
682                                                 int32_t rangeEnd,
683                                                 UStack &foundBreaks ) const {
684     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
685         return 0;       // Not enough characters for two words
686     }
687
688     uint32_t wordsFound = 0;
689     int32_t cpWordLength = 0;
690     int32_t cuWordLength = 0;
691     int32_t current;
692     UErrorCode status = U_ZERO_ERROR;
693     PossibleWord words[BURMESE_LOOKAHEAD];
694     
695     utext_setNativeIndex(text, rangeStart);
696     
697     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
698         cuWordLength = 0;
699         cpWordLength = 0;
700
701         // Look for candidate words at the current position
702         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
703         
704         // If we found exactly one, use that
705         if (candidates == 1) {
706             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
707             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
708             wordsFound += 1;
709         }
710         // If there was more than one, see which one can take us forward the most words
711         else if (candidates > 1) {
712             // If we're already at the end of the range, we're done
713             if (utext_getNativeIndex(text) >= rangeEnd) {
714                 goto foundBest;
715             }
716             do {
717                 int32_t wordsMatched = 1;
718                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
719                     if (wordsMatched < 2) {
720                         // Followed by another dictionary word; mark first word as a good candidate
721                         words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
722                         wordsMatched = 2;
723                     }
724                     
725                     // If we're already at the end of the range, we're done
726                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
727                         goto foundBest;
728                     }
729                     
730                     // See if any of the possible second words is followed by a third word
731                     do {
732                         // If we find a third word, stop right away
733                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
734                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
735                             goto foundBest;
736                         }
737                     }
738                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
739                 }
740             }
741             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
742 foundBest:
743             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
744             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
745             wordsFound += 1;
746         }
747         
748         // We come here after having either found a word or not. We look ahead to the
749         // next word. If it's not a dictionary word, we will combine it withe the word we
750         // just found (if there is one), but only if the preceding word does not exceed
751         // the threshold.
752         // The text iterator should now be positioned at the end of the word we found.
753         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
754             // if it is a dictionary word, do nothing. If it isn't, then if there is
755             // no preceding word, or the non-word shares less than the minimum threshold
756             // of characters with a dictionary word, then scan to resynchronize
757             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
758                   && (cuWordLength == 0
759                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
760                 // Look for a plausible word boundary
761                 int32_t remaining = rangeEnd - (current + cuWordLength);
762                 UChar32 pc;
763                 UChar32 uc;
764                 int32_t chars = 0;
765                 for (;;) {
766                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
767                     pc = utext_next32(text);
768                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
769                     chars += pcSize;
770                     remaining -= pcSize;
771                     if (remaining <= 0) {
772                         break;
773                     }
774                     uc = utext_current32(text);
775                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
776                         // Maybe. See if it's in the dictionary.
777                         // TODO: this looks iffy; compare with old code.
778                         int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
779                         utext_setNativeIndex(text, current + cuWordLength + chars);
780                         if (candidates > 0) {
781                             break;
782                         }
783                     }
784                 }
785                 
786                 // Bump the word count if there wasn't already one
787                 if (cuWordLength <= 0) {
788                     wordsFound += 1;
789                 }
790                 
791                 // Update the length with the passed-over characters
792                 cuWordLength += chars;
793             }
794             else {
795                 // Back up to where we were for next iteration
796                 utext_setNativeIndex(text, current + cuWordLength);
797             }
798         }
799         
800         // Never stop before a combining mark.
801         int32_t currPos;
802         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
803             utext_next32(text);
804             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
805         }
806         
807         // Look ahead for possible suffixes if a dictionary word does not follow.
808         // We do this in code rather than using a rule so that the heuristic
809         // resynch continues to function. For example, one of the suffix characters
810         // could be a typo in the middle of a word.
811         // NOT CURRENTLY APPLICABLE TO BURMESE
812
813         // Did we find a word on this iteration? If so, push it on the break stack
814         if (cuWordLength > 0) {
815             foundBreaks.push((current+cuWordLength), status);
816         }
817     }
818
819     // Don't return a break for the end of the dictionary range if there is one there.
820     if (foundBreaks.peeki() >= rangeEnd) {
821         (void) foundBreaks.popi();
822         wordsFound -= 1;
823     }
824
825     return wordsFound;
826 }
827
828 /*
829  ******************************************************************
830  * KhmerBreakEngine
831  */
832
833 // How many words in a row are "good enough"?
834 static const int32_t KHMER_LOOKAHEAD = 3;
835
836 // Will not combine a non-word with a preceding dictionary word longer than this
837 static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
838
839 // Will not combine a non-word that shares at least this much prefix with a
840 // dictionary word, with a preceding word
841 static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
842
843 // Minimum word size
844 static const int32_t KHMER_MIN_WORD = 2;
845
846 // Minimum number of characters for two words
847 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
848
849 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
850     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
851       fDictionary(adoptDictionary)
852 {
853     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
854     if (U_SUCCESS(status)) {
855         setCharacters(fKhmerWordSet);
856     }
857     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
858     fMarkSet.add(0x0020);
859     fEndWordSet = fKhmerWordSet;
860     fBeginWordSet.add(0x1780, 0x17B3);
861     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
862     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
863     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
864     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
865     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
866 //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
867 //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
868 //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
869 //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
870 //    fSuffixSet.add(THAI_PAIYANNOI);
871 //    fSuffixSet.add(THAI_MAIYAMOK);
872
873     // Compact for caching.
874     fMarkSet.compact();
875     fEndWordSet.compact();
876     fBeginWordSet.compact();
877 //    fSuffixSet.compact();
878 }
879
880 KhmerBreakEngine::~KhmerBreakEngine() {
881     delete fDictionary;
882 }
883
884 int32_t
885 KhmerBreakEngine::divideUpDictionaryRange( UText *text,
886                                                 int32_t rangeStart,
887                                                 int32_t rangeEnd,
888                                                 UStack &foundBreaks ) const {
889     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
890         return 0;       // Not enough characters for two words
891     }
892
893     uint32_t wordsFound = 0;
894     int32_t cpWordLength = 0;
895     int32_t cuWordLength = 0;
896     int32_t current;
897     UErrorCode status = U_ZERO_ERROR;
898     PossibleWord words[KHMER_LOOKAHEAD];
899
900     utext_setNativeIndex(text, rangeStart);
901
902     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
903         cuWordLength = 0;
904         cpWordLength = 0;
905
906         // Look for candidate words at the current position
907         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
908
909         // If we found exactly one, use that
910         if (candidates == 1) {
911             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
912             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
913             wordsFound += 1;
914         }
915
916         // If there was more than one, see which one can take us forward the most words
917         else if (candidates > 1) {
918             // If we're already at the end of the range, we're done
919             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
920                 goto foundBest;
921             }
922             do {
923                 int32_t wordsMatched = 1;
924                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
925                     if (wordsMatched < 2) {
926                         // Followed by another dictionary word; mark first word as a good candidate
927                         words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
928                         wordsMatched = 2;
929                     }
930
931                     // If we're already at the end of the range, we're done
932                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
933                         goto foundBest;
934                     }
935
936                     // See if any of the possible second words is followed by a third word
937                     do {
938                         // If we find a third word, stop right away
939                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
940                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
941                             goto foundBest;
942                         }
943                     }
944                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
945                 }
946             }
947             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
948 foundBest:
949             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
950             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
951             wordsFound += 1;
952         }
953
954         // We come here after having either found a word or not. We look ahead to the
955         // next word. If it's not a dictionary word, we will combine it with the word we
956         // just found (if there is one), but only if the preceding word does not exceed
957         // the threshold.
958         // The text iterator should now be positioned at the end of the word we found.
959         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
960             // if it is a dictionary word, do nothing. If it isn't, then if there is
961             // no preceding word, or the non-word shares less than the minimum threshold
962             // of characters with a dictionary word, then scan to resynchronize
963             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
964                   && (cuWordLength == 0
965                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
966                 // Look for a plausible word boundary
967                 int32_t remaining = rangeEnd - (current+cuWordLength);
968                 UChar32 pc;
969                 UChar32 uc;
970                 int32_t chars = 0;
971                 for (;;) {
972                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
973                     pc = utext_next32(text);
974                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
975                     chars += pcSize;
976                     remaining -= pcSize;
977                     if (remaining <= 0) {
978                         break;
979                     }
980                     uc = utext_current32(text);
981                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
982                         // Maybe. See if it's in the dictionary.
983                         int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
984                         utext_setNativeIndex(text, current+cuWordLength+chars);
985                         if (candidates > 0) {
986                             break;
987                         }
988                     }
989                 }
990
991                 // Bump the word count if there wasn't already one
992                 if (cuWordLength <= 0) {
993                     wordsFound += 1;
994                 }
995
996                 // Update the length with the passed-over characters
997                 cuWordLength += chars;
998             }
999             else {
1000                 // Back up to where we were for next iteration
1001                 utext_setNativeIndex(text, current+cuWordLength);
1002             }
1003         }
1004
1005         // Never stop before a combining mark.
1006         int32_t currPos;
1007         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
1008             utext_next32(text);
1009             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
1010         }
1011
1012         // Look ahead for possible suffixes if a dictionary word does not follow.
1013         // We do this in code rather than using a rule so that the heuristic
1014         // resynch continues to function. For example, one of the suffix characters
1015         // could be a typo in the middle of a word.
1016 //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1017 //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1018 //                && fSuffixSet.contains(uc = utext_current32(text))) {
1019 //                if (uc == KHMER_PAIYANNOI) {
1020 //                    if (!fSuffixSet.contains(utext_previous32(text))) {
1021 //                        // Skip over previous end and PAIYANNOI
1022 //                        utext_next32(text);
1023 //                        utext_next32(text);
1024 //                        wordLength += 1;            // Add PAIYANNOI to word
1025 //                        uc = utext_current32(text);     // Fetch next character
1026 //                    }
1027 //                    else {
1028 //                        // Restore prior position
1029 //                        utext_next32(text);
1030 //                    }
1031 //                }
1032 //                if (uc == KHMER_MAIYAMOK) {
1033 //                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1034 //                        // Skip over previous end and MAIYAMOK
1035 //                        utext_next32(text);
1036 //                        utext_next32(text);
1037 //                        wordLength += 1;            // Add MAIYAMOK to word
1038 //                    }
1039 //                    else {
1040 //                        // Restore prior position
1041 //                        utext_next32(text);
1042 //                    }
1043 //                }
1044 //            }
1045 //            else {
1046 //                utext_setNativeIndex(text, current+wordLength);
1047 //            }
1048 //        }
1049
1050         // Did we find a word on this iteration? If so, push it on the break stack
1051         if (cuWordLength > 0) {
1052             foundBreaks.push((current+cuWordLength), status);
1053         }
1054     }
1055     
1056     // Don't return a break for the end of the dictionary range if there is one there.
1057     if (foundBreaks.peeki() >= rangeEnd) {
1058         (void) foundBreaks.popi();
1059         wordsFound -= 1;
1060     }
1061
1062     return wordsFound;
1063 }
1064
1065 #if !UCONFIG_NO_NORMALIZATION
1066 /*
1067  ******************************************************************
1068  * CjkBreakEngine
1069  */
1070 static const uint32_t kuint32max = 0xFFFFFFFF;
1071 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1072 : DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
1073     // Korean dictionary only includes Hangul syllables
1074     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1075     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1076     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1077     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1078     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1079
1080     if (U_SUCCESS(status)) {
1081         // handle Korean and Japanese/Chinese using different dictionaries
1082         if (type == kKorean) {
1083             setCharacters(fHangulWordSet);
1084         } else { //Chinese and Japanese
1085             UnicodeSet cjSet;
1086             cjSet.addAll(fHanWordSet);
1087             cjSet.addAll(fKatakanaWordSet);
1088             cjSet.addAll(fHiraganaWordSet);
1089             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1090             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1091             setCharacters(cjSet);
1092         }
1093     }
1094 }
1095
1096 CjkBreakEngine::~CjkBreakEngine(){
1097     delete fDictionary;
1098 }
1099
1100 // The katakanaCost values below are based on the length frequencies of all
1101 // katakana phrases in the dictionary
1102 static const int32_t kMaxKatakanaLength = 8;
1103 static const int32_t kMaxKatakanaGroupLength = 20;
1104 static const uint32_t maxSnlp = 255;
1105
1106 static inline uint32_t getKatakanaCost(int32_t wordLength){
1107     //TODO: fill array with actual values from dictionary!
1108     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1109                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1110     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1111 }
1112
1113 static inline bool isKatakana(uint16_t value) {
1114     return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
1115             (value >= 0xFF66u && value <= 0xFF9fu);
1116 }
1117
1118
1119 // Function for accessing internal utext flags.
1120 //   Replicates an internal UText function.
1121
1122 static inline int32_t utext_i32_flag(int32_t bitIndex) {
1123     return (int32_t)1 << bitIndex;
1124 }
1125
1126        
1127 /*
1128  * @param text A UText representing the text
1129  * @param rangeStart The start of the range of dictionary characters
1130  * @param rangeEnd The end of the range of dictionary characters
1131  * @param foundBreaks Output of C array of int32_t break positions, or 0
1132  * @return The number of breaks found
1133  */
1134 int32_t 
1135 CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1136         int32_t rangeStart,
1137         int32_t rangeEnd,
1138         UStack &foundBreaks ) const {
1139     if (rangeStart >= rangeEnd) {
1140         return 0;
1141     }
1142
1143     // UnicodeString version of input UText, NFKC normalized if necessary.
1144     UnicodeString inString;
1145
1146     // inputMap[inStringIndex] = corresponding native index from UText inText.
1147     // If NULL then mapping is 1:1
1148     LocalPointer<UVector32>     inputMap;
1149
1150     UErrorCode     status      = U_ZERO_ERROR;
1151
1152
1153     // if UText has the input string as one contiguous UTF-16 chunk
1154     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1155          inText->chunkNativeStart <= rangeStart &&
1156          inText->chunkNativeLimit >= rangeEnd   &&
1157          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1158
1159         // Input UText is in one contiguous UTF-16 chunk.
1160         // Use Read-only aliasing UnicodeString.
1161         inString.setTo(FALSE,
1162                        inText->chunkContents + rangeStart - inText->chunkNativeStart,
1163                        rangeEnd - rangeStart);
1164     } else {
1165         // Copy the text from the original inText (UText) to inString (UnicodeString).
1166         // Create a map from UnicodeString indices -> UText offsets.
1167         utext_setNativeIndex(inText, rangeStart);
1168         int32_t limit = rangeEnd;
1169         U_ASSERT(limit <= utext_nativeLength(inText));
1170         if (limit > utext_nativeLength(inText)) {
1171             limit = (int32_t)utext_nativeLength(inText);
1172         }
1173         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1174         if (U_FAILURE(status)) {
1175             return 0;
1176         }
1177         while (utext_getNativeIndex(inText) < limit) {
1178             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1179             UChar32 c = utext_next32(inText);
1180             U_ASSERT(c != U_SENTINEL);
1181             inString.append(c);
1182             while (inputMap->size() < inString.length()) {
1183                 inputMap->addElement(nativePosition, status);
1184             }
1185         }
1186         inputMap->addElement(limit, status);
1187     }
1188
1189
1190     if (!nfkcNorm2->isNormalized(inString, status)) {
1191         UnicodeString normalizedInput;
1192         //  normalizedMap[normalizedInput position] ==  original UText position.
1193         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1194         if (U_FAILURE(status)) {
1195             return 0;
1196         }
1197         
1198         UnicodeString fragment;
1199         UnicodeString normalizedFragment;
1200         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1201             fragment.remove();
1202             int32_t fragmentStartI = srcI;
1203             UChar32 c = inString.char32At(srcI);
1204             for (;;) {
1205                 fragment.append(c);
1206                 srcI = inString.moveIndex32(srcI, 1);
1207                 if (srcI == inString.length()) {
1208                     break;
1209                 }
1210                 c = inString.char32At(srcI);
1211                 if (nfkcNorm2->hasBoundaryBefore(c)) {
1212                     break;
1213                 }
1214             }
1215             nfkcNorm2->normalize(fragment, normalizedFragment, status);
1216             normalizedInput.append(normalizedFragment);
1217
1218             // Map every position in the normalized chunk to the start of the chunk
1219             //   in the original input.
1220             int32_t fragmentOriginalStart = inputMap.isValid() ?
1221                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1222             while (normalizedMap->size() < normalizedInput.length()) {
1223                 normalizedMap->addElement(fragmentOriginalStart, status);
1224                 if (U_FAILURE(status)) {
1225                     break;
1226                 }
1227             }
1228         }
1229         U_ASSERT(normalizedMap->size() == normalizedInput.length());
1230         int32_t nativeEnd = inputMap.isValid() ?
1231                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1232         normalizedMap->addElement(nativeEnd, status);
1233
1234         inputMap.moveFrom(normalizedMap);
1235         inString.moveFrom(normalizedInput);
1236     }
1237
1238     int32_t numCodePts = inString.countChar32();
1239     if (numCodePts != inString.length()) {
1240         // There are supplementary characters in the input.
1241         // The dictionary will produce boundary positions in terms of code point indexes,
1242         //   not in terms of code unit string indexes.
1243         // Use the inputMap mechanism to take care of this in addition to indexing differences
1244         //    from normalization and/or UTF-8 input.
1245         UBool hadExistingMap = inputMap.isValid();
1246         if (!hadExistingMap) {
1247             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1248             if (U_FAILURE(status)) {
1249                 return 0;
1250             }
1251         }
1252         int32_t cpIdx = 0;
1253         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1254             U_ASSERT(cuIdx >= cpIdx);
1255             if (hadExistingMap) {
1256                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1257             } else {
1258                 inputMap->addElement(cuIdx+rangeStart, status);
1259             }
1260             cpIdx++;
1261             if (cuIdx == inString.length()) {
1262                break;
1263             }
1264         }
1265     }
1266                 
1267     // bestSnlp[i] is the snlp of the best segmentation of the first i
1268     // code points in the range to be matched.
1269     UVector32 bestSnlp(numCodePts + 1, status);
1270     bestSnlp.addElement(0, status);
1271     for(int32_t i = 1; i <= numCodePts; i++) {
1272         bestSnlp.addElement(kuint32max, status);
1273     }
1274
1275
1276     // prev[i] is the index of the last CJK code point in the previous word in 
1277     // the best segmentation of the first i characters.
1278     UVector32 prev(numCodePts + 1, status);
1279     for(int32_t i = 0; i <= numCodePts; i++){
1280         prev.addElement(-1, status);
1281     }
1282
1283     const int32_t maxWordSize = 20;
1284     UVector32 values(numCodePts, status);
1285     values.setSize(numCodePts);
1286     UVector32 lengths(numCodePts, status);
1287     lengths.setSize(numCodePts);
1288
1289     UText fu = UTEXT_INITIALIZER;
1290     utext_openUnicodeString(&fu, &inString, &status);
1291
1292     // Dynamic programming to find the best segmentation.
1293
1294     // In outer loop, i  is the code point index,
1295     //                ix is the corresponding string (code unit) index.
1296     //    They differ when the string contains supplementary characters.
1297     int32_t ix = 0;
1298     bool is_prev_katakana = false;
1299     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1300         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1301             continue;
1302         }
1303
1304         int32_t count;
1305         utext_setNativeIndex(&fu, ix);
1306         count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1307                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1308                              // Note: lengths is filled with code point lengths
1309                              //       The NULL parameter is the ignored code unit lengths.
1310
1311         // if there are no single character matches found in the dictionary 
1312         // starting with this character, treat character as a 1-character word 
1313         // with the highest value possible, i.e. the least likely to occur.
1314         // Exclude Korean characters from this treatment, as they should be left
1315         // together by default.
1316         if ((count == 0 || lengths.elementAti(0) != 1) &&
1317                 !fHangulWordSet.contains(inString.char32At(ix))) {
1318             values.setElementAt(maxSnlp, count);   // 255
1319             lengths.setElementAt(1, count++);
1320         }
1321
1322         for (int32_t j = 0; j < count; j++) {
1323             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1324             int32_t ln_j_i = lengths.elementAti(j) + i;
1325             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1326                 bestSnlp.setElementAt(newSnlp, ln_j_i);
1327                 prev.setElementAt(i, ln_j_i);
1328             }
1329         }
1330
1331         // In Japanese,
1332         // Katakana word in single character is pretty rare. So we apply
1333         // the following heuristic to Katakana: any continuous run of Katakana
1334         // characters is considered a candidate word with a default cost
1335         // specified in the katakanaCost table according to its length.
1336
1337         bool is_katakana = isKatakana(inString.char32At(ix));
1338         int32_t katakanaRunLength = 1;
1339         if (!is_prev_katakana && is_katakana) {
1340             int32_t j = inString.moveIndex32(ix, 1);
1341             // Find the end of the continuous run of Katakana characters
1342             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1343                     isKatakana(inString.char32At(j))) {
1344                 j = inString.moveIndex32(j, 1);
1345                 katakanaRunLength++;
1346             }
1347             if (katakanaRunLength < kMaxKatakanaGroupLength) {
1348                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1349                 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
1350                     bestSnlp.setElementAt(newSnlp, j);
1351                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1352                 }
1353             }
1354         }
1355         is_prev_katakana = is_katakana;
1356     }
1357     utext_close(&fu);
1358
1359     // Start pushing the optimal offset index into t_boundary (t for tentative).
1360     // prev[numCodePts] is guaranteed to be meaningful.
1361     // We'll first push in the reverse order, i.e.,
1362     // t_boundary[0] = numCodePts, and afterwards do a swap.
1363     UVector32 t_boundary(numCodePts+1, status);
1364
1365     int32_t numBreaks = 0;
1366     // No segmentation found, set boundary to end of range
1367     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1368         t_boundary.addElement(numCodePts, status);
1369         numBreaks++;
1370     } else {
1371         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1372             t_boundary.addElement(i, status);
1373             numBreaks++;
1374         }
1375         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1376     }
1377
1378     // Add a break for the start of the dictionary range if there is not one
1379     // there already.
1380     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1381         t_boundary.addElement(0, status);
1382         numBreaks++;
1383     }
1384
1385     // Now that we're done, convert positions in t_boundary[] (indices in 
1386     // the normalized input string) back to indices in the original input UText
1387     // while reversing t_boundary and pushing values to foundBreaks.
1388     for (int32_t i = numBreaks-1; i >= 0; i--) {
1389         int32_t cpPos = t_boundary.elementAti(i);
1390         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1391         // Boundaries are added to foundBreaks output in ascending order.
1392         U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
1393         foundBreaks.push(utextPos, status);
1394     }
1395
1396     // inString goes out of scope
1397     // inputMap goes out of scope
1398     return numBreaks;
1399 }
1400 #endif
1401
1402 U_NAMESPACE_END
1403
1404 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1405