843b1f703572f6075a8966e1c861090df77d8a33
[platform/framework/web/crosswalk.git] / src / third_party / icu / source / tools / gennorm2 / n2builder.cpp
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  n2builder.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov25
14 *   created by: Markus W. Scherer
15 *
16 * Builds Normalizer2 data and writes a binary .nrm file.
17 * For the file format see source/common/normalizer2impl.h.
18 */
19
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #if U_HAVE_STD_STRING
27 #include <vector>
28 #endif
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/udata.h"
33 #include "unicode/uniset.h"
34 #include "unicode/unistr.h"
35 #include "unicode/ustring.h"
36 #include "hash.h"
37 #include "normalizer2impl.h"
38 #include "toolutil.h"
39 #include "unewdata.h"
40 #include "utrie2.h"
41 #include "uvectr32.h"
42
// Number of elements of a statically sized array, as an int32_t.
// Only meaningful for real arrays, not for pointers.
// The whole expansion is parenthesized so that it remains a single operand
// inside larger expressions (for example after sizeof or before a postfix operator).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
44
45 #if !UCONFIG_NO_NORMALIZATION
46
47 /* UDataInfo cf. udata.h */
48 static UDataInfo dataInfo={
49     sizeof(UDataInfo),
50     0,
51
52     U_IS_BIG_ENDIAN,
53     U_CHARSET_FAMILY,
54     U_SIZEOF_UCHAR,
55     0,
56
57     { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
58     { 2, 0, 0, 0 },             /* formatVersion */
59     { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
60 };
61
62 U_NAMESPACE_BEGIN
63
64 class HangulIterator {
65 public:
66     struct Range {
67         UChar32 start, limit;
68         uint16_t norm16;
69     };
70
71     HangulIterator() : rangeIndex(0) {}
72     const Range *nextRange() {
73         if(rangeIndex<LENGTHOF(ranges)) {
74             return ranges+rangeIndex++;
75         } else {
76             return NULL;
77         }
78     }
79     void reset() { rangeIndex=0; }
80 private:
81     static const Range ranges[4];
82     int32_t rangeIndex;
83 };
84
85 const HangulIterator::Range HangulIterator::ranges[4]={
86     { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
87     { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
88     // JAMO_T_BASE+1: not U+11A7
89     { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
90     { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
91 };
92
93 struct CompositionPair {
94     CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
95     UChar32 trail, composite;
96 };
97
98 struct Norm {
99     enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
100
101     UBool hasMapping() const { return mappingType>REMOVED; }
102
103     // Requires hasMapping() and well-formed mapping.
104     void setMappingCP() {
105         UChar32 c;
106         if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
107             mappingCP=c;
108         } else {
109             mappingCP=U_SENTINEL;
110         }
111     }
112
113     const CompositionPair *getCompositionPairs(int32_t &length) const {
114         if(compositions==NULL) {
115             length=0;
116             return NULL;
117         } else {
118             length=compositions->size()/2;
119             return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
120         }
121     }
122
123     UnicodeString *mapping;
124     UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
125     UChar32 mappingCP;  // >=0 if mapping to 1 code point
126     int32_t mappingPhase;
127     MappingType mappingType;
128
129     UVector32 *compositions;  // (trail, composite) pairs
130     uint8_t cc;
131     UBool combinesBack;
132     UBool hasNoCompBoundaryAfter;
133
134     enum OffsetType {
135         OFFSET_NONE,
136         // Composition for back-combining character. Allowed, but not normally used.
137         OFFSET_MAYBE_YES,
138         // Composition for a starter that does not have a decomposition mapping.
139         OFFSET_YES_YES,
140         // Round-trip mapping & composition for a starter.
141         OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
142         // Round-trip mapping for a starter that itself does not combine-forward.
143         OFFSET_YES_NO_MAPPING_ONLY,
144         // One-way mapping.
145         OFFSET_NO_NO,
146         // Delta for an algorithmic one-way mapping.
147         OFFSET_DELTA
148     };
149     enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
150     int32_t offset;
151 };
152
153 class Normalizer2DBEnumerator {
154 public:
155     Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
156     virtual ~Normalizer2DBEnumerator() {}
157     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
158     Normalizer2DBEnumerator *ptr() { return this; }
159 protected:
160     Normalizer2DataBuilder &builder;
161 };
162
163 U_CDECL_BEGIN
164
165 static UBool U_CALLCONV
166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
167     return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
168 }
169
170 U_CDECL_END
171
172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
173         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL) {
174     memset(unicodeVersion, 0, sizeof(unicodeVersion));
175     normTrie=utrie2_open(0, 0, &errorCode);
176     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
177     norms=allocNorm();  // unused Norm struct at index 0
178     memset(indexes, 0, sizeof(indexes));
179     memset(smallFCD, 0, sizeof(smallFCD));
180 }
181
182 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
183     utrie2_close(normTrie);
184     int32_t normsLength=utm_countItems(normMem);
185     for(int32_t i=1; i<normsLength; ++i) {
186         delete norms[i].mapping;
187         delete norms[i].rawMapping;
188         delete norms[i].compositions;
189     }
190     utm_close(normMem);
191     utrie2_close(norm16Trie);
192 }
193
194 void
195 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
196     UVersionInfo nullVersion={ 0, 0, 0, 0 };
197     UVersionInfo version;
198     u_versionFromString(version, v);
199     if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
200         0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
201     ) {
202         char buffer[U_MAX_VERSION_STRING_LENGTH];
203         u_versionToString(unicodeVersion, buffer);
204         fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
205                 buffer, v);
206         exit(U_ILLEGAL_ARGUMENT_ERROR);
207     }
208     memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
209 }
210
211 Norm *Normalizer2DataBuilder::allocNorm() {
212     Norm *p=(Norm *)utm_alloc(normMem);
213     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
214     return p;
215 }
216
217 /* get an existing Norm unit */
218 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
219     uint32_t i=utrie2_get32(normTrie, c);
220     if(i==0) {
221         return NULL;
222     }
223     return norms+i;
224 }
225
226 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
227     return norms[utrie2_get32(normTrie, c)];
228 }
229
230 /*
231  * get or create a Norm unit;
232  * get or create the intermediate trie entries for it as well
233  */
234 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
235     uint32_t i=utrie2_get32(normTrie, c);
236     if(i!=0) {
237         return norms+i;
238     } else {
239         /* allocate Norm */
240         Norm *p=allocNorm();
241         IcuToolErrorCode errorCode("gennorm2/createNorm()");
242         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
243         return p;
244     }
245 }
246
247 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
248     if(p!=NULL) {
249         if(p->mappingType!=Norm::NONE) {
250             if( overrideHandling==OVERRIDE_NONE ||
251                 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
252             ) {
253                 fprintf(stderr,
254                         "error in gennorm2 phase %d: "
255                         "not permitted to override mapping for U+%04lX from phase %d\n",
256                         (int)phase, (long)c, (int)p->mappingPhase);
257                 exit(U_INVALID_FORMAT_ERROR);
258             }
259             delete p->mapping;
260             p->mapping=NULL;
261         }
262         p->mappingPhase=phase;
263     }
264     return p;
265 }
266
267 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
268     overrideHandling=oh;
269     ++phase;
270 }
271
272 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
273     createNorm(c)->cc=cc;
274 }
275
276 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
277     return getNormRef(c).cc;
278 }
279
280 static UBool isWellFormed(const UnicodeString &s) {
281     UErrorCode errorCode=U_ZERO_ERROR;
282     u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
283     return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
284 }
285
286 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
287     if(!isWellFormed(m)) {
288         fprintf(stderr,
289                 "error in gennorm2 phase %d: "
290                 "illegal one-way mapping from U+%04lX to malformed string\n",
291                 (int)phase, (long)c);
292         exit(U_INVALID_FORMAT_ERROR);
293     }
294     Norm *p=checkNormForMapping(createNorm(c), c);
295     p->mapping=new UnicodeString(m);
296     p->mappingType=Norm::ONE_WAY;
297     p->setMappingCP();
298 }
299
300 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
301     if(U_IS_SURROGATE(c)) {
302         fprintf(stderr,
303                 "error in gennorm2 phase %d: "
304                 "illegal round-trip mapping from surrogate code point U+%04lX\n",
305                 (int)phase, (long)c);
306         exit(U_INVALID_FORMAT_ERROR);
307     }
308     if(!isWellFormed(m)) {
309         fprintf(stderr,
310                 "error in gennorm2 phase %d: "
311                 "illegal round-trip mapping from U+%04lX to malformed string\n",
312                 (int)phase, (long)c);
313         exit(U_INVALID_FORMAT_ERROR);
314     }
315     int32_t numCP=u_countChar32(m.getBuffer(), m.length());
316     if(numCP!=2) {
317         fprintf(stderr,
318                 "error in gennorm2 phase %d: "
319                 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
320                 (int)phase, (long)c, (int)numCP);
321         exit(U_INVALID_FORMAT_ERROR);
322     }
323     Norm *p=checkNormForMapping(createNorm(c), c);
324     p->mapping=new UnicodeString(m);
325     p->mappingType=Norm::ROUND_TRIP;
326     p->mappingCP=U_SENTINEL;
327 }
328
329 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
330     Norm *p=checkNormForMapping(getNorm(c), c);
331     if(p!=NULL) {
332         p->mappingType=Norm::REMOVED;
333     }
334 }
335
336 class CompositionBuilder : public Normalizer2DBEnumerator {
337 public:
338     CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
339     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
340         builder.addComposition(start, end, value);
341         return TRUE;
342     }
343 };
344
345 void
346 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
347     if(norms[value].mappingType==Norm::ROUND_TRIP) {
348         if(start!=end) {
349             fprintf(stderr,
350                     "gennorm2 error: same round-trip mapping for "
351                     "more than 1 code point U+%04lX..U+%04lX\n",
352                     (long)start, (long)end);
353             exit(U_INVALID_FORMAT_ERROR);
354         }
355         if(norms[value].cc!=0) {
356             fprintf(stderr,
357                     "gennorm2 error: "
358                     "U+%04lX has a round-trip mapping and ccc!=0, "
359                     "not possible in Unicode normalization\n",
360                     (long)start);
361             exit(U_INVALID_FORMAT_ERROR);
362         }
363         // setRoundTripMapping() ensured that there are exactly two code points.
364         const UnicodeString &m=*norms[value].mapping;
365         UChar32 lead=m.char32At(0);
366         UChar32 trail=m.char32At(m.length()-1);
367         if(getCC(lead)!=0) {
368             fprintf(stderr,
369                     "gennorm2 error: "
370                     "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
371                     "not possible in Unicode normalization\n",
372                     (long)start, (long)lead);
373             exit(U_INVALID_FORMAT_ERROR);
374         }
375         // Flag for trailing character.
376         createNorm(trail)->combinesBack=TRUE;
377         // Insert (trail, composite) pair into compositions list for the lead character.
378         IcuToolErrorCode errorCode("gennorm2/addComposition()");
379         Norm *leadNorm=createNorm(lead);
380         UVector32 *compositions=leadNorm->compositions;
381         int32_t i;
382         if(compositions==NULL) {
383             compositions=leadNorm->compositions=new UVector32(errorCode);
384             i=0;  // "insert" the first pair at index 0
385         } else {
386             // Insertion sort, and check for duplicate trail characters.
387             int32_t length;
388             const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
389             for(i=0; i<length; ++i) {
390                 if(trail==pairs[i].trail) {
391                     fprintf(stderr,
392                             "gennorm2 error: same round-trip mapping for "
393                             "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
394                             (long)start, (long)lead, (long)trail);
395                     exit(U_INVALID_FORMAT_ERROR);
396                 }
397                 if(trail<pairs[i].trail) {
398                     break;
399                 }
400             }
401         }
402         compositions->insertElementAt(trail, 2*i, errorCode);
403         compositions->insertElementAt(start, 2*i+1, errorCode);
404     }
405 }
406
407 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
408                                                     uint8_t lowCC, uint8_t highCC) const {
409     if((highCC-lowCC)>=2) {
410         int32_t length;
411         const CompositionPair *pairs=norm.getCompositionPairs(length);
412         for(int32_t i=0; i<length; ++i) {
413             uint8_t trailCC=getCC(pairs[i].trail);
414             if(lowCC<trailCC && trailCC<highCC) {
415                 return TRUE;
416             }
417         }
418     }
419     return FALSE;
420 }
421
422 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
423     int32_t length;
424     const CompositionPair *pairs=norm.getCompositionPairs(length);
425     for(int32_t i=0; i<length; ++i) {
426         if(trail==pairs[i].trail) {
427             return pairs[i].composite;
428         }
429         if(trail<pairs[i].trail) {
430             break;
431         }
432     }
433     return U_SENTINEL;
434 }
435
436 class Decomposer : public Normalizer2DBEnumerator {
437 public:
438     Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
439     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
440         didDecompose|=builder.decompose(start, end, value);
441         return TRUE;
442     }
443     UBool didDecompose;
444 };
445
446 UBool
447 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
448     if(norms[value].hasMapping()) {
449         Norm &norm=norms[value];
450         const UnicodeString &m=*norm.mapping;
451         UnicodeString *decomposed=NULL;
452         const UChar *s=m.getBuffer();
453         int32_t length=m.length();
454         int32_t prev, i=0;
455         UChar32 c;
456         while(i<length) {
457             prev=i;
458             U16_NEXT(s, i, length, c);
459             if(start<=c && c<=end) {
460                 fprintf(stderr,
461                         "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
462                         (long)c);
463                 exit(U_INVALID_FORMAT_ERROR);
464             }
465             const Norm &cNorm=getNormRef(c);
466             if(cNorm.hasMapping()) {
467                 if(norm.mappingType==Norm::ROUND_TRIP) {
468                     if(prev==0) {
469                         if(cNorm.mappingType!=Norm::ROUND_TRIP) {
470                             fprintf(stderr,
471                                     "gennorm2 error: "
472                                     "U+%04lX's round-trip mapping's starter "
473                                     "U+%04lX one-way-decomposes, "
474                                     "not possible in Unicode normalization\n",
475                                     (long)start, (long)c);
476                             exit(U_INVALID_FORMAT_ERROR);
477                         }
478                         uint8_t myTrailCC=getCC(m.char32At(i));
479                         UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
480                         uint8_t cTrailCC=getCC(cTrailChar);
481                         if(cTrailCC>myTrailCC) {
482                             fprintf(stderr,
483                                     "gennorm2 error: "
484                                     "U+%04lX's round-trip mapping's starter "
485                                     "U+%04lX decomposes and the "
486                                     "inner/earlier tccc=%hu > outer/following tccc=%hu, "
487                                     "not possible in Unicode normalization\n",
488                                     (long)start, (long)c,
489                                     (short)cTrailCC, (short)myTrailCC);
490                             exit(U_INVALID_FORMAT_ERROR);
491                         }
492                     } else {
493                         fprintf(stderr,
494                                 "gennorm2 error: "
495                                 "U+%04lX's round-trip mapping's non-starter "
496                                 "U+%04lX decomposes, "
497                                 "not possible in Unicode normalization\n",
498                                 (long)start, (long)c);
499                         exit(U_INVALID_FORMAT_ERROR);
500                     }
501                 }
502                 if(decomposed==NULL) {
503                     decomposed=new UnicodeString(m, 0, prev);
504                 }
505                 decomposed->append(*cNorm.mapping);
506             } else if(Hangul::isHangul(c)) {
507                 UChar buffer[3];
508                 int32_t hangulLength=Hangul::decompose(c, buffer);
509                 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
510                     fprintf(stderr,
511                             "gennorm2 error: "
512                             "U+%04lX's round-trip mapping's non-starter "
513                             "U+%04lX decomposes, "
514                             "not possible in Unicode normalization\n",
515                             (long)start, (long)c);
516                     exit(U_INVALID_FORMAT_ERROR);
517                 }
518                 if(decomposed==NULL) {
519                     decomposed=new UnicodeString(m, 0, prev);
520                 }
521                 decomposed->append(buffer, hangulLength);
522             } else if(decomposed!=NULL) {
523                 decomposed->append(m, prev, i-prev);
524             }
525         }
526         if(decomposed!=NULL) {
527             if(norm.rawMapping==NULL) {
528                 // Remember the original mapping when decomposing recursively.
529                 norm.rawMapping=norm.mapping;
530             } else {
531                 delete norm.mapping;
532             }
533             norm.mapping=decomposed;
534             // Not  norm.setMappingCP();  because the original mapping
535             // is most likely to be encodable as a delta.
536             return TRUE;
537         }
538     }
539     return FALSE;
540 }
541
542 class BuilderReorderingBuffer {
543 public:
544     BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
545     void reset() {
546         fLength=0;
547         fLastStarterIndex=-1;
548         fDidReorder=FALSE;
549     }
550     int32_t length() const { return fLength; }
551     UBool isEmpty() const { return fLength==0; }
552     int32_t lastStarterIndex() const { return fLastStarterIndex; }
553     UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
554     uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
555     UBool didReorder() const { return fDidReorder; }
556     void append(UChar32 c, uint8_t cc) {
557         if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
558             if(cc==0) {
559                 fLastStarterIndex=fLength;
560             }
561             fArray[fLength++]=(c<<8)|cc;
562             return;
563         }
564         // Let this character bubble back to its canonical order.
565         int32_t i=fLength-1;
566         while(i>fLastStarterIndex && ccAt(i)>cc) {
567             --i;
568         }
569         ++i;  // after the last starter or prevCC<=cc
570         // Move this and the following characters forward one to make space.
571         for(int32_t j=fLength; i<j; --j) {
572             fArray[j]=fArray[j-1];
573         }
574         fArray[i]=(c<<8)|cc;
575         ++fLength;
576         fDidReorder=TRUE;
577     }
578     void toString(UnicodeString &dest) {
579         dest.remove();
580         for(int32_t i=0; i<fLength; ++i) {
581             dest.append(charAt(i));
582         }
583     }
584     void setComposite(UChar32 composite, int32_t combMarkIndex) {
585         fArray[fLastStarterIndex]=composite<<8;
586         // Remove the combining mark that contributed to the composite.
587         --fLength;
588         while(combMarkIndex<fLength) {
589             fArray[combMarkIndex]=fArray[combMarkIndex+1];
590             ++combMarkIndex;
591         }
592     }
593 private:
594     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
595     int32_t fLength;
596     int32_t fLastStarterIndex;
597     UBool fDidReorder;
598 };
599
600 void
601 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
602     UnicodeString &m=*p->mapping;
603     int32_t length=m.length();
604     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
605         return;  // writeMapping() will complain about it and print the code point.
606     }
607     const UChar *s=m.getBuffer();
608     int32_t i=0;
609     UChar32 c;
610     while(i<length) {
611         U16_NEXT(s, i, length, c);
612         buffer.append(c, getCC(c));
613     }
614     if(buffer.didReorder()) {
615         buffer.toString(m);
616     }
617 }
618
619 /*
620  * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
621  * A starter character with a mapping does not have a composition boundary after it
622  * if the character itself combines-forward (which is tested by the caller of this function),
623  * or it is deleted (mapped to the empty string),
624  * or its mapping contains no starter,
625  * or the last starter combines-forward.
626  */
627 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
628     if(buffer.isEmpty()) {
629         return TRUE;  // maps-to-empty-string is no boundary of any kind
630     }
631     int32_t lastStarterIndex=buffer.lastStarterIndex();
632     if(lastStarterIndex<0) {
633         return TRUE;  // no starter
634     }
635     UChar32 starter=buffer.charAt(lastStarterIndex);
636     if( Hangul::isJamoL(starter) ||
637         (Hangul::isJamoV(starter) &&
638          0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
639     ) {
640         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
641         // otherwise it is blocked.
642         return lastStarterIndex==buffer.length()-1;
643     }
644     // Note: There can be no Hangul syllable in the fully decomposed mapping.
645     const Norm *starterNorm=&getNormRef(starter);
646     if(starterNorm->compositions==NULL) {
647         return FALSE;  // the last starter does not combine forward
648     }
649     // Compose as far as possible, and see if further compositions are possible.
650     uint8_t prevCC=0;
651     for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
652         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
653         if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
654             return TRUE;
655         }
656         if( prevCC<cc &&
657             (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
658         ) {
659             buffer.setComposite(starter, combMarkIndex);
660             starterNorm=&getNormRef(starter);
661             if(starterNorm->compositions==NULL) {
662                 return FALSE;  // the composite does not combine further
663             }
664         } else {
665             prevCC=cc;
666             ++combMarkIndex;
667         }
668     }
669     // TRUE if the final, forward-combining starter is at the end.
670     return prevCC==0;
671 }
672
673 // Requires p->hasMapping().
674 // Returns the offset of the "first unit" from the beginning of the extraData for c.
675 // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
676 int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
677     UnicodeString &m=*p->mapping;
678     int32_t length=m.length();
679     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
680         fprintf(stderr,
681                 "gennorm2 error: "
682                 "mapping for U+%04lX longer than maximum of %d\n",
683                 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
684         exit(U_INVALID_FORMAT_ERROR);
685     }
686     int32_t leadCC, trailCC;
687     if(length==0) {
688         leadCC=trailCC=0;
689     } else {
690         leadCC=getCC(m.char32At(0));
691         trailCC=getCC(m.char32At(length-1));
692     }
693     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
694         fprintf(stderr,
695                 "gennorm2 error: "
696                 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
697                 (long)c);
698         exit(U_INVALID_FORMAT_ERROR);
699     }
700     // Write small-FCD data.
701     if((leadCC|trailCC)!=0) {
702         UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
703         smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
704     }
705     // Write the mapping & raw mapping extraData.
706     int32_t firstUnit=length|(trailCC<<8);
707     int32_t preMappingLength=0;
708     if(p->rawMapping!=NULL) {
709         UnicodeString &rm=*p->rawMapping;
710         int32_t rmLength=rm.length();
711         if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
712             fprintf(stderr,
713                     "gennorm2 error: "
714                     "raw mapping for U+%04lX longer than maximum of %d\n",
715                     (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
716             exit(U_INVALID_FORMAT_ERROR);
717         }
718         UChar rm0=rm.charAt(0);
719         if( rmLength==length-1 &&
720             // 99: overlong substring lengths get pinned to remainder lengths anyway
721             0==rm.compare(1, 99, m, 2, 99) &&
722             rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
723         ) {
724             // Compression:
725             // rawMapping=rm0+mapping.substring(2) -> store only rm0
726             //
727             // The raw mapping is the same as the final mapping after replacing
728             // the final mapping's first two code units with the raw mapping's first one.
729             // In this case, we store only that first unit, rm0.
730             // This helps with a few hundred mappings.
731             dataString.append(rm0);
732             preMappingLength=1;
733         } else {
734             // Store the raw mapping with its length.
735             dataString.append(rm);
736             dataString.append((UChar)rmLength);
737             preMappingLength=rmLength+1;
738         }
739         firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
740     }
741     int32_t cccLccc=p->cc|(leadCC<<8);
742     if(cccLccc!=0) {
743         dataString.append((UChar)cccLccc);
744         ++preMappingLength;
745         firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
746     }
747     if(p->hasNoCompBoundaryAfter) {
748         firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
749     }
750     dataString.append((UChar)firstUnit);
751     dataString.append(m);
752     return preMappingLength;
753 }
754
755 // Requires p->compositions!=NULL.
756 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
757     if(p->cc!=0) {
758         fprintf(stderr,
759                 "gennorm2 error: "
760                 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
761                 (long)c);
762         exit(U_INVALID_FORMAT_ERROR);
763     }
764     int32_t length;
765     const CompositionPair *pairs=p->getCompositionPairs(length);
766     for(int32_t i=0; i<length; ++i) {
767         const CompositionPair &pair=pairs[i];
768         // 22 bits for the composite character and whether it combines forward.
769         UChar32 compositeAndFwd=pair.composite<<1;
770         if(getNormRef(pair.composite).compositions!=NULL) {
771             compositeAndFwd|=1;  // The composite character also combines-forward.
772         }
773         // Encode most pairs in two units and some in three.
774         int32_t firstUnit, secondUnit, thirdUnit;
775         if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
776             if(compositeAndFwd<=0xffff) {
777                 firstUnit=pair.trail<<1;
778                 secondUnit=compositeAndFwd;
779                 thirdUnit=-1;
780             } else {
781                 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
782                 secondUnit=compositeAndFwd>>16;
783                 thirdUnit=compositeAndFwd;
784             }
785         } else {
786             firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
787                        (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
788                       Normalizer2Impl::COMP_1_TRIPLE;
789             secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
790                        (compositeAndFwd>>16);
791             thirdUnit=compositeAndFwd;
792         }
793         // Set the high bit of the first unit if this is the last composition pair.
794         if(i==(length-1)) {
795             firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
796         }
797         dataString.append((UChar)firstUnit).append((UChar)secondUnit);
798         if(thirdUnit>=0) {
799             dataString.append((UChar)thirdUnit);
800         }
801     }
802 }
803
804 class ExtraDataWriter : public Normalizer2DBEnumerator {
805 public:
806     ExtraDataWriter(Normalizer2DataBuilder &b) :
807         Normalizer2DBEnumerator(b),
808         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
809         yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
810     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
811         if(value!=0) {
812             if(start!=end) {
813                 fprintf(stderr,
814                         "gennorm2 error: unexpected shared data for "
815                         "multiple code points U+%04lX..U+%04lX\n",
816                         (long)start, (long)end);
817                 exit(U_INTERNAL_PROGRAM_ERROR);
818             }
819             builder.writeExtraData(start, value, *this);
820         }
821         return TRUE;
822     }
823     UnicodeString maybeYesCompositions;
824     UnicodeString yesYesCompositions;
825     UnicodeString yesNoMappingsAndCompositions;
826     UnicodeString yesNoMappingsOnly;
827     UnicodeString noNoMappings;
828     Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
829 };
830
831 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
832     Norm *p=norms+value;
833     if(!p->hasMapping()) {
834         // Write small-FCD data.
835         // There is similar code in writeMapping() for characters that do have a mapping.
836         if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
837             fprintf(stderr,
838                     "gennorm2 error: "
839                     "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
840                     (long)c);
841             exit(U_INVALID_FORMAT_ERROR);
842         }
843         if(p->cc!=0) {
844             UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
845             smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
846         }
847     }
848     if(p->combinesBack) {
849         if(p->hasMapping()) {
850             fprintf(stderr,
851                     "gennorm2 error: "
852                     "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
853                     (long)c);
854             exit(U_INVALID_FORMAT_ERROR);
855         }
856         if(p->compositions!=NULL) {
857             p->offset=
858                 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
859                 Norm::OFFSET_MAYBE_YES;
860             writeCompositions(c, p, writer.maybeYesCompositions);
861         }
862     } else if(!p->hasMapping()) {
863         if(p->compositions!=NULL) {
864             p->offset=
865                 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
866                 Norm::OFFSET_YES_YES;
867             writeCompositions(c, p, writer.yesYesCompositions);
868         }
869     } else if(p->mappingType==Norm::ROUND_TRIP) {
870         if(p->compositions!=NULL) {
871             int32_t offset=writer.yesNoMappingsAndCompositions.length()+
872                            writeMapping(c, p, writer.yesNoMappingsAndCompositions);
873             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
874             writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
875         } else {
876             int32_t offset=writer.yesNoMappingsOnly.length()+
877                            writeMapping(c, p, writer.yesNoMappingsOnly);
878             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
879         }
880     } else /* one-way */ {
881         if(p->compositions!=NULL) {
882             fprintf(stderr,
883                     "gennorm2 error: "
884                     "U+%04lX combines-forward and has a one-way mapping, "
885                     "not possible in Unicode normalization\n",
886                     (long)c);
887             exit(U_INVALID_FORMAT_ERROR);
888         }
889         if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
890             // Try a compact, algorithmic encoding.
891             // Only for ccc=0, because we can't store additional information
892             // and we do not recursively follow an algorithmic encoding for access to the ccc.
893             //
894             // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
895             // if the mappingCP decomposes further, to ensure that there is a place to store it.
896             // We want to see that the final mapping does not have exactly 1 code point,
897             // or else we would have to recursively ensure that the final mapping is stored
898             // in normal extraData.
899             if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
900                 int32_t delta=p->mappingCP-c;
901                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
902                     p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
903                 }
904             }
905         }
906         if(p->offset==0) {
907             int32_t oldNoNoLength=writer.noNoMappings.length();
908             int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
909             UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
910             int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
911             if(previousOffset!=0) {
912                 // Duplicate, remove the new units and point to the old ones.
913                 writer.noNoMappings.truncate(oldNoNoLength);
914                 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
915             } else {
916                 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
917                 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
918                 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
919                 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
920             }
921         }
922     }
923 }
924
925 class Norm16Writer : public Normalizer2DBEnumerator {
926 public:
927     Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
928     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
929         builder.writeNorm16(start, end, value);
930         return TRUE;
931     }
932 };
933
934 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
935     if(value!=0) {
936         const Norm *p=norms+value;
937         int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
938         int32_t norm16=0;
939         UBool isDecompNo=FALSE;
940         UBool isCompNoMaybe=FALSE;
941         switch(p->offset&Norm::OFFSET_MASK) {
942         case Norm::OFFSET_NONE:
943             // No mapping, no compositions list.
944             if(p->combinesBack) {
945                 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
946                 isDecompNo=(UBool)(p->cc!=0);
947                 isCompNoMaybe=TRUE;
948             } else if(p->cc!=0) {
949                 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
950                 isDecompNo=isCompNoMaybe=TRUE;
951             }
952             break;
953         case Norm::OFFSET_MAYBE_YES:
954             norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
955             isCompNoMaybe=TRUE;
956             break;
957         case Norm::OFFSET_YES_YES:
958             norm16=offset;
959             break;
960         case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
961             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
962             isDecompNo=TRUE;
963             break;
964         case Norm::OFFSET_YES_NO_MAPPING_ONLY:
965             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
966             isDecompNo=TRUE;
967             break;
968         case Norm::OFFSET_NO_NO:
969             norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
970             isDecompNo=isCompNoMaybe=TRUE;
971             break;
972         case Norm::OFFSET_DELTA:
973             norm16=getCenterNoNoDelta()+offset;
974             isDecompNo=isCompNoMaybe=TRUE;
975             break;
976         default:  // Should not occur.
977             exit(U_INTERNAL_PROGRAM_ERROR);
978         }
979         IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
980         utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
981         if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
982             indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
983         }
984         if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
985             indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
986         }
987     }
988 }
989
990 void Normalizer2DataBuilder::setHangulData() {
991     HangulIterator hi;
992     const HangulIterator::Range *range;
993     // Check that none of the Hangul/Jamo code points have data.
994     while((range=hi.nextRange())!=NULL) {
995         for(UChar32 c=range->start; c<range->limit; ++c) {
996             if(utrie2_get32(norm16Trie, c)!=0) {
997                 fprintf(stderr,
998                         "gennorm2 error: "
999                         "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
1000                         (long)c);
1001                 exit(U_INVALID_FORMAT_ERROR);
1002             }
1003         }
1004     }
1005     // Set data for algorithmic runtime handling.
1006     IcuToolErrorCode errorCode("gennorm2/setHangulData()");
1007     hi.reset();
1008     while((range=hi.nextRange())!=NULL) {
1009         uint16_t norm16=range->norm16;
1010         if(norm16==0) {
1011             norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
1012             if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
1013                 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
1014             }
1015         } else {
1016             if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
1017                 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
1018             }
1019         }
1020         utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
1021         errorCode.assertSuccess();
1022     }
1023 }
1024
1025 U_CDECL_BEGIN
1026
1027 static UBool U_CALLCONV
1028 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1029     uint32_t *pMaxValue=(uint32_t *)context;
1030     if(value>*pMaxValue) {
1031         *pMaxValue=value;
1032     }
1033     return TRUE;
1034 }
1035
1036 U_CDECL_END
1037
1038 void Normalizer2DataBuilder::processData() {
1039     IcuToolErrorCode errorCode("gennorm2/processData()");
1040     norm16Trie=utrie2_open(0, 0, errorCode);
1041     errorCode.assertSuccess();
1042
1043     utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
1044
1045     Decomposer decomposer(*this);
1046     do {
1047         decomposer.didDecompose=FALSE;
1048         utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
1049     } while(decomposer.didDecompose);
1050
1051     BuilderReorderingBuffer buffer;
1052     int32_t normsLength=utm_countItems(normMem);
1053     for(int32_t i=1; i<normsLength; ++i) {
1054         // Set the hasNoCompBoundaryAfter flag for use by the last code branch
1055         // in Normalizer2Impl::hasCompBoundaryAfter().
1056         // For details see the comments on hasNoCompBoundaryAfter(buffer).
1057         const Norm &norm=norms[i];
1058         if(norm.hasMapping()) {
1059             if(norm.compositions!=NULL) {
1060                 norms[i].hasNoCompBoundaryAfter=TRUE;
1061             } else {
1062                 buffer.reset();
1063                 reorder(norms+i, buffer);
1064                 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
1065             }
1066         }
1067     }
1068
1069     indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
1070     indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
1071
1072     ExtraDataWriter extraDataWriter(*this);
1073     utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
1074
1075     extraData=extraDataWriter.maybeYesCompositions;
1076     extraData.append(extraDataWriter.yesYesCompositions).
1077               append(extraDataWriter.yesNoMappingsAndCompositions).
1078               append(extraDataWriter.yesNoMappingsOnly).
1079               append(extraDataWriter.noNoMappings);
1080     // Pad to even length for 4-byte alignment of following data.
1081     if(extraData.length()&1) {
1082         extraData.append((UChar)0);
1083     }
1084
1085     indexes[Normalizer2Impl::IX_MIN_YES_NO]=
1086         extraDataWriter.yesYesCompositions.length();
1087     indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
1088         indexes[Normalizer2Impl::IX_MIN_YES_NO]+
1089         extraDataWriter.yesNoMappingsAndCompositions.length();
1090     indexes[Normalizer2Impl::IX_MIN_NO_NO]=
1091         indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
1092         extraDataWriter.yesNoMappingsOnly.length();
1093     indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
1094         indexes[Normalizer2Impl::IX_MIN_NO_NO]+
1095         extraDataWriter.noNoMappings.length();
1096     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
1097         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
1098         extraDataWriter.maybeYesCompositions.length();
1099
1100     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
1101     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
1102         fprintf(stderr,
1103                 "gennorm2 error: "
1104                 "data structure overflow, too much mapping composition data\n");
1105         exit(U_BUFFER_OVERFLOW_ERROR);
1106     }
1107
1108     utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
1109
1110     setHangulData();
1111
1112     // Look for the "worst" norm16 value of any supplementary code point
1113     // corresponding to a lead surrogate, and set it as that surrogate's value.
1114     // Enables quick check inner loops to look at only code units.
1115     //
1116     // We could be more sophisticated:
1117     // We could collect a bit set for whether there are values in the different
1118     // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
1119     // and select the best value that only breaks the composition and/or decomposition
1120     // inner loops if necessary.
1121     // However, that seems like overkill for an optimization for supplementary characters.
1122     for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1123         uint32_t maxValue=utrie2_get32(norm16Trie, lead);
1124         utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
1125         if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
1126             maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
1127         ) {
1128             // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
1129             // Otherwise it might end up at something like JAMO_VT which stays in
1130             // the inner decomposition quick check loop.
1131             maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
1132         }
1133         utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
1134     }
1135
1136     // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
1137     // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
1138     // which is harmless.
1139     // As a result, the minimum code points are always BMP code points.
1140     int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
1141     if(minCP>=0x10000) {
1142         indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
1143     }
1144     minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
1145     if(minCP>=0x10000) {
1146         indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
1147     }
1148 }
1149
1150 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
1151     processData();
1152
1153     IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
1154     utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
1155     int32_t norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
1156     if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
1157         fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
1158                 errorCode.errorName());
1159         exit(errorCode.reset());
1160     }
1161     errorCode.reset();
1162     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1163     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1164     errorCode.assertSuccess();
1165
1166     int32_t offset=(int32_t)sizeof(indexes);
1167     indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
1168     offset+=norm16TrieLength;
1169     indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
1170     offset+=extraData.length()*2;
1171     indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
1172     offset+=sizeof(smallFCD);
1173     int32_t totalSize=offset;
1174     for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
1175         indexes[i]=totalSize;
1176     }
1177
1178     if(beVerbose) {
1179         printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
1180         printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
1181         printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
1182         printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
1183         printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
1184         printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
1185         printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
1186         printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
1187         printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
1188         printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
1189         printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
1190     }
1191
1192     UVersionInfo nullVersion={ 0, 0, 0, 0 };
1193     if(0==memcmp(nullVersion, unicodeVersion, 4)) {
1194         u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
1195     }
1196     memcpy(dataInfo.dataVersion, unicodeVersion, 4);
1197     UNewDataMemory *pData=
1198         udata_create(NULL, NULL, filename, &dataInfo,
1199                      haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
1200     if(errorCode.isFailure()) {
1201         fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
1202                 filename, errorCode.errorName());
1203         exit(errorCode.reset());
1204     }
1205     udata_writeBlock(pData, indexes, sizeof(indexes));
1206     udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
1207     udata_writeUString(pData, extraData.getBuffer(), extraData.length());
1208     udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
1209     int32_t writtenSize=udata_finish(pData, errorCode);
1210     if(errorCode.isFailure()) {
1211         fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
1212         exit(errorCode.reset());
1213     }
1214     if(writtenSize!=totalSize) {
1215         fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
1216             (long)writtenSize, (long)totalSize);
1217         exit(U_INTERNAL_PROGRAM_ERROR);
1218     }
1219 }
1220
1221 U_NAMESPACE_END
1222
1223 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1224
1225 /*
1226  * Hey, Emacs, please set the following:
1227  *
1228  * Local Variables:
1229  * indent-tabs-mode: nil
1230  * End:
1231  */