Imported Upstream version 58.1
[platform/upstream/icu.git] / source / tools / makeconv / makeconv.cpp
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ********************************************************************************
5  *
6  *   Copyright (C) 1998-2015, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *
9  ********************************************************************************
10  *
11  *
12  *  makeconv.cpp:
13  *  tool creating a binary (compressed) representation of the conversion mapping
14  *  table (IBM NLTC ucmap format).
15  *
16  *  05/04/2000    helena     Added fallback mapping into the picture...
17  *  06/29/2000  helena      Major rewrite of the callback APIs.
18  */
19
20 #include <stdio.h>
21 #include "unicode/putil.h"
22 #include "unicode/ucnv_err.h"
23 #include "charstr.h"
24 #include "ucnv_bld.h"
25 #include "ucnv_imp.h"
26 #include "ucnv_cnv.h"
27 #include "cstring.h"
28 #include "cmemory.h"
29 #include "uinvchar.h"
30 #include "filestrm.h"
31 #include "toolutil.h"
32 #include "uoptions.h"
33 #include "unicode/udata.h"
34 #include "unewdata.h"
35 #include "uparse.h"
36 #include "ucm.h"
37 #include "makeconv.h"
38 #include "genmbcs.h"
39
40 #define DEBUG 0
41
42 typedef struct ConvData {
43     UCMFile *ucm;
44     NewConverter *cnvData, *extData;
45     UConverterSharedData sharedData;
46     UConverterStaticData staticData;
47 } ConvData;
48
49 static void
50 initConvData(ConvData *data) {
51     uprv_memset(data, 0, sizeof(ConvData));
52     data->sharedData.structSize=sizeof(UConverterSharedData);
53     data->staticData.structSize=sizeof(UConverterStaticData);
54     data->sharedData.staticData=&data->staticData;
55 }
56
57 static void
58 cleanupConvData(ConvData *data) {
59     if(data!=NULL) {
60         if(data->cnvData!=NULL) {
61             data->cnvData->close(data->cnvData);
62             data->cnvData=NULL;
63         }
64         if(data->extData!=NULL) {
65             data->extData->close(data->extData);
66             data->extData=NULL;
67         }
68         ucm_close(data->ucm);
69         data->ucm=NULL;
70     }
71 }
72
73 /*
74  * from ucnvstat.c - static prototypes of data-based converters
75  */
76 U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES];
77
/*
 * Global option flags. All default to FALSE and are set from the
 * command line in main().
 */
UBool VERBOSE = FALSE;            /* --verbose: print progress details */
UBool QUIET = FALSE;              /* --quiet: suppress the name-mismatch warning in main() */
UBool SMALL = FALSE;              /* --small: request smaller .cnv files (not read in this file) */
UBool IGNORE_SISO_CHECK = FALSE;  /* --ignore-siso-check: passed to ucm_processStates() */
85
86 static void
87 createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
88
89 /*
90  * Set up the UNewData and write the converter..
91  */
92 static void
93 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
94
/*
 * Whether U_COPYRIGHT_STRING is passed to udata_create();
 * main() overwrites this with the state of the --copyright option.
 */
UBool haveCopyright=TRUE;

/*
 * Data header description handed to udata_create() for every .cnv file.
 * main() copies the ICU version into dataVersion before any file is written.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {0x63, 0x6e, 0x76, 0x74},     /* dataFormat="cnvt" */
    {6, 2, 0, 0},                 /* formatVersion */
    {0, 0, 0, 0}                  /* dataVersion (calculated at runtime) */
};
110
111 static void
112 writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
113 {
114     UNewDataMemory *mem = NULL;
115     uint32_t sz2;
116     uint32_t size = 0;
117     int32_t tableType;
118
119     if(U_FAILURE(*status))
120       {
121         return;
122       }
123
124     tableType=TABLE_NONE;
125     if(data->cnvData!=NULL) {
126         tableType|=TABLE_BASE;
127     }
128     if(data->extData!=NULL) {
129         tableType|=TABLE_EXT;
130     }
131
132     mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
133
134     if(U_FAILURE(*status))
135       {
136         fprintf(stderr, "Couldn't create the udata %s.%s: %s\n",
137                 cnvName,
138                 "cnv",
139                 u_errorName(*status));
140         return;
141       }
142
143     if(VERBOSE)
144       {
145         printf("- Opened udata %s.%s\n", cnvName, "cnv");
146       }
147
148
149     /* all read only, clean, platform independent data.  Mmmm. :)  */
150     udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
151     size += sizeof(UConverterStaticData); /* Is 4-aligned  - by size */
152     /* Now, write the table */
153     if(tableType&TABLE_BASE) {
154         size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
155     }
156     if(tableType&TABLE_EXT) {
157         size += data->extData->write(data->extData, &data->staticData, mem, tableType);
158     }
159
160     sz2 = udata_finish(mem, status);
161     if(size != sz2)
162     {
163         fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size);
164         *status=U_INTERNAL_PROGRAM_ERROR;
165     }
166     if(VERBOSE)
167     {
168       printf("- Wrote %u bytes to the udata.\n", (int)sz2);
169     }
170 }
171
/*
 * Indexes into options[] below. The enumerators must stay in the same
 * order as the array entries because main() reads options[OPT_...].
 */
enum {
    OPT_HELP_H,
    OPT_HELP_QUESTION_MARK,
    OPT_COPYRIGHT,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_VERBOSE,
    OPT_SMALL,
    OPT_IGNORE_SISO_CHECK,
    OPT_QUIET,

    OPT_COUNT
};

/*
 * Command-line options, parsed by u_parseArgs() in main() and indexed by
 * the OPT_* constants above. "small" and "ignore-siso-check" are listed in
 * the usage text with a long name only.
 */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_COPYRIGHT,
    UOPTION_VERSION,
    UOPTION_DESTDIR,
    UOPTION_VERBOSE,
    { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    { "ignore-siso-check", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
    UOPTION_QUIET,
};
197
198 int main(int argc, char* argv[])
199 {
200     ConvData data;
201     char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
202
203     U_MAIN_INIT_ARGS(argc, argv);
204
205     /* Set up the ICU version number */
206     UVersionInfo icuVersion;
207     u_getVersion(icuVersion);
208     uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
209
210     /* preset then read command line options */
211     options[OPT_DESTDIR].value=u_getDataDirectory();
212     argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
213
214     /* error handling, printing usage message */
215     if(argc<0) {
216         fprintf(stderr,
217             "error in command line argument \"%s\"\n",
218             argv[-argc]);
219     } else if(argc<2) {
220         argc=-1;
221     }
222     if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
223         FILE *stdfile=argc<0 ? stderr : stdout;
224         fprintf(stdfile,
225             "usage: %s [-options] files...\n"
226             "\tread .ucm codepage mapping files and write .cnv files\n"
227             "options:\n"
228             "\t-h or -? or --help  this usage text\n"
229             "\t-V or --version     show a version message\n"
230             "\t-c or --copyright   include a copyright notice\n"
231             "\t-d or --destdir     destination directory, followed by the path\n"
232             "\t-v or --verbose     Turn on verbose output\n"
233             "\t-q or --quiet       do not display warnings and progress\n",
234             argv[0]);
235         fprintf(stdfile,
236             "\t      --small       Generate smaller .cnv files. They will be\n"
237             "\t                    significantly smaller but may not be compatible with\n"
238             "\t                    older versions of ICU and will require heap memory\n"
239             "\t                    allocation when loaded.\n"
240             "\t      --ignore-siso-check         Use SI/SO other than 0xf/0xe.\n");
241         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
242     }
243
244     if(options[OPT_VERSION].doesOccur) {
245         printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
246                dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
247         printf("%s\n", U_COPYRIGHT_STRING);
248         exit(0);
249     }
250
251     /* get the options values */
252     haveCopyright = options[OPT_COPYRIGHT].doesOccur;
253     const char *destdir = options[OPT_DESTDIR].value;
254     VERBOSE = options[OPT_VERBOSE].doesOccur;
255     QUIET = options[OPT_QUIET].doesOccur;
256     SMALL = options[OPT_SMALL].doesOccur;
257
258     if (options[OPT_IGNORE_SISO_CHECK].doesOccur) {
259         IGNORE_SISO_CHECK = TRUE;
260     }
261
262     icu::CharString outFileName;
263     UErrorCode err = U_ZERO_ERROR;
264     if (destdir != NULL && *destdir != 0) {
265         outFileName.append(destdir, err).ensureEndsWithFileSeparator(err);
266         if (U_FAILURE(err)) {
267             return err;
268         }
269     }
270     int32_t outBasenameStart = outFileName.length();
271
272 #if DEBUG
273     {
274       int i;
275       printf("makeconv: processing %d files...\n", argc - 1);
276       for(i=1; i<argc; ++i) {
277         printf("%s ", argv[i]);
278       }
279       printf("\n");
280       fflush(stdout);
281     }
282 #endif
283
284     UBool printFilename = (UBool) (argc > 2 || VERBOSE);
285     for (++argv; --argc; ++argv)
286     {
287         UErrorCode localError = U_ZERO_ERROR;
288         const char *arg = getLongPathname(*argv);
289
290         /*produces the right destination path for display*/
291         outFileName.truncate(outBasenameStart);
292         if (outBasenameStart != 0)
293         {
294             /* find the last file sepator */
295             const char *basename = findBasename(arg);
296             outFileName.append(basename, localError);
297         }
298         else
299         {
300             outFileName.append(arg, localError);
301         }
302         if (U_FAILURE(localError)) {
303             return localError;
304         }
305
306         /*removes the extension if any is found*/
307         int32_t lastDotIndex = outFileName.lastIndexOf('.');
308         if (lastDotIndex >= outBasenameStart) {
309             outFileName.truncate(lastDotIndex);
310         }
311
312         /* the basename without extension is the converter name */
313         if ((outFileName.length() - outBasenameStart) >= UPRV_LENGTHOF(cnvName)) {
314             fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart);
315             return U_BUFFER_OVERFLOW_ERROR;
316         }
317         uprv_strcpy(cnvName, outFileName.data() + outBasenameStart);
318
319         /*Adds the target extension*/
320         outFileName.append(CONVERTER_FILE_EXTENSION, localError);
321         if (U_FAILURE(localError)) {
322             return localError;
323         }
324
325 #if DEBUG
326         printf("makeconv: processing %s  ...\n", arg);
327         fflush(stdout);
328 #endif
329         initConvData(&data);
330         createConverter(&data, arg, &localError);
331
332         if (U_FAILURE(localError))
333         {
334             /* if an error is found, print out an error msg and keep going */
335             fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n",
336                     outFileName.data(), arg, u_errorName(localError));
337             if(U_SUCCESS(err)) {
338                 err = localError;
339             }
340         }
341         else
342         {
343             /* Insure the static data name matches the  file name */
344             /* Changed to ignore directory and only compare base name
345              LDH 1/2/08*/
346             char *p;
347             p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */
348
349             if(p == NULL)            /* OK, try alternate */
350             {
351                 p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR);
352                 if(p == NULL)
353                 {
354                     p=cnvName; /* If no separators, no problem */
355                 }
356             }
357             else
358             {
359                 p++;   /* If found separator, don't include it in compare */
360             }
361             if(uprv_stricmp(p,data.staticData.name) && !QUIET)
362             {
363                 fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
364                     cnvName,  CONVERTER_FILE_EXTENSION,
365                     data.staticData.name);
366             }
367
368             uprv_strcpy((char*)data.staticData.name, cnvName);
369
370             if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
371                 fprintf(stderr,
372                     "Error: A converter name must contain only invariant characters.\n"
373                     "%s is not a valid converter name.\n",
374                     data.staticData.name);
375                 if(U_SUCCESS(err)) {
376                     err = U_INVALID_TABLE_FORMAT;
377                 }
378             }
379
380             localError = U_ZERO_ERROR;
381             writeConverterData(&data, cnvName, destdir, &localError);
382
383             if(U_FAILURE(localError))
384             {
385                 /* if an error is found, print out an error msg and keep going*/
386                 fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg,
387                     u_errorName(localError));
388                 if(U_SUCCESS(err)) {
389                     err = localError;
390                 }
391             }
392             else if (printFilename)
393             {
394                 puts(outFileName.data() + outBasenameStart);
395             }
396         }
397         fflush(stdout);
398         fflush(stderr);
399
400         cleanupConvData(&data);
401     }
402
403     return err;
404 }
405
406 static void
407 getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) {
408     if( (name[0]=='i' || name[0]=='I') &&
409         (name[1]=='b' || name[1]=='B') &&
410         (name[2]=='m' || name[2]=='M')
411     ) {
412         name+=3;
413         if(*name=='-') {
414             ++name;
415         }
416         *pPlatform=UCNV_IBM;
417         *pCCSID=(int32_t)uprv_strtoul(name, NULL, 10);
418     } else {
419         *pPlatform=UCNV_UNKNOWN;
420         *pCCSID=0;
421     }
422 }
423
424 static void
425 readHeader(ConvData *data,
426            FileStream* convFile,
427            UErrorCode *pErrorCode) {
428     char line[1024];
429     char *s, *key, *value;
430     const UConverterStaticData *prototype;
431     UConverterStaticData *staticData;
432
433     if(U_FAILURE(*pErrorCode)) {
434         return;
435     }
436
437     staticData=&data->staticData;
438     staticData->platform=UCNV_IBM;
439     staticData->subCharLen=0;
440
441     while(T_FileStream_readLine(convFile, line, sizeof(line))) {
442         /* basic parsing and handling of state-related items */
443         if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
444             continue;
445         }
446
447         /* stop at the beginning of the mapping section */
448         if(uprv_strcmp(line, "CHARMAP")==0) {
449             break;
450         }
451
452         /* collect the information from the header field, ignore unknown keys */
453         if(uprv_strcmp(key, "code_set_name")==0) {
454             if(*value!=0) {
455                 uprv_strcpy((char *)staticData->name, value);
456                 getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
457             }
458         } else if(uprv_strcmp(key, "subchar")==0) {
459             uint8_t bytes[UCNV_EXT_MAX_BYTES];
460             int8_t length;
461
462             s=value;
463             length=ucm_parseBytes(bytes, line, (const char **)&s);
464             if(1<=length && length<=4 && *s==0) {
465                 staticData->subCharLen=length;
466                 uprv_memcpy(staticData->subChar, bytes, length);
467             } else {
468                 fprintf(stderr, "error: illegal <subchar> %s\n", value);
469                 *pErrorCode=U_INVALID_TABLE_FORMAT;
470                 return;
471             }
472         } else if(uprv_strcmp(key, "subchar1")==0) {
473             uint8_t bytes[UCNV_EXT_MAX_BYTES];
474
475             s=value;
476             if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) {
477                 staticData->subChar1=bytes[0];
478             } else {
479                 fprintf(stderr, "error: illegal <subchar1> %s\n", value);
480                 *pErrorCode=U_INVALID_TABLE_FORMAT;
481                 return;
482             }
483         }
484     }
485
486     /* copy values from the UCMFile to the static data */
487     staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
488     staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
489     staticData->conversionType=data->ucm->states.conversionType;
490
491     if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
492         fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
493         *pErrorCode=U_INVALID_TABLE_FORMAT;
494         return;
495     }
496
497     /*
498      * Now that we know the type, copy any 'default' values from the table.
499      * We need not check the type any further because the parser only
500      * recognizes what we have prototypes for.
501      *
502      * For delta (extension-only) tables, copy values from the base file
503      * instead, see createConverter().
504      */
505     if(data->ucm->baseName[0]==0) {
506         prototype=ucnv_converterStaticData[staticData->conversionType];
507         if(prototype!=NULL) {
508             if(staticData->name[0]==0) {
509                 uprv_strcpy((char *)staticData->name, prototype->name);
510             }
511
512             if(staticData->codepage==0) {
513                 staticData->codepage=prototype->codepage;
514             }
515
516             if(staticData->platform==0) {
517                 staticData->platform=prototype->platform;
518             }
519
520             if(staticData->minBytesPerChar==0) {
521                 staticData->minBytesPerChar=prototype->minBytesPerChar;
522             }
523
524             if(staticData->maxBytesPerChar==0) {
525                 staticData->maxBytesPerChar=prototype->maxBytesPerChar;
526             }
527
528             if(staticData->subCharLen==0) {
529                 staticData->subCharLen=prototype->subCharLen;
530                 if(prototype->subCharLen>0) {
531                     uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
532                 }
533             }
534         }
535     }
536
537     if(data->ucm->states.outputType<0) {
538         data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1;
539     }
540
541     if( staticData->subChar1!=0 &&
542             (staticData->minBytesPerChar>1 ||
543                 (staticData->conversionType!=UCNV_MBCS &&
544                  staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
545     ) {
546         fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
547         *pErrorCode=U_INVALID_TABLE_FORMAT;
548     }
549 }
550
551 /* return TRUE if a base table was read, FALSE for an extension table */
552 static UBool
553 readFile(ConvData *data, const char* converterName,
554          UErrorCode *pErrorCode) {
555     char line[1024];
556     char *end;
557     FileStream *convFile;
558
559     UCMStates *baseStates;
560     UBool dataIsBase;
561
562     if(U_FAILURE(*pErrorCode)) {
563         return FALSE;
564     }
565
566     data->ucm=ucm_open();
567
568     convFile=T_FileStream_open(converterName, "r");
569     if(convFile==NULL) {
570         *pErrorCode=U_FILE_ACCESS_ERROR;
571         return FALSE;
572     }
573
574     readHeader(data, convFile, pErrorCode);
575     if(U_FAILURE(*pErrorCode)) {
576         return FALSE;
577     }
578
579     if(data->ucm->baseName[0]==0) {
580         dataIsBase=TRUE;
581         baseStates=&data->ucm->states;
582         ucm_processStates(baseStates, IGNORE_SISO_CHECK);
583     } else {
584         dataIsBase=FALSE;
585         baseStates=NULL;
586     }
587
588     /* read the base table */
589     ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode);
590     if(U_FAILURE(*pErrorCode)) {
591         return FALSE;
592     }
593
594     /* read an extension table if there is one */
595     while(T_FileStream_readLine(convFile, line, sizeof(line))) {
596         end=uprv_strchr(line, 0);
597         while(line<end &&
598               (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
599             --end;
600         }
601         *end=0;
602
603         if(line[0]=='#' || u_skipWhitespace(line)==end) {
604             continue; /* ignore empty and comment lines */
605         }
606
607         if(0==uprv_strcmp(line, "CHARMAP")) {
608             /* read the extension table */
609             ucm_readTable(data->ucm, convFile, FALSE, baseStates, pErrorCode);
610         } else {
611             fprintf(stderr, "unexpected text after the base mapping table\n");
612         }
613         break;
614     }
615
616     T_FileStream_close(convFile);
617
618     if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
619         fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
620         *pErrorCode=U_INVALID_TABLE_FORMAT;
621     }
622
623     return dataIsBase;
624 }
625
626 static void
627 createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
628     ConvData baseData;
629     UBool dataIsBase;
630
631     UConverterStaticData *staticData;
632     UCMStates *states, *baseStates;
633
634     if(U_FAILURE(*pErrorCode)) {
635         return;
636     }
637
638     initConvData(data);
639
640     dataIsBase=readFile(data, converterName, pErrorCode);
641     if(U_FAILURE(*pErrorCode)) {
642         return;
643     }
644
645     staticData=&data->staticData;
646     states=&data->ucm->states;
647
648     if(dataIsBase) {
649         /*
650          * Build a normal .cnv file with a base table
651          * and an optional extension table.
652          */
653         data->cnvData=MBCSOpen(data->ucm);
654         if(data->cnvData==NULL) {
655             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
656
657         } else if(!data->cnvData->isValid(data->cnvData,
658                             staticData->subChar, staticData->subCharLen)
659         ) {
660             fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
661             *pErrorCode=U_INVALID_TABLE_FORMAT;
662
663         } else if(staticData->subChar1!=0 &&
664                     !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
665         ) {
666             fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
667             *pErrorCode=U_INVALID_TABLE_FORMAT;
668
669         } else if(
670             data->ucm->ext->mappingsLength>0 &&
671             !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
672         ) {
673             *pErrorCode=U_INVALID_TABLE_FORMAT;
674         } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
675             /* sort the table so that it can be turned into UTF-8-friendly data */
676             ucm_sortTable(data->ucm->base);
677         }
678
679         if(U_SUCCESS(*pErrorCode)) {
680             if(
681                 /* add the base table after ucm_checkBaseExt()! */
682                 !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
683             ) {
684                 *pErrorCode=U_INVALID_TABLE_FORMAT;
685             } else {
686                 /*
687                  * addTable() may have requested moving more mappings to the extension table
688                  * if they fit into the base toUnicode table but not into the
689                  * base fromUnicode table.
690                  * (Especially for UTF-8-friendly fromUnicode tables.)
691                  * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
692                  * to be excluded from the extension toUnicode data.
693                  * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
694                  * the base fromUnicode table.
695                  */
696                 ucm_moveMappings(data->ucm->base, data->ucm->ext);
697                 ucm_sortTable(data->ucm->ext);
698                 if(data->ucm->ext->mappingsLength>0) {
699                     /* prepare the extension table, if there is one */
700                     data->extData=CnvExtOpen(data->ucm);
701                     if(data->extData==NULL) {
702                         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
703                     } else if(
704                         !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
705                     ) {
706                         *pErrorCode=U_INVALID_TABLE_FORMAT;
707                     }
708                 }
709             }
710         }
711     } else {
712         /* Build an extension-only .cnv file. */
713         char baseFilename[500];
714         char *basename;
715
716         initConvData(&baseData);
717
718         /* assemble a path/filename for data->ucm->baseName */
719         uprv_strcpy(baseFilename, converterName);
720         basename=(char *)findBasename(baseFilename);
721         uprv_strcpy(basename, data->ucm->baseName);
722         uprv_strcat(basename, ".ucm");
723
724         /* read the base table */
725         dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
726         if(U_FAILURE(*pErrorCode)) {
727             return;
728         } else if(!dataIsBase) {
729             fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
730             *pErrorCode=U_INVALID_TABLE_FORMAT;
731         } else {
732             /* prepare the extension table */
733             data->extData=CnvExtOpen(data->ucm);
734             if(data->extData==NULL) {
735                 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
736             } else {
737                 /* fill in gaps in extension file header fields */
738                 UCMapping *m, *mLimit;
739                 uint8_t fallbackFlags;
740
741                 baseStates=&baseData.ucm->states;
742                 if(states->conversionType==UCNV_DBCS) {
743                     staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
744                 } else if(states->minCharLength==0) {
745                     staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
746                 }
747                 if(states->maxCharLength<states->minCharLength) {
748                     staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
749                 }
750
751                 if(staticData->subCharLen==0) {
752                     uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
753                     staticData->subCharLen=baseData.staticData.subCharLen;
754                 }
755                 /*
756                  * do not copy subChar1 -
757                  * only use what is explicitly specified
758                  * because it cannot be unset in the extension file header
759                  */
760
761                 /* get the fallback flags */
762                 fallbackFlags=0;
763                 for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
764                     m<mLimit && fallbackFlags!=3;
765                     ++m
766                 ) {
767                     if(m->f==1) {
768                         fallbackFlags|=1;
769                     } else if(m->f==3) {
770                         fallbackFlags|=2;
771                     }
772                 }
773
774                 if(fallbackFlags&1) {
775                     staticData->hasFromUnicodeFallback=TRUE;
776                 }
777                 if(fallbackFlags&2) {
778                     staticData->hasToUnicodeFallback=TRUE;
779                 }
780
781                 if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
782                     fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
783                     *pErrorCode=U_INVALID_TABLE_FORMAT;
784
785                 } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
786                     fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
787                     *pErrorCode=U_INVALID_TABLE_FORMAT;
788
789                 } else if(
790                     !ucm_checkValidity(data->ucm->ext, baseStates) ||
791                     !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
792                 ) {
793                     *pErrorCode=U_INVALID_TABLE_FORMAT;
794                 } else {
795                     if(states->maxCharLength>1) {
796                         /*
797                          * When building a normal .cnv file with a base table
798                          * for an MBCS (not SBCS) table with explicit precision flags,
799                          * the MBCSAddTable() function marks some mappings for moving
800                          * to the extension table.
801                          * They fit into the base toUnicode table but not into the
802                          * base fromUnicode table.
803                          * (Note: We do have explicit precision flags because they are
804                          * required for extension table generation, and
805                          * ucm_checkBaseExt() verified it.)
806                          *
807                          * We do not call MBCSAddTable() here (we probably could)
808                          * so we need to do the analysis before building the extension table.
809                          * We assume that MBCSAddTable() will build a UTF-8-friendly table.
810                          * Redundant mappings in the extension table are ok except they cost some size.
811                          *
812                          * Do this after ucm_checkBaseExt().
813                          */
814                         const MBCSData *mbcsData=MBCSGetDummy();
815                         int32_t needsMove=0;
816                         for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
817                             m<mLimit;
818                             ++m
819                         ) {
820                             if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
821                                 m->f|=MBCS_FROM_U_EXT_FLAG;
822                                 m->moveFlag=UCM_MOVE_TO_EXT;
823                                 ++needsMove;
824                             }
825                         }
826
827                         if(needsMove!=0) {
828                             ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
829                             ucm_sortTable(data->ucm->ext);
830                         }
831                     }
832                     if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
833                         *pErrorCode=U_INVALID_TABLE_FORMAT;
834                     }
835                 }
836             }
837         }
838
839         cleanupConvData(&baseData);
840     }
841 }
842
843 /*
844  * Hey, Emacs, please set the following:
845  *
846  * Local Variables:
847  * indent-tabs-mode: nil
848  * End:
849  *
850  */