Imported Upstream version 58.1
[platform/upstream/icu.git] / source / tools / gennorm2 / gennorm2.cpp
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  gennorm2.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov25
16 *   created by: Markus W. Scherer
17 *
18 *   This program reads text files that define Unicode normalization,
19 *   parses them, and builds a binary data file.
20 */
21
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include "unicode/errorcode.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/putil.h"
31 #include "unicode/uchar.h"
32 #include "unicode/unistr.h"
33 #include "charstr.h"
34 #include "normalizer2impl.h"
35 #include "toolutil.h"
36 #include "uoptions.h"
37 #include "uparse.h"
38
39 #if UCONFIG_NO_NORMALIZATION
40 #include "unewdata.h"
41 #endif
42
43 U_NAMESPACE_BEGIN
44
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
46
// Declares the LocalStdioFilePointer type, which main() uses to hold the
// FILE * returned by fopen() for each input file.
// NOTE(review): presumably the macro generates a scoped owner that calls
// fclose() when it goes out of scope -- confirm in unicode/localpointer.h.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
48
#if !UCONFIG_NO_NORMALIZATION
// Forward declaration; the definition follows main().
// Parses one already-opened input file line by line and passes the parsed
// data (ccc values, mappings, mapping removals) to the builder.
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
#endif
52
53 /* -------------------------------------------------------------------------- */
54
// Indexes into options[]. The enumerators must stay in the same order as
// the entries of that array because main() looks options up by these values.
enum {
    HELP_H=0,            // -h
    HELP_QUESTION_MARK,  // -?
    VERBOSE,             // -v / --verbose
    COPYRIGHT,           // -c / --copyright
    SOURCEDIR,           // -s / --sourcedir
    OUTPUT_FILENAME,     // -o / --output
    UNICODE_VERSION,     // -u / --unicode
    WRITE_C_SOURCE,      // --csource
    OPT_FAST             // --fast
};
66
// Command line option table passed to u_parseArgs() in main().
// Entries are accessed by index via the anonymous enum above
// (HELP_H ... OPT_FAST), so the order here must match that enum.
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),   // OUTPUT_FILENAME
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),  // UNICODE_VERSION
    UOPTION_DEF("csource", '\1', UOPT_NO_ARG),       // WRITE_C_SOURCE (long form only per usage text)
    UOPTION_DEF("fast", '\1', UOPT_NO_ARG)           // OPT_FAST (long form only per usage text)
};
78
79 extern "C" int
80 main(int argc, char* argv[]) {
81     U_MAIN_INIT_ARGS(argc, argv);
82
83     /* preset then read command line options */
84     options[SOURCEDIR].value="";
85     argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
86
87     /* error handling, printing usage message */
88     if(argc<0) {
89         fprintf(stderr,
90             "error in command line argument \"%s\"\n",
91             argv[-argc]);
92     }
93     if(!options[OUTPUT_FILENAME].doesOccur) {
94         argc=-1;
95     }
96     if( argc<2 ||
97         options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
98     ) {
99         /*
100          * Broken into chunks because the C89 standard says the minimum
101          * required supported string length is 509 bytes.
102          */
103         fprintf(stderr,
104             "Usage: %s [-options] infiles+ -o outputfilename\n"
105             "\n"
106             "Reads the infiles with normalization data and\n"
107             "creates a binary or C source file (outputfilename) with the data.\n"
108             "\n",
109             argv[0]);
110         fprintf(stderr,
111             "Options:\n"
112             "\t-h or -? or --help  this usage text\n"
113             "\t-v or --verbose     verbose output\n"
114             "\t-c or --copyright   include a copyright notice\n"
115             "\t-u or --unicode     Unicode version, followed by the version like 5.2.0\n");
116         fprintf(stderr,
117             "\t-s or --sourcedir   source directory, followed by the path\n"
118             "\t-o or --output      output filename\n"
119             "\t      --csource     writes a C source file with initializers\n");
120         fprintf(stderr,
121             "\t      --fast        optimize the data for fast normalization,\n"
122             "\t                    which might increase its size  (Writes fully decomposed\n"
123             "\t                    regular mappings instead of delta mappings.\n"
124             "\t                    You should measure the runtime speed to make sure that\n"
125             "\t                    this is a good trade-off.)\n");
126         return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
127     }
128
129     beVerbose=options[VERBOSE].doesOccur;
130     haveCopyright=options[COPYRIGHT].doesOccur;
131
132     IcuToolErrorCode errorCode("gennorm2/main()");
133
134 #if UCONFIG_NO_NORMALIZATION
135
136     fprintf(stderr,
137         "gennorm2 writes a dummy binary data file "
138         "because UCONFIG_NO_NORMALIZATION is set, \n"
139         "see icu/source/common/unicode/uconfig.h\n");
140     udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
141     // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
142     // return U_UNSUPPORTED_ERROR;
143     return 0;
144
145 #else
146
147     LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
148     errorCode.assertSuccess();
149
150     if(options[UNICODE_VERSION].doesOccur) {
151         builder->setUnicodeVersion(options[UNICODE_VERSION].value);
152     }
153
154     if(options[OPT_FAST].doesOccur) {
155         builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
156     }
157
158     // prepare the filename beginning with the source dir
159     CharString filename(options[SOURCEDIR].value, errorCode);
160     int32_t pathLength=filename.length();
161     if( pathLength>0 &&
162         filename[pathLength-1]!=U_FILE_SEP_CHAR &&
163         filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
164     ) {
165         filename.append(U_FILE_SEP_CHAR, errorCode);
166         pathLength=filename.length();
167     }
168
169     for(int i=1; i<argc; ++i) {
170         printf("gennorm2: processing %s\n", argv[i]);
171         filename.append(argv[i], errorCode);
172         LocalStdioFilePointer f(fopen(filename.data(), "r"));
173         if(f==NULL) {
174             fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
175             exit(U_FILE_ACCESS_ERROR);
176         }
177         builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
178         parseFile(f.getAlias(), *builder);
179         filename.truncate(pathLength);
180     }
181
182     if(options[WRITE_C_SOURCE].doesOccur) {
183         builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
184     } else {
185         builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
186     }
187
188     return errorCode.get();
189
190 #endif
191 }
192
193 #if !UCONFIG_NO_NORMALIZATION
194
195 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
196     IcuToolErrorCode errorCode("gennorm2/parseFile()");
197     char line[300];
198     uint32_t startCP, endCP;
199     while(NULL!=fgets(line, (int)sizeof(line), f)) {
200         char *comment=(char *)strchr(line, '#');
201         if(comment!=NULL) {
202             *comment=0;
203         }
204         u_rtrim(line);
205         if(line[0]==0) {
206             continue;  // skip empty and comment-only lines
207         }
208         if(line[0]=='*') {
209             const char *s=u_skipWhitespace(line+1);
210             if(0==strncmp(s, "Unicode", 7)) {
211                 s=u_skipWhitespace(s+7);
212                 builder.setUnicodeVersion(s);
213             }
214             continue;  // reserved syntax
215         }
216         const char *delimiter;
217         int32_t rangeLength=
218             u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
219         if(errorCode.isFailure()) {
220             fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
221             exit(errorCode.reset());
222         }
223         delimiter=u_skipWhitespace(delimiter);
224         if(*delimiter==':') {
225             const char *s=u_skipWhitespace(delimiter+1);
226             char *end;
227             unsigned long value=strtoul(s, &end, 10);
228             if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
229                 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
230                 exit(U_PARSE_ERROR);
231             }
232             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
233                 builder.setCC(c, (uint8_t)value);
234             }
235             continue;
236         }
237         if(*delimiter=='-') {
238             if(*u_skipWhitespace(delimiter+1)!=0) {
239                 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
240                 exit(U_PARSE_ERROR);
241             }
242             for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
243                 builder.removeMapping(c);
244             }
245             continue;
246         }
247         if(*delimiter=='=' || *delimiter=='>') {
248             UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
249             int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
250             if(errorCode.isFailure()) {
251                 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
252                 exit(errorCode.reset());
253             }
254             UnicodeString mapping(FALSE, uchars, length);
255             if(*delimiter=='=') {
256                 if(rangeLength!=1) {
257                     fprintf(stderr,
258                             "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
259                             line);
260                     exit(U_PARSE_ERROR);
261                 }
262                 builder.setRoundTripMapping((UChar32)startCP, mapping);
263             } else {
264                 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
265                     builder.setOneWayMapping(c, mapping);
266                 }
267             }
268             continue;
269         }
270         fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
271         exit(U_PARSE_ERROR);
272     }
273 }
274
275 #endif // !UCONFIG_NO_NORMALIZATION
276
277 U_NAMESPACE_END
278
279 /*
280  * Hey, Emacs, please set the following:
281  *
282  * Local Variables:
283  * indent-tabs-mode: nil
284  * End:
285  *
286  */