1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: gennorm2.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov25
16 * created by: Markus W. Scherer
18 * This program reads text files that define Unicode normalization,
19 * parses them, and builds a binary data file.
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
28 #include "unicode/errorcode.h"
29 #include "unicode/localpointer.h"
30 #include "unicode/putil.h"
31 #include "unicode/uchar.h"
32 #include "unicode/unistr.h"
34 #include "normalizer2impl.h"
39 #if UCONFIG_NO_NORMALIZATION
// Global flags; main() overwrites both from the --verbose and --copyright options.
45 UBool beVerbose=FALSE, haveCopyright=TRUE;
// Declares LocalStdioFilePointer, a local-pointer wrapper type for FILE paired with fclose.
// main() uses it to hold the result of fopen() for each input file.
// NOTE(review): presumably fclose() is called when the wrapper goes out of scope --
// confirm against unicode/localpointer.h.
47 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
49 #if !UCONFIG_NO_NORMALIZATION
// Forward declaration so that main() can call parseFile(); the definition follows main().
50 void parseFile(FILE *f, Normalizer2DataBuilder &builder);
53 /* -------------------------------------------------------------------------- */
// Command-line option table passed to u_parseArgs() in main().
// main() addresses entries by index constant, for example options[OUTPUT_FILENAME],
// options[UNICODE_VERSION], options[WRITE_C_SOURCE] and options[OPT_FAST].
// NOTE(review): those index constants are declared outside this table -- keep their
// order in sync with the order of the entries here.
67 static UOption options[]={
69 UOPTION_HELP_QUESTION_MARK,
// Options that require a value: -o/--output and -u/--unicode.
73 UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
74 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
// Flags without a value. The usage text lists --csource and --fast with no
// one-letter form; both pass '\1' in the short-name position.
75 UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
76 UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
// Program entry point: parses the command line, feeds every input file to a
// Normalizer2DataBuilder via parseFile(), and writes the collected data either as
// a binary file or, with --csource, as a C source file.
// Return value: the usage path returns U_ILLEGAL_ARGUMENT_ERROR when argc is
// negative and U_ZERO_ERROR otherwise; the normal path returns errorCode.get().
// Calls exit(U_FILE_ACCESS_ERROR) when an input file cannot be opened.
80 main(int argc, char* argv[]) {
81 U_MAIN_INIT_ARGS(argc, argv);
83 /* preset then read command line options */
// The source directory defaults to the empty string.
84 options[SOURCEDIR].value="";
// argc is replaced by the return value of u_parseArgs(). A negative value is
// treated as a command-line error below, and the input-file loop further down
// walks argv[1] .. argv[argc-1].
// NOTE(review): this relies on u_parseArgs() leaving only the non-option
// arguments in argv -- confirm against uoptions.h.
85 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
87 /* error handling, printing usage message */
90 "error in command line argument \"%s\"\n",
93 if(!options[OUTPUT_FILENAME].doesOccur) {
97 options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
100 * Broken into chunks because the C89 standard says the minimum
101 * required supported string length is 509 bytes.
104 "Usage: %s [-options] infiles+ -o outputfilename\n"
106 "Reads the infiles with normalization data and\n"
107 "creates a binary or C source file (outputfilename) with the data.\n"
112 "\t-h or -? or --help this usage text\n"
113 "\t-v or --verbose verbose output\n"
114 "\t-c or --copyright include a copyright notice\n"
115 "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
117 "\t-s or --sourcedir source directory, followed by the path\n"
118 "\t-o or --output output filename\n"
119 "\t --csource writes a C source file with initializers\n");
121 "\t --fast optimize the data for fast normalization,\n"
122 "\t which might increase its size (Writes fully decomposed\n"
123 "\t regular mappings instead of delta mappings.\n"
124 "\t You should measure the runtime speed to make sure that\n"
125 "\t this is a good trade-off.)\n");
// Exit status after printing the usage text: an error code only if argc is negative.
126 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
// Publish the --verbose and --copyright flags through the file-level globals.
129 beVerbose=options[VERBOSE].doesOccur;
130 haveCopyright=options[COPYRIGHT].doesOccur;
// Error code object for the rest of main(), labelled with this function's name.
132 IcuToolErrorCode errorCode("gennorm2/main()");
// Build without normalization support: only a dummy data file is written.
134 #if UCONFIG_NO_NORMALIZATION
137 "gennorm2 writes a dummy binary data file "
138 "because UCONFIG_NO_NORMALIZATION is set, \n"
139 "see icu/source/common/unicode/uconfig.h\n");
140 udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
141 // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on.
142 // return U_UNSUPPORTED_ERROR;
// Heap-allocate the builder; both the constructor and the LocalPointer receive
// errorCode, which is checked immediately afterwards.
// NOTE(review): assertSuccess() presumably terminates the tool on failure -- confirm.
147 LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
148 errorCode.assertSuccess();
// Optional -u/--unicode: pass the version string to the builder.
150 if(options[UNICODE_VERSION].doesOccur) {
151 builder->setUnicodeVersion(options[UNICODE_VERSION].value);
// Optional --fast: select the OPTIMIZE_FAST setting (see the usage text above
// for the size/speed trade-off).
154 if(options[OPT_FAST].doesOccur) {
155 builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
158 // prepare the filename beginning with the source dir
// A separator is appended when the directory does not already end in
// U_FILE_SEP_CHAR or U_FILE_ALT_SEP_CHAR. pathLength remembers the length of
// this prefix so that the buffer can be cut back to it after each input file.
159 CharString filename(options[SOURCEDIR].value, errorCode);
160 int32_t pathLength=filename.length();
162 filename[pathLength-1]!=U_FILE_SEP_CHAR &&
163 filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
165 filename.append(U_FILE_SEP_CHAR, errorCode);
166 pathLength=filename.length();
// Process every remaining command-line argument as an input file name relative
// to the source directory prefix.
169 for(int i=1; i<argc; ++i) {
170 printf("gennorm2: processing %s\n", argv[i]);
171 filename.append(argv[i], errorCode);
// Input files are opened in text mode for reading.
172 LocalStdioFilePointer f(fopen(filename.data(), "r"));
174 fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data());
175 exit(U_FILE_ACCESS_ERROR);
// Override handling is set to OVERRIDE_PREVIOUS before each file is parsed.
177 builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
178 parseFile(f.getAlias(), *builder);
// Drop the file name again, keeping only the directory prefix for the next file.
179 filename.truncate(pathLength);
// Write the result: C source when --csource was given, the binary file otherwise.
182 if(options[WRITE_C_SOURCE].doesOccur) {
183 builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
185 builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
188 return errorCode.get();
193 #if !UCONFIG_NO_NORMALIZATION
// Reads the normalization data text from f line by line and applies each data
// line to builder.
//   f       - input stream; lines are read with fgets().
//   builder - receives the Unicode version, ccc values and mappings.
// '#' marks a comment; empty and comment-only lines are skipped.
// A data line starts with a code point or code point range; the first
// non-whitespace character after the range selects what the line does:
//   ':'  decimal ccc value, applied to every code point in the range
//   '-'  remove the mapping of every code point in the range
//   '='  round-trip mapping for startCP
//   '>'  one-way mapping
// Parse failures are reported on stderr; the code point range and mapping
// string failures visibly call exit() with the error code.
195 void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
196 IcuToolErrorCode errorCode("gennorm2/parseFile()");
198 uint32_t startCP, endCP;
199 while(NULL!=fgets(line, (int)sizeof(line), f)) {
// Locate the start of a trailing comment, if any.
200 char *comment=(char *)strchr(line, '#');
206 continue; // skip empty and comment-only lines
// Special line: after its first character, the keyword "Unicode" followed by a
// version string sets the builder's Unicode version.
// NOTE(review): line+1 skips a one-character marker at the start of the line --
// confirm which marker against the enclosing condition.
209 const char *s=u_skipWhitespace(line+1);
210 if(0==strncmp(s, "Unicode", 7)) {
211 s=u_skipWhitespace(s+7);
212 builder.setUnicodeVersion(s);
214 continue; // reserved syntax
// Parse the leading code point or range; delimiter receives the position of the
// terminator that follows it.
216 const char *delimiter;
218 u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
219 if(errorCode.isFailure()) {
220 fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
221 exit(errorCode.reset());
223 delimiter=u_skipWhitespace(delimiter);
// ':' -- ccc value in decimal.
224 if(*delimiter==':') {
225 const char *s=u_skipWhitespace(delimiter+1);
227 unsigned long value=strtoul(s, &end, 10);
// Rejected when no digits were consumed, when anything other than whitespace
// follows the number, or when the value is 0xff or larger (accepted: 0..254).
228 if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
229 fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
232 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
233 builder.setCC(c, (uint8_t)value);
// '-' -- remove mappings; only whitespace may follow the '-'.
237 if(*delimiter=='-') {
238 if(*u_skipWhitespace(delimiter+1)!=0) {
239 fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
242 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
243 builder.removeMapping(c);
// '=' or '>' -- a mapping string follows the delimiter. It is parsed into a
// local buffer of Normalizer2Impl::MAPPING_LENGTH_MASK UChars.
247 if(*delimiter=='=' || *delimiter=='>') {
248 UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
249 int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), NULL, errorCode);
250 if(errorCode.isFailure()) {
251 fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
252 exit(errorCode.reset());
// NOTE(review): this looks like the UnicodeString constructor that aliases the
// uchars buffer (FALSE = not NUL-terminated) instead of copying it -- confirm
// against unicode/unistr.h.
254 UnicodeString mapping(FALSE, uchars, length);
// '=' -- round-trip mapping, set for startCP only; the error message below is
// for a line that gives more than one code point.
255 if(*delimiter=='=') {
258 "gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
262 builder.setRoundTripMapping((UChar32)startCP, mapping);
// One-way mapping: applied to every code point in the range.
264 for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
265 builder.setOneWayMapping(c, mapping);
// None of the recognized delimiters matched.
270 fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
275 #endif // !UCONFIG_NO_NORMALIZATION
280 * Hey, Emacs, please set the following:
283 * indent-tabs-mode: nil