2 *******************************************************************************
3 * Copyright (C) 2011-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
8 * tab size: 8 (not used)
11 * created on: 2011dec11
12 * created by: Markus W. Scherer
15 #include "unicode/utypes.h"
16 #include "unicode/uchar.h"
// Number of elements in a real array, as an int32_t.
// Only meaningful for arrays; applied to a pointer the sizeof ratio is not an element count.
26 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// Out-of-line destructor with an empty body.
30 PropertyNames::~PropertyNames() {}
// Looks up a property by name; this implementation simply forwards to
// u_getPropertyEnum() and returns its result unchanged.
33 PropertyNames::getPropertyEnum(const char *name) const {
34 return u_getPropertyEnum(name);
// Looks up a value name for the given property; forwards to
// u_getPropertyValueEnum() after casting the int32_t property to UProperty.
38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
39 return u_getPropertyValueEnum((UProperty)property, name);
// Member initializers (the constructor's header line is not part of this listing).
// Code point members start as U_SENTINEL, digitValue as -1, and the
// string pointers as NULL.
43 : start(U_SENTINEL), end(U_SENTINEL),
44 bmg(U_SENTINEL), bpb(U_SENTINEL),
45 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
46 digitValue(-1), numericValue(NULL),
47 name(NULL), nameAlias(NULL) {
// Zero-fill both property value arrays in full.
48 memset(binProps, 0, sizeof(binProps));
49 memset(intProps, 0, sizeof(intProps));
// Out-of-line destructor with an empty body.
53 UniProps::~UniProps() {}
// Out-of-class definition of the static constant. There is no initializer here,
// so the value presumably comes from the in-class declaration (not visible in this file).
55 const int32_t PreparsedUCD::kNumLineBuffers;
// Sets up the parser state and opens the preparsed UCD input named by filename.
// If errorCode already indicates a failure on entry, returns right after the
// member initializers without touching the file.
// On an open failure, errorCode is set to U_FILE_ACCESS_ERROR.
57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
// icuPnames is heap-allocated here and also installed as the active pnames lookup.
58 : icuPnames(new PropertyNames()), pnames(icuPnames),
// defaultLineIndex==-1 means no defaults line has been seen yet
// (getProps() tests defaultLineIndex>=0).
60 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
63 fieldLimit(NULL), lineLimit(NULL) {
64 if(U_FAILURE(errorCode)) { return; }
// A NULL, empty, or "-" file name is special-cased; the body of this branch
// is not part of this listing.
66 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
// Regular file name: open for reading in text mode.
70 file=fopen(filename, "r");
// Failure path: print the system error and the file name to stderr,
// then report the failure to the caller through errorCode.
73 perror("error opening preparsed UCD");
74 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
75 errorCode=U_FILE_ACCESS_ERROR;
// Start with an all-zero version (4 bytes); readLine() overwrites it when it
// encounters a Unicode version line.
79 memset(ucdVersion, 0, 4);
// Destructor.
// NOTE(review): the body is not part of this listing. The constructor allocates
// icuPnames with new and may open `file` with fopen(); confirm both are released here.
83 PreparsedUCD::~PreparsedUCD() {
90 // Same order as the LineType values.
// readLine() indexes this table with a line type value and compares the entry
// against the first field of each input line.
91 static const char *lineTypeStrings[]={
// Reads the next input line into one of the line buffers, trims trailing line
// terminators and white space, splits the line into NUL-terminated fields at
// each ';', and classifies it by matching the first field against lineTypeStrings.
//
// Returns NO_LINE immediately if errorCode already indicates a failure.
// Sets errorCode to U_FILE_ACCESS_ERROR on a read error and to U_PARSE_ERROR
// when the first field matches no known line type.
104 PreparsedUCD::LineType
105 PreparsedUCD::readLine(UErrorCode &errorCode) {
106 if(U_FAILURE(errorCode)) { return NO_LINE; }
107 // Select the next available line buffer.
108 while(!isLineBufferAvailable(lineIndex)) {
// The index has run past the last buffer (the handling inside this branch is
// not part of this listing; presumably it wraps around).
110 if (lineIndex == kNumLineBuffers) {
114 char *line=lines[lineIndex];
// Until something is read, both limits point at the buffer start (an empty line).
116 lineLimit=fieldLimit=line;
// fgets() stores at most sizeof(lines[0])-1 characters plus the terminating NUL.
118 char *result=fgets(line, sizeof(lines[0]), file);
// Read error: report on stderr with the last known line number, then flag it.
121 perror("error reading preparsed UCD");
122 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
123 errorCode=U_FILE_ACCESS_ERROR;
// strchr(line, 0) yields a pointer to the terminating NUL, so the whole line
// counts as a single field and the line is reported as empty.
129 fieldLimit=strchr(line, 0);
130 return lineType=EMPTY_LINE;
132 // Remove trailing \r\n.
134 char *limit=strchr(line, 0);
135 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
136 // Remove trailing white space.
137 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
142 return lineType=EMPTY_LINE;
// Overwrite every ';' with NUL so that each field becomes its own C string.
146 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
// fieldLimit now marks the end of the first field, which names the line type.
147 fieldLimit=strchr(line, 0);
148 // Determine the line type.
// Candidates start right after EMPTY_LINE; reaching LINE_TYPE_COUNT means
// that no table entry matched.
150 for(type=EMPTY_LINE+1;; ++type) {
151 if(type==LINE_TYPE_COUNT) {
153 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
154 line, (long)lineNumber);
155 errorCode=U_PARSE_ERROR;
158 if(0==strcmp(line, lineTypeStrings[type])) {
162 lineType=(LineType)type;
// fieldLimit<lineLimit: there is at least one more field after the type.
// For a Unicode version line, parse that field into ucdVersion.
163 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
164 u_versionFromString(ucdVersion, fieldLimit+1);
// Restarts field iteration at the beginning of the current line buffer:
// the first field starts at the buffer start and ends at its NUL terminator.
170 PreparsedUCD::firstField() {
171 char *field=lines[lineIndex];
172 fieldLimit=strchr(field, 0);
// Advances field iteration by one field.
// Returns NULL when the previous field already ended at the end of the line.
177 PreparsedUCD::nextField() {
178 if(fieldLimit==lineLimit) { return NULL; }
// The next field starts right after the previous field's NUL terminator
// (readLine() replaced each ';' separator with NUL).
179 char *field=fieldLimit+1;
180 fieldLimit=strchr(field, 0);
// Parses the code point range and the property fields of the current line,
// applying them on top of inherited default/block properties.
// Each successfully parsed property is recorded in newValues by parseProperty().
//
// Returns NULL if errorCode indicates a failure on entry or if any step fails.
// Sets U_ILLEGAL_ARGUMENT_ERROR when the current line carries no property values,
// and U_PARSE_ERROR for a missing or malformed range, a second defaults line,
// a defaults range other than 0..10FFFF, or a cp range that only partially
// overlaps the current block.
185 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
186 if(U_FAILURE(errorCode)) { return NULL; }
188 if(!lineHasPropertyValues()) {
189 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
// The field after the line type must be the code point range.
193 const char *field=nextField();
195 // No range field after the type.
197 "error in preparsed UCD: missing default/block/cp range field "
198 "(no second field) on line %ld\n",
200 errorCode=U_PARSE_ERROR;
204 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
// defaultLineIndex is -1 until the first defaults line has been processed,
// so a non-negative value here means this is a second defaults line.
208 if(defaultLineIndex>=0) {
210 "error in preparsed UCD: second line with default properties on line %ld\n",
212 errorCode=U_PARSE_ERROR;
// Defaults must cover the entire code point space.
215 if(start!=0 || end!=0x10ffff) {
217 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
218 field, (long)lineNumber);
219 errorCode=U_PARSE_ERROR;
// Remember which line buffer holds the defaults line
// (presumably consulted by isLineBufferAvailable() so it is not reused — confirm).
223 defaultLineIndex=lineIndex;
226 blockProps=defaultProps; // Block inherits default properties.
// Likewise remember the buffer that holds the current block line.
228 blockLineIndex=lineIndex;
231 if(blockProps.start<=start && end<=blockProps.end) {
232 // Code point range fully inside the last block inherits the block properties.
234 } else if(start>blockProps.end || end<blockProps.start) {
235 // Code point range fully outside the last block inherits the default properties.
236 cpProps=defaultProps;
238 // Code point range partially overlapping with the last block is illegal.
240 "error in preparsed UCD: cp range %s on line %ld only "
241 "partially overlaps with block range %04lX..%04lX\n",
242 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
243 errorCode=U_PARSE_ERROR;
249 // Will not occur because of the range check above.
250 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
// Apply every remaining field as one property; stop at the first failure.
255 while((field=nextField())!=NULL) {
256 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
// Property names that are specific to the preparsed UCD file, each paired with
// a PPUCD_* constant. parseProperty() searches this table (case-insensitively)
// after the pnames lookup.
261 static const struct {
264 } ppucdProperties[]={
265 { "Name_Alias", PPUCD_NAME_ALIAS },
266 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
267 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
// Parses a single property field and stores the result in props.
// A field either uses binary-property syntax or "name=value" syntax; the
// property name is resolved through pnames first and then through the
// ppucdProperties table. Unknown property names are ignored.
// On success the property constant is added to newValues.
// Syntax and value errors are printed to stderr and reported as U_PARSE_ERROR.
270 // Returns TRUE for "ok to continue parsing fields".
272 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
273 UErrorCode &errorCode) {
// v locates the '=' that separates the property name from its value, if any.
276 const char *v=strchr(p, '=');
281 "error in preparsed UCD: mix of binary-property-no and "
282 "enum-property syntax '%s' on line %ld\n",
283 field, (long)lineNumber);
284 errorCode=U_PARSE_ERROR;
293 // Copy out the property name rather than modifying the field (writing a NUL).
294 pBuffer.append(p, (int32_t)(v-p), errorCode);
// First try the regular property name lookup.
298 int32_t prop=pnames->getPropertyEnum(p);
// Then fall back to the ppucd-specific names, compared case-insensitively.
300 for(int32_t i=0;; ++i) {
301 if(i==LENGTHOF(ppucdProperties)) {
302 // Ignore unknown property names.
305 if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
306 prop=ppucdProperties[i].prop;
// Binary properties: constants below UCHAR_BINARY_LIMIT index binProps directly.
312 if(prop<UCHAR_BINARY_LIMIT) {
314 props.binProps[prop]=(UBool)binaryValue;
316 // No binary value for a binary property.
318 "error in preparsed UCD: enum-property syntax '%s' "
319 "for binary property on line %ld\n",
320 field, (long)lineNumber);
321 errorCode=U_PARSE_ERROR;
// binaryValue>=0 means the field used binary-property syntax.
323 } else if(binaryValue>=0) {
324 // Binary value for a non-binary property.
326 "error in preparsed UCD: binary-property syntax '%s' "
327 "for non-binary property on line %ld\n",
328 field, (long)lineNumber);
329 errorCode=U_PARSE_ERROR;
// Constants in the gap between the binary and the integer ranges are rejected.
330 } else if (prop < UCHAR_INT_START) {
332 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
333 prop, (long)lineNumber);
334 errorCode=U_PARSE_ERROR;
// Integer/enumerated properties: resolve the value name to its numeric value.
335 } else if(prop<UCHAR_INT_LIMIT) {
336 int32_t value=pnames->getPropertyValueEnum(prop, v);
// Fallback for combining classes that the value-name lookup did not resolve:
// accept a plain decimal number.
337 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
338 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
340 unsigned long ccc=uprv_strtoul(v, &end, 10);
// Require at least one digit, no trailing characters, and a value of at most 254.
341 if(v<end && *end==0 && ccc<=254) {
345 if(value==UCHAR_INVALID_CODE) {
347 "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
348 field, (long)lineNumber);
349 errorCode=U_PARSE_ERROR;
// intProps is indexed relative to the first integer property constant.
351 props.intProps[prop-UCHAR_INT_START]=value;
354 // Do not parse default values like <code point>, just set null values.
// Single code point properties are reset to U_SENTINEL.
356 case UCHAR_BIDI_MIRRORING_GLYPH:
357 props.bmg=U_SENTINEL;
359 case UCHAR_BIDI_PAIRED_BRACKET:
360 props.bpb=U_SENTINEL;
362 case UCHAR_SIMPLE_CASE_FOLDING:
363 props.scf=U_SENTINEL;
365 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
366 props.slc=U_SENTINEL;
368 case UCHAR_SIMPLE_TITLECASE_MAPPING:
369 props.stc=U_SENTINEL;
371 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
372 props.suc=U_SENTINEL;
// The handling for the following cases is not part of this listing.
374 case UCHAR_CASE_FOLDING:
377 case UCHAR_LOWERCASE_MAPPING:
380 case UCHAR_TITLECASE_MAPPING:
383 case UCHAR_UPPERCASE_MAPPING:
386 case UCHAR_SCRIPT_EXTENSIONS:
391 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
392 field, (long)lineNumber);
393 errorCode=U_PARSE_ERROR;
// Explicit (non-default) values for the remaining property types follow.
398 case UCHAR_NUMERIC_VALUE:
// Stores the pointer to the value text, not a copy.
// NOTE(review): the text presumably lives in the current line buffer, so it is
// only valid while that buffer is not reused — confirm.
399 props.numericValue=v;
// A value consisting of exactly one ASCII digit also provides the digit value
// (c is presumably the first character of v — its assignment is not shown here).
401 if('0'<=c && c<='9' && v[1]==0) {
402 props.digitValue=c-'0';
411 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
// Single code point values are parsed as hexadecimal by parseCodePoint().
413 case UCHAR_BIDI_MIRRORING_GLYPH:
414 props.bmg=parseCodePoint(v, errorCode);
416 case UCHAR_BIDI_PAIRED_BRACKET:
417 props.bpb=parseCodePoint(v, errorCode);
419 case UCHAR_SIMPLE_CASE_FOLDING:
420 props.scf=parseCodePoint(v, errorCode);
422 case UCHAR_SIMPLE_LOWERCASE_MAPPING:
423 props.slc=parseCodePoint(v, errorCode);
425 case UCHAR_SIMPLE_TITLECASE_MAPPING:
426 props.stc=parseCodePoint(v, errorCode);
428 case UCHAR_SIMPLE_UPPERCASE_MAPPING:
429 props.suc=parseCodePoint(v, errorCode);
// Full mappings are strings and are parsed by parseString().
431 case UCHAR_CASE_FOLDING:
432 parseString(v, props.cf, errorCode);
434 case UCHAR_LOWERCASE_MAPPING:
435 parseString(v, props.lc, errorCode);
437 case UCHAR_TITLECASE_MAPPING:
438 parseString(v, props.tc, errorCode);
440 case UCHAR_UPPERCASE_MAPPING:
441 parseString(v, props.uc, errorCode);
// The handling for Name_Alias is not part of this listing.
443 case PPUCD_NAME_ALIAS:
446 case PPUCD_CONDITIONAL_CASE_MAPPINGS:
447 case PPUCD_TURKIC_CASE_FOLDING:
448 // No need to parse their values: They are hardcoded in the runtime library.
450 case UCHAR_SCRIPT_EXTENSIONS:
451 parseScriptExtensions(v, props.scx, errorCode);
454 // Ignore unhandled properties.
// Record the property as newly set only if nothing went wrong while parsing it.
458 if(U_SUCCESS(errorCode)) {
459 newValues.add((UChar32)prop);
// Parses the code point range of the current algnamesrange line into start/end.
// Returns FALSE if errorCode indicates a failure on entry; otherwise the result
// of parseCodePointRange() when the range field is present.
// Sets U_ILLEGAL_ARGUMENT_ERROR if the current line is not an algnamesrange
// line, and U_PARSE_ERROR if the range field is missing.
467 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
468 if(U_FAILURE(errorCode)) { return FALSE; }
469 if(lineType!=ALG_NAMES_RANGE_LINE) {
470 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
// The field after the line type must be the code point range.
474 const char *field=nextField();
476 // No range field after the type.
478 "error in preparsed UCD: missing algnamesrange range field "
479 "(no second field) on line %ld\n",
481 errorCode=U_PARSE_ERROR;
484 return parseCodePointRange(field, start, end, errorCode);
// Parses s as a hexadecimal code point and returns it.
// The whole string must be consumed, it must contain at least one digit, and
// the value must be below 0x110000; otherwise an error is printed to stderr
// and errorCode is set to U_PARSE_ERROR.
488 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
// NOTE(review): the strtoul result is narrowed to uint32_t before the range
// check. Where unsigned long is wider than 32 bits, an over-long hex string
// could wrap into the valid range — confirm whether such input can occur.
490 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
// end<=s: nothing was parsed; *end!=0: trailing characters after the number.
491 if(end<=s || *end!=0 || value>=0x110000) {
493 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
494 s, (long)lineNumber);
495 errorCode=U_PARSE_ERROR;
498 return (UChar32)value;
// Parses s as a code point range by delegating to u_parseCodePointRange().
// If that call reports a failure through errorCode, an error naming the
// offending text and the line number is printed to stderr.
502 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
// The parse writes into the locals st and e rather than directly into start/end
// (their declaration and the copy to start/end are not part of this listing).
504 u_parseCodePointRange(s, &st, &e, &errorCode);
505 if(U_FAILURE(errorCode)) {
507 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
508 s, (long)lineNumber);
// Parses s with u_parseString() directly into the internal buffer of uni.
// The first attempt uses whatever capacity uni already has; on overflow the
// buffer is enlarged to the reported length and the parse is repeated.
// On failure an error is printed to stderr; errorCode keeps the failure code.
517 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
// NOTE(review): the pointer returned by getBuffer() is not checked in the
// visible lines; confirm whether it can be NULL here (e.g. allocation failure).
518 UChar *buffer=uni.getBuffer(-1);
519 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
520 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
// Retry: clear the overflow status, hand the buffer back as an empty string,
// then reopen it with room for `length` units as reported by the first attempt.
521 errorCode=U_ZERO_ERROR;
522 uni.releaseBuffer(0);
523 buffer=uni.getBuffer(length);
524 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
// Commit the parsed length to the string.
526 uni.releaseBuffer(length);
527 if(U_FAILURE(errorCode)) {
529 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
530 s, (long)lineNumber);
// Parses a script extensions value: s holds script codes separated by spaces.
// Each code is resolved as a value of the script property via pnames.
// Does nothing if errorCode indicates a failure on entry.
// Sets U_PARSE_ERROR (after printing to stderr) for an unknown script code,
// a code that scx already contains, or an empty value.
535 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
536 if(U_FAILURE(errorCode)) { return; }
// Find the end of the current script code.
541 const char *scLimit=strchr(s, ' ');
// Copy the code into scString so that it is NUL-terminated for the lookup,
// leaving the input text unmodified.
543 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
544 if(U_FAILURE(errorCode)) { return; }
548 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
549 if(script==UCHAR_INVALID_CODE) {
551 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
552 scs, (long)lineNumber);
553 errorCode=U_PARSE_ERROR;
// The same script code must not be listed twice.
555 } else if(scx.contains(script)) {
557 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
558 scs, (long)lineNumber);
559 errorCode=U_PARSE_ERROR;
571 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
572 errorCode=U_PARSE_ERROR;