2 ******************************************************************************
4 * Copyright (C) 1999-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
21 #include "unicode/utf.h"
22 #include "unicode/utf16.h"
34 /* prototypes ------------------------------------------------------------- */
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The expansion is fully parenthesized so that it stays a single operand
 * in any context, including unary and postfix operators such as sizeof.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
38 static const char DATA_NAME[] = "unames";
39 static const char DATA_TYPE[] = "icu";
42 #define LINES_PER_GROUP (1L<<GROUP_SHIFT)
43 #define GROUP_MASK (LINES_PER_GROUP-1)
46 * This struct was replaced by explicitly accessing equivalent
47 * fields from triples of uint16_t.
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 * would advance by 6 bytes (3 uint16_t).
52 * We can't just change the data structure because it's loaded from a data file,
53 * and we don't want to make it less compact, so we changed the access code.
55 * For details see ICU tickets 6331 and 6008.
58 offsetHigh, offsetLow; / * avoid padding * /
69 * Get the 32-bit group offset.
70 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
71 * @return group offset (int32_t)
/*
 * The high half is widened to uint32_t before it is shifted: shifting a
 * signed int32_t whose value is >= 0x8000 left by 16 bits would overflow
 * into the sign bit, which is undefined behavior. The combined value is
 * converted back to int32_t so callers see the same type as before.
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16)|(group)[GROUP_OFFSET_LOW]))
75 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
76 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
80 uint8_t type, variant;
85 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
89 * Get the groups table from a UCharNames struct.
90 * The groups table consists of one uint16_t groupCount followed by
91 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
92 * and the comment for the old struct Group above.
94 * @param names (const UCharNames *) pointer to the UCharNames indexes
95 * @return (const uint16_t *) pointer to the groups table
/*
 * The argument and the whole expansion are parenthesized so that the macro
 * works with any pointer expression (e.g. GET_GROUPS(&x)) and so that the
 * result can be indexed or dereferenced directly (e.g. GET_GROUPS(p)[0]).
 * groupsOffset is applied as a byte offset from the start of *names.
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
100 const char *otherName;
104 #define DO_FIND_NAME NULL
106 static UDataMemory *uCharNamesData=NULL;
107 static UCharNames *uCharNames=NULL;
108 static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
111 * Maximum length of character names (regular & 1.0).
113 static int32_t gMaxNameLength=0;
116 * Set of chars used in character names (regular & 1.0).
117 * Chars are platform-dependent (can be EBCDIC).
119 static uint32_t gNameSet[8]={ 0 };
/*
 * Extra pseudo-categories used only in this file for extended names.
 * They are numbered directly after the regular general categories
 * (U_CHAR_CATEGORY_COUNT) and index the tail of charCatNames[].
 * Each expansion is parenthesized so that the constants keep their value
 * when used inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Total number of entries: regular categories plus the three above. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
127 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
136 "combining spacing mark",
137 "decimal digit number",
142 "paragraph separator",
150 "connector punctuation",
156 "initial punctuation",
163 /* implementation ----------------------------------------------------------- */
165 static UBool U_CALLCONV unames_cleanup(void)
168 udata_close(uCharNamesData);
169 uCharNamesData = NULL;
174 gCharNamesInitOnce.reset();
179 static UBool U_CALLCONV
180 isAcceptable(void * /*context*/,
181 const char * /*type*/, const char * /*name*/,
182 const UDataInfo *pInfo) {
185 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
186 pInfo->charsetFamily==U_CHARSET_FAMILY &&
187 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
188 pInfo->dataFormat[1]==0x6e &&
189 pInfo->dataFormat[2]==0x61 &&
190 pInfo->dataFormat[3]==0x6d &&
191 pInfo->formatVersion[0]==1);
194 static void U_CALLCONV
195 loadCharNames(UErrorCode &status) {
196 U_ASSERT(uCharNamesData == NULL);
197 U_ASSERT(uCharNames == NULL);
199 uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200 if(U_FAILURE(status)) {
201 uCharNamesData = NULL;
203 uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
205 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
210 isDataLoaded(UErrorCode *pErrorCode) {
211 umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
212 return U_SUCCESS(*pErrorCode);
215 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
216 if((bufferLength)>0) { \
223 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
226 * Important: expandName() and compareName() are almost the same -
227 * apply fixes to both.
229 * UnicodeData.txt uses ';' as a field separator, so no
230 * field can contain ';' as part of its contents.
231 * In unames.dat, it is marked as token[';']==-1 only if the
232 * semicolon is used in the data file - which is iff we
233 * have Unicode 1.0 names or ISO comments or aliases.
234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235 * although we know that it will never be part of a name.
238 expandName(UCharNames *names,
239 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240 char *buffer, uint16_t bufferLength) {
241 uint16_t *tokens=(uint16_t *)names+8;
242 uint16_t token, tokenCount=*tokens++, bufferPos=0;
243 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
246 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
248 * skip the modern name if it is not requested _and_
249 * if the semicolon byte value is a character, not a token number
251 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
252 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
254 while(nameLength>0) {
260 } while(--fieldIndex>0);
263 * the semicolon byte value is a token number, therefore
264 * only modern names are stored in unames.dat and there is no
265 * such requested alternate name here
271 /* write each letter directly, and write a token word per token */
272 while(nameLength>0) {
278 /* implicit letter */
279 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
286 if(token==(uint16_t)(-2)) {
287 /* this is a lead byte for a double-byte token */
288 token=tokens[c<<8|*name++];
291 if(token==(uint16_t)(-1)) {
293 /* explicit letter */
294 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
296 /* stop, but skip the semicolon if we are seeking
297 extended names and there was no 2.0 name but there
299 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
308 /* write token word */
309 uint8_t *tokenString=tokenStrings+token;
310 while((c=*tokenString++)!=0) {
311 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
326 * compareName() is almost the same as expandName() except that it compares
327 * the currently expanded name to an input name.
328 * It returns the match/no match result as soon as possible.
331 compareName(UCharNames *names,
332 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333 const char *otherName) {
334 uint16_t *tokens=(uint16_t *)names+8;
335 uint16_t token, tokenCount=*tokens++;
336 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
338 const char *origOtherName = otherName;
340 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
342 * skip the modern name if it is not requested _and_
343 * if the semicolon byte value is a character, not a token number
345 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
346 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
348 while(nameLength>0) {
354 } while(--fieldIndex>0);
357 * the semicolon byte value is a token number, therefore
358 * only modern names are stored in unames.dat and there is no
359 * such requested alternate name here
365 /* compare each letter directly, and compare a token word per token */
366 while(nameLength>0) {
372 /* implicit letter */
373 if((char)c!=*otherName++) {
382 if(token==(uint16_t)(-2)) {
383 /* this is a lead byte for a double-byte token */
384 token=tokens[c<<8|*name++];
387 if(token==(uint16_t)(-1)) {
389 /* explicit letter */
390 if((char)c!=*otherName++) {
394 /* stop, but skip the semicolon if we are seeking
395 extended names and there was no 2.0 name but there
397 if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
406 /* write token word */
407 uint8_t *tokenString=tokenStrings+token;
408 while((c=*tokenString++)!=0) {
409 if((char)c!=*otherName++) {
417 /* complete match? */
418 return (UBool)(*otherName==0);
421 static uint8_t getCharCat(UChar32 cp) {
424 if (U_IS_UNICODE_NONCHAR(cp)) {
425 return U_NONCHARACTER_CODE_POINT;
428 if ((cat = u_charType(cp)) == U_SURROGATE) {
429 cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
435 static const char *getCharCatName(UChar32 cp) {
436 uint8_t cat = getCharCat(cp);
438 /* Return unknown if the table of names above is not up to
441 if (cat >= LENGTHOF(charCatNames)) {
444 return charCatNames[cat];
448 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449 const char *catname = getCharCatName(code);
455 WRITE_CHAR(buffer, bufferLength, length, '<');
456 while (catname[length - 1]) {
457 WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
459 WRITE_CHAR(buffer, bufferLength, length, '-');
460 for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
464 for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465 uint8_t v = (uint8_t)(cp & 0xf);
466 buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
470 WRITE_CHAR(buffer, bufferLength, length, '>');
476 * getGroup() does a binary search for the group that contains the
477 * Unicode code point "code".
 * The return value is always a valid pointer to a group (a triple of uint16_t) that may contain "code"
479 * or else is the highest group before "code".
480 * If the lowest group is after "code", then that one is returned.
482 static const uint16_t *
483 getGroup(UCharNames *names, uint32_t code) {
484 const uint16_t *groups=GET_GROUPS(names);
485 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
490 /* binary search for the group of names that contains the one for code */
491 while(start<limit-1) {
492 number=(uint16_t)((start+limit)/2);
493 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
500 /* return this regardless of whether it is an exact match */
501 return groups+start*GROUP_LENGTH;
505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506 * expands them into offsets and lengths for each string.
507 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508 * If a nibble<0xc, then it is the length itself (0=empty string).
509 * If a nibble>=0xc, then it forms a length value with the following nibble.
510 * Calculation see below.
511 * The offsets and lengths arrays must be at least 33 (one more) long because
512 * there is no check here at the end if the last nibble is still used.
514 static const uint8_t *
515 expandGroupLengths(const uint8_t *s,
516 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517 /* read the lengths of the 32 strings in this group and get each string's offset */
518 uint16_t i=0, offset=0, length=0;
521 /* all 32 lengths must be read to get the offset of the first group string */
522 while(i<LINES_PER_GROUP) {
525 /* read even nibble - MSBs of lengthByte */
527 /* double-nibble length spread across two bytes */
528 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
530 } else if((lengthByte /* &0xf0 */)>=0xc0) {
531 /* double-nibble length spread across this one byte */
532 length=(uint16_t)((lengthByte&0x3f)+12);
534 /* single-nibble length in MSBs */
535 length=(uint16_t)(lengthByte>>4);
545 /* read odd nibble - LSBs of lengthByte */
546 if((lengthByte&0xf0)==0) {
547 /* this nibble was not consumed for a double-nibble length above */
550 /* single-nibble length in LSBs */
558 length=0; /* prevent double-nibble detection in the next iteration */
562 /* now, s is at the first group string */
567 expandGroupName(UCharNames *names, const uint16_t *group,
568 uint16_t lineNumber, UCharNameChoice nameChoice,
569 char *buffer, uint16_t bufferLength) {
570 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572 s=expandGroupLengths(s, offsets, lengths);
573 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574 buffer, bufferLength);
578 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579 char *buffer, uint16_t bufferLength) {
580 const uint16_t *group=getGroup(names, code);
581 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583 buffer, bufferLength);
585 /* group not found */
 * enumGroupNames() enumerates all the names in one group of 32 names
596 * and either calls the enumerator function or finds a given input name.
599 enumGroupNames(UCharNames *names, const uint16_t *group,
600 UChar32 start, UChar32 end,
601 UEnumCharNamesFn *fn, void *context,
602 UCharNameChoice nameChoice) {
603 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
606 s=expandGroupLengths(s, offsets, lengths);
607 if(fn!=DO_FIND_NAME) {
612 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
616 /* here, we assume that the buffer is large enough */
618 if(!fn(context, start, nameChoice, buffer, length)) {
625 const char *otherName=((FindName *)context)->otherName;
627 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628 ((FindName *)context)->code=start;
 * enumExtNames() enumerates extended names.
639 * It only needs to do it if it is called with a real function and not
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 * for extended names by itself.
644 enumExtNames(UChar32 start, UChar32 end,
645 UEnumCharNamesFn *fn, void *context)
647 if(fn!=DO_FIND_NAME) {
652 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653 /* here, we assume that the buffer is large enough */
655 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
667 enumNames(UCharNames *names,
668 UChar32 start, UChar32 limit,
669 UEnumCharNamesFn *fn, void *context,
670 UCharNameChoice nameChoice) {
671 uint16_t startGroupMSB, endGroupMSB, groupCount;
672 const uint16_t *group, *groupLimit;
674 startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
675 endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
677 /* find the group that contains start, or the highest before it */
678 group=getGroup(names, start);
680 if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
681 /* enumerate synthetic names between start and the group start */
682 UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
686 if(!enumExtNames(start, extLimit-1, fn, context)) {
692 if(startGroupMSB==endGroupMSB) {
693 if(startGroupMSB==group[GROUP_MSB]) {
694 /* if start and limit-1 are in the same group, then enumerate only in that one */
695 return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
698 const uint16_t *groups=GET_GROUPS(names);
699 groupCount=*groups++;
700 groupLimit=groups+groupCount*GROUP_LENGTH;
702 if(startGroupMSB==group[GROUP_MSB]) {
703 /* enumerate characters in the partial start group */
704 if((start&GROUP_MASK)!=0) {
705 if(!enumGroupNames(names, group,
706 start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
707 fn, context, nameChoice)) {
710 group=NEXT_GROUP(group); /* continue with the next group */
712 } else if(startGroupMSB>group[GROUP_MSB]) {
713 /* make sure that we start enumerating with the first group after start */
714 const uint16_t *nextGroup=NEXT_GROUP(group);
715 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
716 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
720 if (!enumExtNames(start, end - 1, fn, context)) {
727 /* enumerate entire groups between the start- and end-groups */
728 while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
729 const uint16_t *nextGroup;
730 start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
731 if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
734 nextGroup=NEXT_GROUP(group);
735 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
736 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
740 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
747 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
748 if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
749 return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
750 } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
751 UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
760 /* we have not found a group, which means everything is made of
762 if (nameChoice == U_EXTENDED_CHAR_NAME) {
763 if (limit > UCHAR_MAX_VALUE + 1) {
764 limit = UCHAR_MAX_VALUE + 1;
766 return enumExtNames(start, limit - 1, fn, context);
773 writeFactorSuffix(const uint16_t *factors, uint16_t count,
774 const char *s, /* suffix elements */
776 uint16_t indexes[8], /* output fields from here */
777 const char *elementBases[8], const char *elements[8],
778 char *buffer, uint16_t bufferLength) {
779 uint16_t i, factor, bufferPos=0;
782 /* write elements according to the factors */
785 * the factorized elements are determined by modulo arithmetic
786 * with the factors of this algorithm
788 * note that for fewer operations, count is decremented here
791 for(i=count; i>0; --i) {
793 indexes[i]=(uint16_t)(code%factor);
797 * we don't need to calculate the last modulus because start<=code<=end
798 * guarantees here that code<=factors[0]
800 indexes[0]=(uint16_t)code;
802 /* write each element */
804 if(elementBases!=NULL) {
808 /* skip indexes[i] strings */
820 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
823 /* we do not need to perform the rest of this loop for i==count - break here */
828 /* skip the rest of the strings for this factors[i] */
829 factor=(uint16_t)(factors[i]-indexes[i]-1);
848 * Parts of findAlgName() are almost the same as some of getAlgName().
849 * Fixes must be applied to both.
852 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853 char *buffer, uint16_t bufferLength) {
854 uint16_t bufferPos=0;
856 /* Only the normative character name can be algorithmic. */
857 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
865 switch(range->type) {
867 /* name = prefix hex-digits */
868 const char *s=(const char *)(range+1);
875 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
878 /* write hexadecimal code point value */
879 count=range->variant;
882 if(count<bufferLength) {
887 if(--i<bufferLength) {
903 /* name = prefix factorized-elements */
905 const uint16_t *factors=(const uint16_t *)(range+1);
906 uint16_t count=range->variant;
907 const char *s=(const char *)(factors+count);
912 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
915 bufferPos+=writeFactorSuffix(factors, count,
916 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
932 * Important: enumAlgNames() and findAlgName() are almost the same.
933 * Any fix must be applied to both.
936 enumAlgNames(AlgorithmicRange *range,
937 UChar32 start, UChar32 limit,
938 UEnumCharNamesFn *fn, void *context,
939 UCharNameChoice nameChoice) {
943 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
947 switch(range->type) {
952 /* get the full name of the start character */
953 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
958 /* call the enumerator function with this first character */
959 if(!fn(context, start, nameChoice, buffer, length)) {
963 /* go to the end of the name; all these names have the same length */
969 /* enumerate the rest of the names */
970 while(++start<limit) {
971 /* increment the hexadecimal number on a character-basis */
975 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
986 if(!fn(context, start, nameChoice, buffer, length)) {
994 const char *elementBases[8], *elements[8];
995 const uint16_t *factors=(const uint16_t *)(range+1);
996 uint16_t count=range->variant;
997 const char *s=(const char *)(factors+count);
999 uint16_t prefixLength, i, idx;
1003 /* name = prefix factorized-elements */
1008 while((c=*s++)!=0) {
1013 /* append the suffix of the start character */
1014 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1015 s, (uint32_t)start-range->start,
1016 indexes, elementBases, elements,
1017 suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1019 /* call the enumerator function with this first character */
1020 if(!fn(context, start, nameChoice, buffer, length)) {
1024 /* enumerate the rest of the names */
1025 while(++start<limit) {
1026 /* increment the indexes in lexical order bound by the factors */
1029 idx=(uint16_t)(indexes[--i]+1);
1030 if(idx<factors[i]) {
1031 /* skip one index and its element string */
1039 /* reset this index to 0 and its element string to the first one */
1041 elements[i]=elementBases[i];
1045 /* to make matters a little easier, just append all elements to the suffix */
1047 length=prefixLength;
1048 for(i=0; i<count; ++i) {
1050 while((c=*s++)!=0) {
1055 /* zero-terminate */
1058 if(!fn(context, start, nameChoice, buffer, length)) {
1065 /* undefined type */
1073 * findAlgName() is almost the same as enumAlgNames() except that it
1074 * returns the code point for a name if it fits into the range.
1075 * It returns 0xffff otherwise.
1078 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1081 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1085 switch(range->type) {
1087 /* name = prefix hex-digits */
1088 const char *s=(const char *)(range+1);
1093 /* compare prefix */
1094 while((c=*s++)!=0) {
1095 if((char)c!=*otherName++) {
1100 /* read hexadecimal code point value */
1101 count=range->variant;
1103 for(i=0; i<count; ++i) {
1105 if('0'<=c && c<='9') {
1106 code=(code<<4)|(c-'0');
1107 } else if('A'<=c && c<='F') {
1108 code=(code<<4)|(c-'A'+10);
1114 /* does it fit into the range? */
1115 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1122 uint16_t indexes[8];
1123 const char *elementBases[8], *elements[8];
1124 const uint16_t *factors=(const uint16_t *)(range+1);
1125 uint16_t count=range->variant;
1126 const char *s=(const char *)(factors+count), *t;
1127 UChar32 start, limit;
1132 /* name = prefix factorized-elements */
1134 /* compare prefix */
1135 while((c=*s++)!=0) {
1136 if((char)c!=*otherName++) {
1141 start=(UChar32)range->start;
1142 limit=(UChar32)(range->end+1);
1144 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1145 writeFactorSuffix(factors, count, s, 0,
1146 indexes, elementBases, elements, buffer, sizeof(buffer));
1148 /* compare the first suffix */
1149 if(0==uprv_strcmp(otherName, buffer)) {
1153 /* enumerate and compare the rest of the suffixes */
1154 while(++start<limit) {
1155 /* increment the indexes in lexical order bound by the factors */
1158 idx=(uint16_t)(indexes[--i]+1);
1159 if(idx<factors[i]) {
1160 /* skip one index and its element string */
1167 /* reset this index to 0 and its element string to the first one */
1169 elements[i]=elementBases[i];
1173 /* to make matters a little easier, just compare all elements of the suffix */
1175 for(i=0; i<count; ++i) {
1177 while((c=*s++)!=0) {
1179 s=""; /* does not match */
1191 /* undefined type */
1198 /* sets of name characters, maximum name lengths ---------------------------- */
/*
 * Bit-set helpers for a set stored as an array of 32-bit words:
 * the byte value c selects word c>>5 and bit c&0x1f.
 * c is reduced to uint8_t first, so any char value (signed or not) maps
 * into the range 0..255, i.e. 8 words.
 * The argument c is parenthesized so that expressions with lower-precedence
 * operators (e.g. a|b) are cast as a whole. Note that c is evaluated twice;
 * do not pass expressions with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1204 calcStringSetLength(uint32_t set[8], const char *s) {
1208 while((c=*s++)!=0) {
1216 calcAlgNameSetsLengths(int32_t maxNameLength) {
1217 AlgorithmicRange *range;
1219 uint32_t rangeCount;
1222 /* enumerate algorithmic ranges */
1223 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1225 range=(AlgorithmicRange *)(p+1);
1226 while(rangeCount>0) {
1227 switch(range->type) {
1229 /* name = prefix + (range->variant times) hex-digits */
1231 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1232 if(length>maxNameLength) {
1233 maxNameLength=length;
1237 /* name = prefix factorized-elements */
1238 const uint16_t *factors=(const uint16_t *)(range+1);
1240 int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1243 s=(const char *)(factors+count);
1244 length=calcStringSetLength(gNameSet, s);
1245 s+=length+1; /* start of factor suffixes */
1247 /* get the set and maximum factor suffix length for each factor */
1248 for(i=0; i<count; ++i) {
1250 for(factor=factors[i]; factor>0; --factor) {
1251 factorLength=calcStringSetLength(gNameSet, s);
1253 if(factorLength>maxFactorLength) {
1254 maxFactorLength=factorLength;
1257 length+=maxFactorLength;
1260 if(length>maxNameLength) {
1261 maxNameLength=length;
1270 range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1273 return maxNameLength;
1277 calcExtNameSetsLengths(int32_t maxNameLength) {
1280 for(i=0; i<LENGTHOF(charCatNames); ++i) {
1282 * for each category, count the length of the category name
1286 * 6 for most hex digits per code point
1288 length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289 if(length>maxNameLength) {
1290 maxNameLength=length;
1293 return maxNameLength;
1297 calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1299 const uint8_t **pLine, const uint8_t *lineLimit) {
1300 const uint8_t *line=*pLine;
1301 int32_t length=0, tokenLength;
1304 while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1306 /* implicit letter */
1311 if(token==(uint16_t)(-2)) {
1312 /* this is a lead byte for a double-byte token */
1316 if(token==(uint16_t)(-1)) {
1317 /* explicit letter */
1321 /* count token word */
1322 if(tokenLengths!=NULL) {
1323 /* use cached token length */
1324 tokenLength=tokenLengths[c];
1325 if(tokenLength==0) {
1326 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1327 tokenLengths[c]=(int8_t)tokenLength;
1330 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1332 length+=tokenLength;
1342 calcGroupNameSetsLengths(int32_t maxNameLength) {
1343 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1345 uint16_t *tokens=(uint16_t *)uCharNames+8;
1346 uint16_t tokenCount=*tokens++;
1347 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1349 int8_t *tokenLengths;
1351 const uint16_t *group;
1352 const uint8_t *s, *line, *lineLimit;
1354 int32_t groupCount, lineNumber, length;
1356 tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357 if(tokenLengths!=NULL) {
1358 uprv_memset(tokenLengths, 0, tokenCount);
1361 group=GET_GROUPS(uCharNames);
1362 groupCount=*group++;
1364 /* enumerate all groups */
1365 while(groupCount>0) {
1366 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367 s=expandGroupLengths(s, offsets, lengths);
1369 /* enumerate all lines in each group */
1370 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371 line=s+offsets[lineNumber];
1372 length=lengths[lineNumber];
1377 lineLimit=line+length;
1379 /* read regular name */
1380 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381 if(length>maxNameLength) {
1382 maxNameLength=length;
1384 if(line==lineLimit) {
1388 /* read Unicode 1.0 name */
1389 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390 if(length>maxNameLength) {
1391 maxNameLength=length;
1393 if(line==lineLimit) {
1397 /* read ISO comment */
1398 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1401 group=NEXT_GROUP(group);
1405 if(tokenLengths!=NULL) {
1406 uprv_free(tokenLengths);
1409 /* set gMax... - name length last for threading */
1410 gMaxNameLength=maxNameLength;
1414 calcNameSetsLengths(UErrorCode *pErrorCode) {
1415 static const char extChars[]="0123456789ABCDEF<>-";
1416 int32_t i, maxNameLength;
1418 if(gMaxNameLength!=0) {
1422 if(!isDataLoaded(pErrorCode)) {
1426 /* set hex digits, used in various names, and <>-, used in extended names */
1427 for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1428 SET_ADD(gNameSet, extChars[i]);
1431 /* set sets and lengths from algorithmic names */
1432 maxNameLength=calcAlgNameSetsLengths(0);
1434 /* set sets and lengths from extended names */
1435 maxNameLength=calcExtNameSetsLengths(maxNameLength);
1437 /* set sets and lengths from group names, set global maximum values */
1438 calcGroupNameSetsLengths(maxNameLength);
1443 /* public API --------------------------------------------------------------- */
1445 U_CAPI int32_t U_EXPORT2
1446 u_charName(UChar32 code, UCharNameChoice nameChoice,
1447 char *buffer, int32_t bufferLength,
1448 UErrorCode *pErrorCode) {
1449 AlgorithmicRange *algRange;
1454 /* check the argument values */
1455 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1457 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1458 bufferLength<0 || (bufferLength>0 && buffer==NULL)
1460 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1464 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1465 return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1470 /* try algorithmic names first */
1471 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1473 algRange=(AlgorithmicRange *)(p+1);
1475 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1476 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1479 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1484 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1485 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1487 /* extended character name */
1488 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1491 /* normal character name */
1492 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1496 return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1499 U_CAPI int32_t U_EXPORT2
1500 u_getISOComment(UChar32 /*c*/,
1501 char *dest, int32_t destCapacity,
1502 UErrorCode *pErrorCode) {
1503 /* check the argument values */
1504 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1506 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1507 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1511 return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1514 U_CAPI UChar32 U_EXPORT2
1515 u_charFromName(UCharNameChoice nameChoice,
1517 UErrorCode *pErrorCode) {
1518 char upper[120], lower[120];
1520 AlgorithmicRange *algRange;
1525 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1527 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1531 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1532 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1536 if(!isDataLoaded(pErrorCode)) {
1540 /* construct the uppercase and lowercase of the name first */
1541 for(i=0; i<sizeof(upper); ++i) {
1542 if((c0=*name++)!=0) {
1543 upper[i]=uprv_toupper(c0);
1544 lower[i]=uprv_tolower(c0);
1546 upper[i]=lower[i]=0;
1550 if(i==sizeof(upper)) {
1551 /* name too long, there is no such character */
1552 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1556 /* try extended names first */
1557 if (lower[0] == '<') {
1558 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1559 if (lower[--i] == '>') {
1560 for (--i; lower[i] && lower[i] != '-'; --i) {
1563 if (lower[i] == '-') { /* We've got a category. */
1568 for (++i; lower[i] != '>'; ++i) {
1569 if (lower[i] >= '0' && lower[i] <= '9') {
1570 cp = (cp << 4) + lower[i] - '0';
1571 } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1572 cp = (cp << 4) + lower[i] - 'a' + 10;
1574 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1579 /* Now validate the category name.
1580 We could use a binary search, or a trie, if
1581 we really wanted to. */
1583 for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1585 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1586 if (getCharCat(cp) == cIdx) {
1596 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1600 /* try algorithmic names now */
1601 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1603 algRange=(AlgorithmicRange *)(p+1);
1605 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1608 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1612 /* normal character name */
1613 findName.otherName=upper;
1614 findName.code=error;
1615 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1616 if (findName.code == error) {
1617 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1619 return findName.code;
1622 U_CAPI void U_EXPORT2
1623 u_enumCharNames(UChar32 start, UChar32 limit,
1624 UEnumCharNamesFn *fn,
1626 UCharNameChoice nameChoice,
1627 UErrorCode *pErrorCode) {
1628 AlgorithmicRange *algRange;
1632 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1636 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1637 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1641 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1642 limit = UCHAR_MAX_VALUE + 1;
1644 if((uint32_t)start>=(uint32_t)limit) {
1648 if(!isDataLoaded(pErrorCode)) {
1652 /* interleave the data-driven ones with the algorithmic ones */
1653 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1654 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1656 algRange=(AlgorithmicRange *)(p+1);
1658 /* enumerate the character names before the current algorithmic range */
1659 /* here: start<limit */
1660 if((uint32_t)start<algRange->start) {
1661 if((uint32_t)limit<=algRange->start) {
1662 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1665 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1668 start=(UChar32)algRange->start;
1670 /* enumerate the character names in the current algorithmic range */
1671 /* here: algRange->start<=start<limit */
1672 if((uint32_t)start<=algRange->end) {
1673 if((uint32_t)limit<=(algRange->end+1)) {
1674 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1677 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1680 start=(UChar32)algRange->end+1;
1682 /* continue to the next algorithmic range (here: start<limit) */
1683 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1686 /* enumerate the character names after the last algorithmic range */
1687 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1690 U_CAPI int32_t U_EXPORT2
1691 uprv_getMaxCharNameLength() {
1692 UErrorCode errorCode=U_ZERO_ERROR;
1693 if(calcNameSetsLengths(&errorCode)) {
1694 return gMaxNameLength;
1701 * Converts the char set cset into a Unicode set uset.
1702 * @param cset Set of 256 bit flags corresponding to a set of chars.
1703 * @param uset USet to receive characters. Existing contents are deleted.
1706 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1711 UErrorCode errorCode;
1713 errorCode=U_ZERO_ERROR;
1715 if(!calcNameSetsLengths(&errorCode)) {
1719 /* build a char string with all chars that are used in character names */
1721 for(i=0; i<256; ++i) {
1722 if(SET_CONTAINS(cset, i)) {
1723 cs[length++]=(char)i;
1727 /* convert the char string to a UChar string */
1728 u_charsToUChars(cs, us, length);
1730 /* add each UChar to the USet */
1731 for(i=0; i<length; ++i) {
1732 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1733 sa->add(sa->set, us[i]);
1739 * Fills set with characters that are used in Unicode character names.
1740 * @param set USet to receive characters.
1742 U_CAPI void U_EXPORT2
1743 uprv_getCharNameCharacters(const USetAdder *sa) {
1744 charSetToUSet(gNameSet, sa);
1747 /* data swapping ------------------------------------------------------------ */
1750 * The token table contains non-negative entries for token bytes,
1751 * and -1 for bytes that represent themselves in the data file's charset.
1752 * -2 entries are used for lead bytes.
1754 * Direct bytes (-1 entries) must be translated from the input charset family
1755 * to the output charset family.
1756 * makeTokenMap() writes a permutation mapping for this.
1757 * Use it once for single-/lead-byte tokens and once more for all trail byte
1758 * tokens. (';' is an unused trail byte marked with -1.)
1761 makeTokenMap(const UDataSwapper *ds,
1762 int16_t tokens[], uint16_t tokenCount,
1764 UErrorCode *pErrorCode) {
1765 UBool usedOutChar[256];
1769 if(U_FAILURE(*pErrorCode)) {
1773 if(ds->inCharset==ds->outCharset) {
1774 /* Same charset family: identity permutation */
1775 for(i=0; i<256; ++i) {
1779 uprv_memset(map, 0, 256);
1780 uprv_memset(usedOutChar, 0, 256);
1782 if(tokenCount>256) {
1786 /* set the direct bytes (byte 0 always maps to itself) */
1787 for(i=1; i<tokenCount; ++i) {
1789 /* convert the direct byte character */
1791 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1792 if(U_FAILURE(*pErrorCode)) {
1793 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1798 /* enter the converted character into the map and mark it used */
1800 usedOutChar[c2]=TRUE;
1804 /* set the mappings for the rest of the permutation */
1805 for(i=j=1; i<tokenCount; ++i) {
1806 /* set mappings that were not set for direct bytes */
1808 /* set an output byte value that was not used as an output byte above */
1809 while(usedOutChar[j]) {
1812 map[i]=(uint8_t)j++;
1817 * leave mappings at tokenCount and above unset if tokenCount<256
1818 * because they won't be used
1823 U_CAPI int32_t U_EXPORT2
1824 uchar_swapNames(const UDataSwapper *ds,
1825 const void *inData, int32_t length, void *outData,
1826 UErrorCode *pErrorCode) {
1827 const UDataInfo *pInfo;
1830 const uint8_t *inBytes;
1833 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1834 offset, i, count, stringsCount;
1836 const AlgorithmicRange *inRange;
1837 AlgorithmicRange *outRange;
1839 /* udata_swapDataHeader checks the arguments */
1840 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1841 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1845 /* check data format and format version */
1846 pInfo=(const UDataInfo *)((const char *)inData+4);
1848 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1849 pInfo->dataFormat[1]==0x6e &&
1850 pInfo->dataFormat[2]==0x61 &&
1851 pInfo->dataFormat[3]==0x6d &&
1852 pInfo->formatVersion[0]==1
1854 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1855 pInfo->dataFormat[0], pInfo->dataFormat[1],
1856 pInfo->dataFormat[2], pInfo->dataFormat[3],
1857 pInfo->formatVersion[0]);
1858 *pErrorCode=U_UNSUPPORTED_ERROR;
1862 inBytes=(const uint8_t *)inData+headerSize;
1863 outBytes=(uint8_t *)outData+headerSize;
1865 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1869 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1871 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1873 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1879 /* preflighting: iterate through algorithmic ranges */
1880 offset=algNamesOffset;
1881 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1884 for(i=0; i<count; ++i) {
1885 inRange=(const AlgorithmicRange *)(inBytes+offset);
1886 offset+=ds->readUInt16(inRange->size);
1893 int16_t tokens[512];
1894 uint16_t tokenCount;
1896 uint8_t map[256], trailMap[256];
1898 /* copy the data for inaccessible bytes */
1899 if(inBytes!=outBytes) {
1900 uprv_memcpy(outBytes, inBytes, length);
1903 /* the initial 4 offsets first */
1904 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1905 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1906 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1907 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1910 * now the tokens table
1911 * it needs to be permutated along with the compressed name strings
1913 p=(const uint16_t *)(inBytes+16);
1914 q=(uint16_t *)(outBytes+16);
1916 /* read and swap the tokenCount */
1917 tokenCount=ds->readUInt16(*p);
1918 ds->swapArray16(ds, p, 2, q, pErrorCode);
1922 /* read the first 512 tokens and make the token maps */
1923 if(tokenCount<=512) {
1928 for(i=0; i<count; ++i) {
1929 tokens[i]=udata_readInt16(ds, p[i]);
1932 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1934 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1935 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1936 if(U_FAILURE(*pErrorCode)) {
1941 * swap and permutate the tokens
1942 * go through a temporary array to support in-place swapping
1944 temp=(uint16_t *)uprv_malloc(tokenCount*2);
1946 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1948 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1952 /* swap and permutate single-/lead-byte tokens */
1953 for(i=0; i<tokenCount && i<256; ++i) {
1954 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1957 /* swap and permutate trail-byte tokens */
1958 for(; i<tokenCount; ++i) {
1959 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1962 /* copy the result into the output and free the temporary array */
1963 uprv_memcpy(q, temp, tokenCount*2);
1967 * swap the token strings but not a possible padding byte after
1968 * the terminating NUL of the last string
1970 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1971 outBytes+tokenStringOffset, pErrorCode);
1972 if(U_FAILURE(*pErrorCode)) {
1973 udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1977 /* swap the group table */
1978 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1979 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1980 outBytes+groupsOffset, pErrorCode);
1983 * swap the group strings
1984 * swap the string bytes but not the nibble-encoded string lengths
1986 if(ds->inCharset!=ds->outCharset) {
1987 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1989 const uint8_t *inStrings, *nextInStrings;
1990 uint8_t *outStrings;
1994 inStrings=inBytes+groupStringOffset;
1995 outStrings=outBytes+groupStringOffset;
1997 stringsCount=algNamesOffset-groupStringOffset;
1999 /* iterate through string groups until only a few padding bytes are left */
2000 while(stringsCount>32) {
2001 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2003 /* move past the length bytes */
2004 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2005 outStrings+=nextInStrings-inStrings;
2006 inStrings=nextInStrings;
2008 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2009 stringsCount-=count;
2011 /* swap the string bytes using map[] and trailMap[] */
2014 *outStrings++=map[c];
2018 /* token lead byte: swap the trail byte, too */
2019 *outStrings++=trailMap[*inStrings++];
2026 /* swap the algorithmic ranges */
2027 offset=algNamesOffset;
2028 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2029 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2032 for(i=0; i<count; ++i) {
2033 if(offset>(uint32_t)length) {
2034 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2036 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2040 inRange=(const AlgorithmicRange *)(inBytes+offset);
2041 outRange=(AlgorithmicRange *)(outBytes+offset);
2042 offset+=ds->readUInt16(inRange->size);
2044 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2045 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2046 switch(inRange->type) {
2048 /* swap prefix string */
2049 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2050 outRange+1, pErrorCode);
2051 if(U_FAILURE(*pErrorCode)) {
2052 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2059 /* swap factors and the prefix and factor strings */
2060 uint32_t factorsCount;
2062 factorsCount=inRange->variant;
2063 p=(const uint16_t *)(inRange+1);
2064 q=(uint16_t *)(outRange+1);
2065 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2067 /* swap the strings, up to the last terminating NUL */
2070 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2071 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2074 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2078 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2080 *pErrorCode=U_UNSUPPORTED_ERROR;
2086 return headerSize+(int32_t)offset;
2092 * Hey, Emacs, please set the following:
2095 * indent-tabs-mode: nil