2 ******************************************************************************
4 * Copyright (C) 1999-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
29 /* prototypes ------------------------------------------------------------- */
31 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
33 static const char DATA_NAME[] = "unames";
34 static const char DATA_TYPE[] = "icu";
37 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
38 #define GROUP_MASK (LINES_PER_GROUP-1)
41 * This struct was replaced by explicitly accessing equivalent
42 * fields from triples of uint16_t.
43 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
44 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
45 * would advance by 6 bytes (3 uint16_t).
47 * We can't just change the data structure because it's loaded from a data file,
48 * and we don't want to make it less compact, so we changed the access code.
50 * For details see ICU tickets 6331 and 6008.
53 offsetHigh, offsetLow; / * avoid padding * /
64 * Get the 32-bit group offset.
65 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
66 * @return group offset (int32_t)
68 #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
70 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
71 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
75 uint8_t type, variant;
80 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
84 * Get the groups table from a UCharNames struct.
85 * The groups table consists of one uint16_t groupCount followed by
86 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
87 * and the comment for the old struct Group above.
89 * @param names (const UCharNames *) pointer to the UCharNames indexes
90 * @return (const uint16_t *) pointer to the groups table
92 #define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset)
95 const char *otherName;
99 #define DO_FIND_NAME NULL
101 static UDataMemory *uCharNamesData=NULL;
102 static UCharNames *uCharNames=NULL;
103 static UErrorCode gLoadErrorCode=U_ZERO_ERROR;
106 * Maximum length of character names (regular & 1.0).
108 static int32_t gMaxNameLength=0;
111 * Set of chars used in character names (regular & 1.0).
112 * Chars are platform-dependent (can be EBCDIC).
114 static uint32_t gNameSet[8]={ 0 };
116 #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
117 #define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
118 #define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
120 #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
122 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
131 "combining spacing mark",
132 "decimal digit number",
137 "paragraph separator",
145 "connector punctuation",
151 "initial punctuation",
158 /* implementation ----------------------------------------------------------- */
160 static UBool U_CALLCONV unames_cleanup(void)
163 udata_close(uCharNamesData);
164 uCharNamesData = NULL;
173 static UBool U_CALLCONV
174 isAcceptable(void *context,
175 const char *type, const char *name,
176 const UDataInfo *pInfo) {
179 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
180 pInfo->charsetFamily==U_CHARSET_FAMILY &&
181 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
182 pInfo->dataFormat[1]==0x6e &&
183 pInfo->dataFormat[2]==0x61 &&
184 pInfo->dataFormat[3]==0x6d &&
185 pInfo->formatVersion[0]==1);
189 isDataLoaded(UErrorCode *pErrorCode) {
190 /* load UCharNames from file if necessary */
193 /* do this because double-checked locking is broken */
194 UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
200 /* check error code from previous attempt */
201 if(U_FAILURE(gLoadErrorCode)) {
202 *pErrorCode=gLoadErrorCode;
206 /* open the data outside the mutex block */
207 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
208 if(U_FAILURE(*pErrorCode)) {
209 gLoadErrorCode=*pErrorCode;
213 names=(UCharNames *)udata_getMemory(data);
215 /* in the mutex block, set the data for this process */
218 if(uCharNames==NULL) {
223 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
228 /* if a different thread set it first, then close the extra data */
230 udata_close(data); /* NULL if it was set correctly */
236 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
237 if((bufferLength)>0) { \
244 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
247 * Important: expandName() and compareName() are almost the same -
248 * apply fixes to both.
250 * UnicodeData.txt uses ';' as a field separator, so no
251 * field can contain ';' as part of its contents.
252 * In unames.dat, it is marked as token[';']==-1 only if the
253 * semicolon is used in the data file - which is iff we
254 * have Unicode 1.0 names or ISO comments or aliases.
255 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
256 * although we know that it will never be part of a name.
259 expandName(UCharNames *names,
260 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
261 char *buffer, uint16_t bufferLength) {
262 uint16_t *tokens=(uint16_t *)names+8;
263 uint16_t token, tokenCount=*tokens++, bufferPos=0;
264 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
267 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
269 * skip the modern name if it is not requested _and_
270 * if the semicolon byte value is a character, not a token number
272 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
273 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
275 while(nameLength>0) {
281 } while(--fieldIndex>0);
284 * the semicolon byte value is a token number, therefore
285 * only modern names are stored in unames.dat and there is no
286 * such requested alternate name here
292 /* write each letter directly, and write a token word per token */
293 while(nameLength>0) {
299 /* implicit letter */
300 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
307 if(token==(uint16_t)(-2)) {
308 /* this is a lead byte for a double-byte token */
309 token=tokens[c<<8|*name++];
312 if(token==(uint16_t)(-1)) {
314 /* explicit letter */
315 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
317 /* stop, but skip the semicolon if we are seeking
318 extended names and there was no 2.0 name but there
320 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
321 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
329 /* write token word */
330 uint8_t *tokenString=tokenStrings+token;
331 while((c=*tokenString++)!=0) {
332 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
347 * compareName() is almost the same as expandName() except that it compares
348 * the currently expanded name to an input name.
349 * It returns the match/no match result as soon as possible.
352 compareName(UCharNames *names,
353 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
354 const char *otherName) {
355 uint16_t *tokens=(uint16_t *)names+8;
356 uint16_t token, tokenCount=*tokens++;
357 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
359 const char *origOtherName = otherName;
361 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
363 * skip the modern name if it is not requested _and_
364 * if the semicolon byte value is a character, not a token number
366 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
367 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
369 while(nameLength>0) {
375 } while(--fieldIndex>0);
378 * the semicolon byte value is a token number, therefore
379 * only modern names are stored in unames.dat and there is no
380 * such requested alternate name here
386 /* compare each letter directly, and compare a token word per token */
387 while(nameLength>0) {
393 /* implicit letter */
394 if((char)c!=*otherName++) {
403 if(token==(uint16_t)(-2)) {
404 /* this is a lead byte for a double-byte token */
405 token=tokens[c<<8|*name++];
408 if(token==(uint16_t)(-1)) {
410 /* explicit letter */
411 if((char)c!=*otherName++) {
415 /* stop, but skip the semicolon if we are seeking
416 extended names and there was no 2.0 name but there
418 if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
419 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
427 /* compare token word */
428 uint8_t *tokenString=tokenStrings+token;
429 while((c=*tokenString++)!=0) {
430 if((char)c!=*otherName++) {
438 /* complete match? */
439 return (UBool)(*otherName==0);
442 static uint8_t getCharCat(UChar32 cp) {
445 if (UTF_IS_UNICODE_NONCHAR(cp)) {
446 return U_NONCHARACTER_CODE_POINT;
449 if ((cat = u_charType(cp)) == U_SURROGATE) {
450 cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
456 static const char *getCharCatName(UChar32 cp) {
457 uint8_t cat = getCharCat(cp);
459 /* Return unknown if the table of names above is not up to
462 if (cat >= LENGTHOF(charCatNames)) {
465 return charCatNames[cat];
469 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
470 const char *catname = getCharCatName(code);
476 WRITE_CHAR(buffer, bufferLength, length, '<');
477 while (catname[length - 1]) {
478 WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
480 WRITE_CHAR(buffer, bufferLength, length, '-');
481 for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
485 for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
486 uint8_t v = (uint8_t)(cp & 0xf);
487 buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
491 WRITE_CHAR(buffer, bufferLength, length, '>');
497 * getGroup() does a binary search for the group that contains the
498 * Unicode code point "code".
499 * The return value is always a valid Group* that may contain "code"
500 * or else is the highest group before "code".
501 * If the lowest group is after "code", then that one is returned.
503 static const uint16_t *
504 getGroup(UCharNames *names, uint32_t code) {
505 const uint16_t *groups=GET_GROUPS(names);
506 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
511 /* binary search for the group of names that contains the one for code */
512 while(start<limit-1) {
513 number=(uint16_t)((start+limit)/2);
514 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
521 /* return this regardless of whether it is an exact match */
522 return groups+start*GROUP_LENGTH;
526 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
527 * expands them into offsets and lengths for each string.
528 * Lengths are stored with a variable-width encoding in consecutive nibbles:
529 * If a nibble<0xc, then it is the length itself (0=empty string).
530 * If a nibble>=0xc, then it forms a length value with the following nibble.
531 * For the calculation, see below.
532 * The offsets and lengths arrays must be at least 33 (one more) long because
533 * there is no check here at the end if the last nibble is still used.
535 static const uint8_t *
536 expandGroupLengths(const uint8_t *s,
537 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
538 /* read the lengths of the 32 strings in this group and get each string's offset */
539 uint16_t i=0, offset=0, length=0;
542 /* all 32 lengths must be read to get the offset of the first group string */
543 while(i<LINES_PER_GROUP) {
546 /* read even nibble - MSBs of lengthByte */
548 /* double-nibble length spread across two bytes */
549 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
551 } else if((lengthByte /* &0xf0 */)>=0xc0) {
552 /* double-nibble length spread across this one byte */
553 length=(uint16_t)((lengthByte&0x3f)+12);
555 /* single-nibble length in MSBs */
556 length=(uint16_t)(lengthByte>>4);
566 /* read odd nibble - LSBs of lengthByte */
567 if((lengthByte&0xf0)==0) {
568 /* this nibble was not consumed for a double-nibble length above */
571 /* single-nibble length in LSBs */
579 length=0; /* prevent double-nibble detection in the next iteration */
583 /* now, s is at the first group string */
588 expandGroupName(UCharNames *names, const uint16_t *group,
589 uint16_t lineNumber, UCharNameChoice nameChoice,
590 char *buffer, uint16_t bufferLength) {
591 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
592 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
593 s=expandGroupLengths(s, offsets, lengths);
594 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
595 buffer, bufferLength);
599 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
600 char *buffer, uint16_t bufferLength) {
601 const uint16_t *group=getGroup(names, code);
602 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
603 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
604 buffer, bufferLength);
606 /* group not found */
616 * enumGroupNames() enumerates all the names in one group of 32 strings
617 * and either calls the enumerator function or finds a given input name.
620 enumGroupNames(UCharNames *names, const uint16_t *group,
621 UChar32 start, UChar32 end,
622 UEnumCharNamesFn *fn, void *context,
623 UCharNameChoice nameChoice) {
624 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
625 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
627 s=expandGroupLengths(s, offsets, lengths);
628 if(fn!=DO_FIND_NAME) {
633 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
634 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
635 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
637 /* here, we assume that the buffer is large enough */
639 if(!fn(context, start, nameChoice, buffer, length)) {
646 const char *otherName=((FindName *)context)->otherName;
648 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
649 ((FindName *)context)->code=start;
659 * enumExtNames() enumerates extended names.
660 * It only needs to do it if it is called with a real function and not
661 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
662 * for extended names by itself.
665 enumExtNames(UChar32 start, UChar32 end,
666 UEnumCharNamesFn *fn, void *context)
668 if(fn!=DO_FIND_NAME) {
673 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
674 /* here, we assume that the buffer is large enough */
676 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
688 enumNames(UCharNames *names,
689 UChar32 start, UChar32 limit,
690 UEnumCharNamesFn *fn, void *context,
691 UCharNameChoice nameChoice) {
692 uint16_t startGroupMSB, endGroupMSB, groupCount;
693 const uint16_t *group, *groupLimit;
695 startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
696 endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
698 /* find the group that contains start, or the highest before it */
699 group=getGroup(names, start);
701 if(startGroupMSB==endGroupMSB) {
702 if(startGroupMSB==group[GROUP_MSB]) {
703 /* if start and limit-1 are in the same group, then enumerate only in that one */
704 return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
707 const uint16_t *groups=GET_GROUPS(names);
708 groupCount=*groups++;
709 groupLimit=groups+groupCount*GROUP_LENGTH;
711 if(startGroupMSB==group[GROUP_MSB]) {
712 /* enumerate characters in the partial start group */
713 if((start&GROUP_MASK)!=0) {
714 if(!enumGroupNames(names, group,
715 start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
716 fn, context, nameChoice)) {
719 group=NEXT_GROUP(group); /* continue with the next group */
721 } else if(startGroupMSB>group[GROUP_MSB]) {
722 /* make sure that we start enumerating with the first group after start */
723 const uint16_t *nextGroup=NEXT_GROUP(group);
724 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
725 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
729 if (!enumExtNames(start, end - 1, fn, context)) {
736 /* enumerate entire groups between the start- and end-groups */
737 while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
738 const uint16_t *nextGroup;
739 start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
740 if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
743 nextGroup=NEXT_GROUP(group);
744 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
745 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
749 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
756 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
757 if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
758 return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
759 } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
760 UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
769 /* we have not found a group, which means everything is made of
771 if (nameChoice == U_EXTENDED_CHAR_NAME) {
772 if (limit > UCHAR_MAX_VALUE + 1) {
773 limit = UCHAR_MAX_VALUE + 1;
775 return enumExtNames(start, limit - 1, fn, context);
782 writeFactorSuffix(const uint16_t *factors, uint16_t count,
783 const char *s, /* suffix elements */
785 uint16_t indexes[8], /* output fields from here */
786 const char *elementBases[8], const char *elements[8],
787 char *buffer, uint16_t bufferLength) {
788 uint16_t i, factor, bufferPos=0;
791 /* write elements according to the factors */
794 * the factorized elements are determined by modulo arithmetic
795 * with the factors of this algorithm
797 * note that for fewer operations, count is decremented here
800 for(i=count; i>0; --i) {
802 indexes[i]=(uint16_t)(code%factor);
806 * we don't need to calculate the last modulus because start<=code<=end
807 * guarantees here that code<=factors[0]
809 indexes[0]=(uint16_t)code;
811 /* write each element */
813 if(elementBases!=NULL) {
817 /* skip indexes[i] strings */
829 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
832 /* we do not need to perform the rest of this loop for i==count - break here */
837 /* skip the rest of the strings for this factors[i] */
838 factor=(uint16_t)(factors[i]-indexes[i]-1);
857 * Parts of findAlgName() are almost the same as some of getAlgName().
858 * Fixes must be applied to both.
861 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
862 char *buffer, uint16_t bufferLength) {
863 uint16_t bufferPos=0;
865 /* Only the normative character name can be algorithmic. */
866 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
874 switch(range->type) {
876 /* name = prefix hex-digits */
877 const char *s=(const char *)(range+1);
884 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
887 /* write hexadecimal code point value */
888 count=range->variant;
891 if(count<bufferLength) {
896 if(--i<bufferLength) {
912 /* name = prefix factorized-elements */
914 const uint16_t *factors=(const uint16_t *)(range+1);
915 uint16_t count=range->variant;
916 const char *s=(const char *)(factors+count);
921 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
924 bufferPos+=writeFactorSuffix(factors, count,
925 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
941 * Important: enumAlgNames() and findAlgName() are almost the same.
942 * Any fix must be applied to both.
945 enumAlgNames(AlgorithmicRange *range,
946 UChar32 start, UChar32 limit,
947 UEnumCharNamesFn *fn, void *context,
948 UCharNameChoice nameChoice) {
952 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
956 switch(range->type) {
961 /* get the full name of the start character */
962 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
967 /* call the enumerator function with this first character */
968 if(!fn(context, start, nameChoice, buffer, length)) {
972 /* go to the end of the name; all these names have the same length */
978 /* enumerate the rest of the names */
979 while(++start<limit) {
980 /* increment the hexadecimal number on a character-basis */
984 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
995 if(!fn(context, start, nameChoice, buffer, length)) {
1002 uint16_t indexes[8];
1003 const char *elementBases[8], *elements[8];
1004 const uint16_t *factors=(const uint16_t *)(range+1);
1005 uint16_t count=range->variant;
1006 const char *s=(const char *)(factors+count);
1008 uint16_t prefixLength, i, idx;
1012 /* name = prefix factorized-elements */
1017 while((c=*s++)!=0) {
1022 /* append the suffix of the start character */
1023 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1024 s, (uint32_t)start-range->start,
1025 indexes, elementBases, elements,
1026 suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1028 /* call the enumerator function with this first character */
1029 if(!fn(context, start, nameChoice, buffer, length)) {
1033 /* enumerate the rest of the names */
1034 while(++start<limit) {
1035 /* increment the indexes in lexical order bound by the factors */
1038 idx=(uint16_t)(indexes[--i]+1);
1039 if(idx<factors[i]) {
1040 /* skip one index and its element string */
1048 /* reset this index to 0 and its element string to the first one */
1050 elements[i]=elementBases[i];
1054 /* to make matters a little easier, just append all elements to the suffix */
1056 length=prefixLength;
1057 for(i=0; i<count; ++i) {
1059 while((c=*s++)!=0) {
1064 /* zero-terminate */
1067 if(!fn(context, start, nameChoice, buffer, length)) {
1074 /* undefined type */
1082 * findAlgName() is almost the same as enumAlgNames() except that it
1083 * returns the code point for a name if it fits into the range.
1084 * It returns 0xffff otherwise.
1087 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1090 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1094 switch(range->type) {
1096 /* name = prefix hex-digits */
1097 const char *s=(const char *)(range+1);
1102 /* compare prefix */
1103 while((c=*s++)!=0) {
1104 if((char)c!=*otherName++) {
1109 /* read hexadecimal code point value */
1110 count=range->variant;
1112 for(i=0; i<count; ++i) {
1114 if('0'<=c && c<='9') {
1115 code=(code<<4)|(c-'0');
1116 } else if('A'<=c && c<='F') {
1117 code=(code<<4)|(c-'A'+10);
1123 /* does it fit into the range? */
1124 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1131 uint16_t indexes[8];
1132 const char *elementBases[8], *elements[8];
1133 const uint16_t *factors=(const uint16_t *)(range+1);
1134 uint16_t count=range->variant;
1135 const char *s=(const char *)(factors+count), *t;
1136 UChar32 start, limit;
1141 /* name = prefix factorized-elements */
1143 /* compare prefix */
1144 while((c=*s++)!=0) {
1145 if((char)c!=*otherName++) {
1150 start=(UChar32)range->start;
1151 limit=(UChar32)(range->end+1);
1153 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1154 writeFactorSuffix(factors, count, s, 0,
1155 indexes, elementBases, elements, buffer, sizeof(buffer));
1157 /* compare the first suffix */
1158 if(0==uprv_strcmp(otherName, buffer)) {
1162 /* enumerate and compare the rest of the suffixes */
1163 while(++start<limit) {
1164 /* increment the indexes in lexical order bound by the factors */
1167 idx=(uint16_t)(indexes[--i]+1);
1168 if(idx<factors[i]) {
1169 /* skip one index and its element string */
1176 /* reset this index to 0 and its element string to the first one */
1178 elements[i]=elementBases[i];
1182 /* to make matters a little easier, just compare all elements of the suffix */
1184 for(i=0; i<count; ++i) {
1186 while((c=*s++)!=0) {
1188 s=""; /* does not match */
1200 /* undefined type */
1207 /* sets of name characters, maximum name lengths ---------------------------- */
1209 #define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f)))
1210 #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0)
1213 calcStringSetLength(uint32_t set[8], const char *s) {
1217 while((c=*s++)!=0) {
1225 calcAlgNameSetsLengths(int32_t maxNameLength) {
1226 AlgorithmicRange *range;
1228 uint32_t rangeCount;
1231 /* enumerate algorithmic ranges */
1232 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1234 range=(AlgorithmicRange *)(p+1);
1235 while(rangeCount>0) {
1236 switch(range->type) {
1238 /* name = prefix + (range->variant times) hex-digits */
1240 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1241 if(length>maxNameLength) {
1242 maxNameLength=length;
1246 /* name = prefix factorized-elements */
1247 const uint16_t *factors=(const uint16_t *)(range+1);
1249 int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1252 s=(const char *)(factors+count);
1253 length=calcStringSetLength(gNameSet, s);
1254 s+=length+1; /* start of factor suffixes */
1256 /* get the set and maximum factor suffix length for each factor */
1257 for(i=0; i<count; ++i) {
1259 for(factor=factors[i]; factor>0; --factor) {
1260 factorLength=calcStringSetLength(gNameSet, s);
1262 if(factorLength>maxFactorLength) {
1263 maxFactorLength=factorLength;
1266 length+=maxFactorLength;
1269 if(length>maxNameLength) {
1270 maxNameLength=length;
1279 range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1282 return maxNameLength;
1286 calcExtNameSetsLengths(int32_t maxNameLength) {
1289 for(i=0; i<LENGTHOF(charCatNames); ++i) {
1291 * for each category, count the length of the category name
1295 * 6 for most hex digits per code point
1297 length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1298 if(length>maxNameLength) {
1299 maxNameLength=length;
1302 return maxNameLength;
1306 calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1308 const uint8_t **pLine, const uint8_t *lineLimit) {
1309 const uint8_t *line=*pLine;
1310 int32_t length=0, tokenLength;
1313 while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1315 /* implicit letter */
1320 if(token==(uint16_t)(-2)) {
1321 /* this is a lead byte for a double-byte token */
1325 if(token==(uint16_t)(-1)) {
1326 /* explicit letter */
1330 /* count token word */
1331 if(tokenLengths!=NULL) {
1332 /* use cached token length */
1333 tokenLength=tokenLengths[c];
1334 if(tokenLength==0) {
1335 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1336 tokenLengths[c]=(int8_t)tokenLength;
1339 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1341 length+=tokenLength;
1351 calcGroupNameSetsLengths(int32_t maxNameLength) {
1352 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1354 uint16_t *tokens=(uint16_t *)uCharNames+8;
1355 uint16_t tokenCount=*tokens++;
1356 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1358 int8_t *tokenLengths;
1360 const uint16_t *group;
1361 const uint8_t *s, *line, *lineLimit;
1363 int32_t groupCount, lineNumber, length;
1365 tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1366 if(tokenLengths!=NULL) {
1367 uprv_memset(tokenLengths, 0, tokenCount);
1370 group=GET_GROUPS(uCharNames);
1371 groupCount=*group++;
1373 /* enumerate all groups */
1374 while(groupCount>0) {
1375 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1376 s=expandGroupLengths(s, offsets, lengths);
1378 /* enumerate all lines in each group */
1379 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1380 line=s+offsets[lineNumber];
1381 length=lengths[lineNumber];
1386 lineLimit=line+length;
1388 /* read regular name */
1389 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390 if(length>maxNameLength) {
1391 maxNameLength=length;
1393 if(line==lineLimit) {
1397 /* read Unicode 1.0 name */
1398 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1399 if(length>maxNameLength) {
1400 maxNameLength=length;
1402 if(line==lineLimit) {
1406 /* read ISO comment */
1407 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1410 group=NEXT_GROUP(group);
1414 if(tokenLengths!=NULL) {
1415 uprv_free(tokenLengths);
1418 /* set gMax... - name length last for threading */
1419 gMaxNameLength=maxNameLength;
1423 calcNameSetsLengths(UErrorCode *pErrorCode) {
1424 static const char extChars[]="0123456789ABCDEF<>-";
1425 int32_t i, maxNameLength;
1427 if(gMaxNameLength!=0) {
1431 if(!isDataLoaded(pErrorCode)) {
1435 /* set hex digits, used in various names, and <>-, used in extended names */
1436 for(i=0; i<sizeof(extChars)-1; ++i) {
1437 SET_ADD(gNameSet, extChars[i]);
1440 /* set sets and lengths from algorithmic names */
1441 maxNameLength=calcAlgNameSetsLengths(0);
1443 /* set sets and lengths from extended names */
1444 maxNameLength=calcExtNameSetsLengths(maxNameLength);
1446 /* set sets and lengths from group names, set global maximum values */
1447 calcGroupNameSetsLengths(maxNameLength);
1452 /* public API --------------------------------------------------------------- */
1454 U_CAPI int32_t U_EXPORT2
1455 u_charName(UChar32 code, UCharNameChoice nameChoice,
1456 char *buffer, int32_t bufferLength,
1457 UErrorCode *pErrorCode) {
1458 AlgorithmicRange *algRange;
1463 /* check the argument values */
1464 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1466 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1467 bufferLength<0 || (bufferLength>0 && buffer==NULL)
1469 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1473 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1474 return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1479 /* try algorithmic names first */
1480 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1482 algRange=(AlgorithmicRange *)(p+1);
1484 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1485 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1488 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1493 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1494 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1496 /* extended character name */
1497 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1500 /* normal character name */
1501 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1505 return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1508 U_CAPI int32_t U_EXPORT2
1509 u_getISOComment(UChar32 c,
1510 char *dest, int32_t destCapacity,
1511 UErrorCode *pErrorCode) {
1514 /* check the argument values */
1515 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1517 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1518 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1522 if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1523 return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1526 /* the ISO comment is stored like a normal character name */
1527 length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
1528 return u_terminateChars(dest, destCapacity, length, pErrorCode);
/*
 * Public API: find the code point that carries the given name.
 *
 * Processing order, as visible in this function:
 *  1. Argument checks: a NULL/failing pErrorCode is rejected; an out-of-range
 *     nameChoice, a NULL name or an empty name set U_ILLEGAL_ARGUMENT_ERROR.
 *  2. The names data must be loadable (isDataLoaded()).
 *  3. The name is copied into an uppercase and a lowercase buffer of 120
 *     bytes each; a name that fills the whole buffer without reaching its
 *     NUL sets U_ILLEGAL_CHAR_FOUND.
 *  4. A name starting with '<' is only meaningful for U_EXTENDED_CHAR_NAME:
 *     it is parsed as "<category-HEX>", the hex digits are accumulated into
 *     cp, and the category text is compared against charCatNames[] and
 *     getCharCat(cp). Any other outcome sets U_ILLEGAL_CHAR_FOUND.
 *  5. Each algorithmic range is asked via findAlgName() using the uppercase
 *     name; 0xffff is its "not found" value.
 *  6. Finally enumNames() is run over all code points with DO_FIND_NAME; if
 *     findName.code is still the sentinel afterwards, U_ILLEGAL_CHAR_FOUND is
 *     set. findName.code is returned.
 *
 * The sentinel `error` is 0xffff. The early-exit return statements are not
 * part of this listing; presumably they return `error` - confirm.
 */
1531 U_CAPI UChar32 U_EXPORT2
1532 u_charFromName(UCharNameChoice nameChoice,
1534 UErrorCode *pErrorCode) {
1535 char upper[120], lower[120];
1537 AlgorithmicRange *algRange;
1542 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1544 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1548 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1549 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1553 if(!isDataLoaded(pErrorCode)) {
1557 /* construct the uppercase and lowercase of the name first */
1558 for(i=0; i<sizeof(upper); ++i) {
1559 if((c0=*name++)!=0) {
1560 upper[i]=uprv_toupper(c0);
1561 lower[i]=uprv_tolower(c0);
1563 upper[i]=lower[i]=0;
/* i==sizeof(upper) means the loop ran to the end of the buffer without seeing the NUL */
1567 if(i==sizeof(upper)) {
1568 /* name too long, there is no such character */
1569 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1573 /* try extended names first */
1574 if (lower[0] == '<') {
1575 if (nameChoice == U_EXTENDED_CHAR_NAME) {
/* here i is the name length, so --i addresses the last character */
1576 if (lower[--i] == '>') {
/*
 * Scan backwards for the '-' that separates the category from the hex digits.
 * NOTE(review): the only stop conditions are a NUL byte or a '-'. lower[0] is
 * '<' (neither), so for a name without any '-' such as "<abc>" the index is
 * decremented past 0 and lower[] is read out of bounds. A lower bound on i
 * appears to be missing - confirm against the full source and fix.
 */
1577 for (--i; lower[i] && lower[i] != '-'; --i) {
1580 if (lower[i] == '-') { /* We've got a category. */
/*
 * Accumulate the lowercase hex digits up to the closing '>'.
 * NOTE(review): the number of digits is not limited here, so cp can exceed
 * the code point range (and overflow) before it reaches getCharCat(cp) -
 * confirm whether getCharCat() tolerates arbitrary values.
 */
1585 for (++i; lower[i] != '>'; ++i) {
1586 if (lower[i] >= '0' && lower[i] <= '9') {
1587 cp = (cp << 4) + lower[i] - '0';
1588 } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1589 cp = (cp << 4) + lower[i] - 'a' + 10;
1591 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1596 /* Now validate the category name.
1597 We could use a binary search, or a trie, if
1598 we really wanted to. */
/* lower[i]=0 overwrites the '>' ; lower+1 skips the leading '<' for the comparison */
1600 for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
1602 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1603 if (getCharCat(cp) == cIdx) {
1613 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1617 /* try algorithmic names now */
/* p addresses the data at algNamesOffset; the first range record starts one uint32_t after p */
1618 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1620 algRange=(AlgorithmicRange *)(p+1);
1622 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
/* ranges are variable-length records; size is the byte distance to the next one */
1625 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1629 /* normal character name */
1630 findName.otherName=upper;
1631 findName.code=error;
1632 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1633 if (findName.code == error) {
1634 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1636 return findName.code;
/*
 * Public API: enumerate character names for the code points in [start, limit).
 *
 * After argument validation (NULL/failing pErrorCode; out-of-range nameChoice
 * or NULL fn set U_ILLEGAL_ARGUMENT_ERROR), limit is clamped to
 * UCHAR_MAX_VALUE+1 and the range is tested for emptiness with an unsigned
 * start>=limit comparison.
 *
 * The enumeration walks the algorithmic ranges in storage order and, for each
 * one, first hands the code points below the range to enumNames() and then
 * the code points inside the range to enumAlgNames(). Whatever remains after
 * the last algorithmic range goes to enumNames(). fn and context are passed
 * through unchanged to those helpers.
 *
 * The bodies of the `if(!enum...())` tests are not part of this listing;
 * presumably a false result ends the enumeration early - confirm.
 */
1639 U_CAPI void U_EXPORT2
1640 u_enumCharNames(UChar32 start, UChar32 limit,
1641 UEnumCharNamesFn *fn,
1643 UCharNameChoice nameChoice,
1644 UErrorCode *pErrorCode) {
1645 AlgorithmicRange *algRange;
1649 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1653 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1654 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
/* the unsigned cast makes a negative limit compare as a huge value, so it is clamped as well */
1658 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1659 limit = UCHAR_MAX_VALUE + 1;
1661 if((uint32_t)start>=(uint32_t)limit) {
1665 if(!isDataLoaded(pErrorCode)) {
1669 /* interleave the data-driven ones with the algorithmic ones */
1670 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1671 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1673 algRange=(AlgorithmicRange *)(p+1);
1675 /* enumerate the character names before the current algorithmic range */
1676 /* here: start<limit */
1677 if((uint32_t)start<algRange->start) {
/* the whole remaining request lies below this range: one data-driven pass covers it */
1678 if((uint32_t)limit<=algRange->start) {
1679 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1682 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1685 start=(UChar32)algRange->start;
1687 /* enumerate the character names in the current algorithmic range */
1688 /* here: algRange->start<=start<limit */
1689 if((uint32_t)start<=algRange->end) {
/* algRange->end is inclusive, hence the +1 when it is compared with or used as an exclusive limit */
1690 if((uint32_t)limit<=(algRange->end+1)) {
1691 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1694 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1697 start=(UChar32)algRange->end+1;
1699 /* continue to the next algorithmic range (here: start<limit) */
1700 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1703 /* enumerate the character names after the last algorithmic range */
1704 enumNames(uCharNames, start, limit, fn, context, nameChoice);
/*
 * Internal API: return gMaxNameLength after calcNameSetsLengths() has
 * succeeded. A local UErrorCode is used, so no error is reported to the
 * caller. The branch taken when calcNameSetsLengths() fails is not part of
 * this listing.
 */
1707 U_CAPI int32_t U_EXPORT2
1708 uprv_getMaxCharNameLength() {
1709 UErrorCode errorCode=U_ZERO_ERROR;
1710 if(calcNameSetsLengths(&errorCode)) {
1711 return gMaxNameLength;
1718 * Converts the char set cset into a Unicode set uset.
1719 * @param cset Set of 256 bit flags corresponding to a set of chars.
1720 * @param uset USet to receive characters. Existing contents are deleted.
/*
 * Feed every char contained in the 256-bit set cset to the USetAdder sa.
 *
 * Steps: make sure the name sets are available (calcNameSetsLengths()),
 * collect the member byte values 0..255 into the char string cs, convert that
 * string to UChars with u_charsToUChars(), then call sa->add(sa->set, ...)
 * for each converted unit. A converted value of 0 is only added when the
 * source char itself was 0, because 0 is also what a char without a valid
 * conversion turns into.
 */
1723 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1728 UErrorCode errorCode;
1730 errorCode=U_ZERO_ERROR;
1732 if(!calcNameSetsLengths(&errorCode)) {
1736 /* build a char string with all chars that are used in character names */
1738 for(i=0; i<256; ++i) {
1739 if(SET_CONTAINS(cset, i)) {
1740 cs[length++]=(char)i;
1744 /* convert the char string to a UChar string */
1745 u_charsToUChars(cs, us, length);
1747 /* add each UChar to the USet */
1748 for(i=0; i<length; ++i) {
1749 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1750 sa->add(sa->set, us[i]);
1756 * Fills set with characters that are used in Unicode character names.
1757 * @param set USet to receive characters.
/* Internal API: thin wrapper that passes the file-level set gNameSet to charSetToUSet(). */
1759 U_CAPI void U_EXPORT2
1760 uprv_getCharNameCharacters(const USetAdder *sa) {
1761 charSetToUSet(gNameSet, sa);
1764 /* data swapping ------------------------------------------------------------ */
1767 * The token table contains non-negative entries for token bytes,
1768 * and -1 for bytes that represent themselves in the data file's charset.
1769 * -2 entries are used for lead bytes.
1771 * Direct bytes (-1 entries) must be translated from the input charset family
1772 * to the output charset family.
1773 * makeTokenMap() writes a permutation mapping for this.
1774 * Use it once for single-/lead-byte tokens and once more for all trail byte
1775 * tokens. (';' is an unused trail byte marked with -1.)
/*
 * Build a byte permutation map[] for up to 256 token table entries.
 *
 * Nothing is done when *pErrorCode already indicates a failure. When the
 * input and output charset families are equal, the map is the identity.
 * Otherwise map[] and usedOutChar[] are cleared, tokenCount is tested
 * against 256 (the body of that test is not part of this listing), and the
 * map is filled in two passes:
 *  - first pass: direct bytes are converted with ds->swapInvChars() and the
 *    resulting output byte is marked as used; a conversion failure is
 *    reported with udata_printError();
 *  - second pass: the remaining entries receive, in ascending order, output
 *    byte values that the first pass did not mark as used.
 * Index 0 is skipped by both passes, which start at 1.
 */
1778 makeTokenMap(const UDataSwapper *ds,
1779 int16_t tokens[], uint16_t tokenCount,
1781 UErrorCode *pErrorCode) {
1782 UBool usedOutChar[256];
1786 if(U_FAILURE(*pErrorCode)) {
1790 if(ds->inCharset==ds->outCharset) {
1791 /* Same charset family: identity permutation */
1792 for(i=0; i<256; ++i) {
1796 uprv_memset(map, 0, 256);
/* NOTE(review): the literal 256 is a byte count; it clears all of usedOutChar[] only if sizeof(UBool)==1 - confirm */
1797 uprv_memset(usedOutChar, 0, 256);
1799 if(tokenCount>256) {
1803 /* set the direct bytes (byte 0 always maps to itself) */
1804 for(i=1; i<tokenCount; ++i) {
1806 /* convert the direct byte character */
1808 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1809 if(U_FAILURE(*pErrorCode)) {
1810 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1815 /* enter the converted character into the map and mark it used */
1817 usedOutChar[c2]=TRUE;
1821 /* set the mappings for the rest of the permutation */
/* j is the next candidate output byte; it only moves forward, so each value is handed out at most once */
1822 for(i=j=1; i<tokenCount; ++i) {
1823 /* set mappings that were not set for direct bytes */
1825 /* set an output byte value that was not used as an output byte above */
1826 while(usedOutChar[j]) {
1829 map[i]=(uint8_t)j++;
1834 * leave mappings at tokenCount and above unset if tokenCount<256
1835 * because they won't be used
/*
 * Data swapper for unames.icu.
 *
 * Outline, as visible in this function:
 *  1. udata_swapDataHeader() swaps the header and validates the arguments.
 *  2. The data format must be "unam" (0x75 0x6e 0x61 0x6d) with
 *     formatVersion[0]==1, otherwise U_UNSUPPORTED_ERROR is set.
 *  3. algNamesOffset is read from the fourth uint32_t after the header; data
 *     that is too short sets U_INDEX_OUTOFBOUNDS_ERROR.
 *  4. Preflighting only walks the algorithmic ranges to find the end offset.
 *  5. Real swapping: the bytes are copied to the output when the buffers
 *     differ, then the four offsets, the token table (permuted with the maps
 *     from makeTokenMap() through a temporary array), the token strings, the
 *     group table, the group strings (only when the charset families differ)
 *     and the algorithmic ranges are processed in that order.
 *  6. The result is headerSize plus the offset reached after the last
 *     algorithmic range.
 */
1840 U_CAPI int32_t U_EXPORT2
1841 uchar_swapNames(const UDataSwapper *ds,
1842 const void *inData, int32_t length, void *outData,
1843 UErrorCode *pErrorCode) {
1844 const UDataInfo *pInfo;
1847 const uint8_t *inBytes;
1850 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1851 offset, i, count, stringsCount;
1853 const AlgorithmicRange *inRange;
1854 AlgorithmicRange *outRange;
1856 /* udata_swapDataHeader checks the arguments */
1857 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1858 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1862 /* check data format and format version */
1863 pInfo=(const UDataInfo *)((const char *)inData+4);
1865 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1866 pInfo->dataFormat[1]==0x6e &&
1867 pInfo->dataFormat[2]==0x61 &&
1868 pInfo->dataFormat[3]==0x6d &&
1869 pInfo->formatVersion[0]==1
1871 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1872 pInfo->dataFormat[0], pInfo->dataFormat[1],
1873 pInfo->dataFormat[2], pInfo->dataFormat[3],
1874 pInfo->formatVersion[0]);
1875 *pErrorCode=U_UNSUPPORTED_ERROR;
/* from here on all offsets are relative to the first byte after the header */
1879 inBytes=(const uint8_t *)inData+headerSize;
1880 outBytes=(uint8_t *)outData+headerSize;
/* index 3 of the leading uint32_t array is the algorithmic-names offset (index 0..2 are read further below) */
1882 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1886 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1888 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1890 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1896 /* preflighting: iterate through algorithmic ranges */
/* NOTE(review): no bounds check on offset is visible in this loop, unlike the swapping loop near the end of the function - confirm */
1897 offset=algNamesOffset;
1898 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1901 for(i=0; i<count; ++i) {
1902 inRange=(const AlgorithmicRange *)(inBytes+offset);
1903 offset+=ds->readUInt16(inRange->size);
1910 int16_t tokens[512];
1911 uint16_t tokenCount;
1913 uint8_t map[256], trailMap[256];
1915 /* copy the data for inaccessible bytes */
1916 if(inBytes!=outBytes) {
1917 uprv_memcpy(outBytes, inBytes, length);
1920 /* the initial 4 offsets first */
1921 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1922 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1923 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1924 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1927 * now the tokens table
1928 * it needs to be permutated along with the compressed name strings
/* the token table starts right after the 16 bytes of offsets; its first uint16_t is the token count */
1930 p=(const uint16_t *)(inBytes+16);
1931 q=(uint16_t *)(outBytes+16);
1933 /* read and swap the tokenCount */
1934 tokenCount=ds->readUInt16(*p);
1935 ds->swapArray16(ds, p, 2, q, pErrorCode);
1939 /* read the first 512 tokens and make the token maps */
1940 if(tokenCount<=512) {
1945 for(i=0; i<count; ++i) {
1946 tokens[i]=udata_readInt16(ds, p[i]);
1949 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
/* map[] covers single-/lead-byte tokens (first 256 entries), trailMap[] the trail-byte tokens (entries from 256 on) */
1951 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1952 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1953 if(U_FAILURE(*pErrorCode)) {
1958 * swap and permutate the tokens
1959 * go through a temporary array to support in-place swapping
/* allocation size is in bytes: two per uint16_t token */
1961 temp=(uint16_t *)uprv_malloc(tokenCount*2);
1963 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1965 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1969 /* swap and permutate single-/lead-byte tokens */
1970 for(i=0; i<tokenCount && i<256; ++i) {
1971 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1974 /* swap and permutate trail-byte tokens */
/* (i&0xffffff00) keeps the start of the 256-entry block; only the low byte is permuted through trailMap[] */
1975 for(; i<tokenCount; ++i) {
1976 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1979 /* copy the result into the output and free the temporary array */
1980 uprv_memcpy(q, temp, tokenCount*2);
1984 * swap the token strings but not a possible padding byte after
1985 * the terminating NUL of the last string
1987 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1988 outBytes+tokenStringOffset, pErrorCode);
1989 if(U_FAILURE(*pErrorCode)) {
1990 udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1994 /* swap the group table */
/* one uint16_t group count followed by count groups of three uint16_t each, hence (1+count*3)*2 bytes */
1995 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1996 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1997 outBytes+groupsOffset, pErrorCode);
2000 * swap the group strings
2001 * swap the string bytes but not the nibble-encoded string lengths
2003 if(ds->inCharset!=ds->outCharset) {
2004 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
2006 const uint8_t *inStrings, *nextInStrings;
2007 uint8_t *outStrings;
2011 inStrings=inBytes+groupStringOffset;
2012 outStrings=outBytes+groupStringOffset;
/* the group strings occupy everything between groupStringOffset and algNamesOffset */
2014 stringsCount=algNamesOffset-groupStringOffset;
2016 /* iterate through string groups until only a few padding bytes are left */
2017 while(stringsCount>32) {
2018 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2020 /* move past the length bytes */
2021 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2022 outStrings+=nextInStrings-inStrings;
2023 inStrings=nextInStrings;
/* NOTE(review): index 31 is hard-coded; presumably it is LINES_PER_GROUP-1 (the last line of a group) - confirm against GROUP_SHIFT */
2025 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2026 stringsCount-=count;
2028 /* swap the string bytes using map[] and trailMap[] */
2031 *outStrings++=map[c];
2035 /* token lead byte: swap the trail byte, too */
2036 *outStrings++=trailMap[*inStrings++];
2043 /* swap the algorithmic ranges */
2044 offset=algNamesOffset;
2045 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2046 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2049 for(i=0; i<count; ++i) {
/* guard against a range record that would start beyond the supplied data */
2050 if(offset>(uint32_t)length) {
2051 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2053 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2057 inRange=(const AlgorithmicRange *)(inBytes+offset);
2058 outRange=(AlgorithmicRange *)(outBytes+offset);
/* advance offset first: it then marks the end of this record, which the factor-string handling below relies on */
2059 offset+=ds->readUInt16(inRange->size);
/* the first 8 bytes of a record are swapped as 32-bit units, then the 16-bit size field */
2061 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2062 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2063 switch(inRange->type) {
2065 /* swap prefix string */
2066 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2067 outRange+1, pErrorCode);
2068 if(U_FAILURE(*pErrorCode)) {
2069 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2076 /* swap factors and the prefix and factor strings */
2077 uint32_t factorsCount;
/* for this range type the variant field holds the number of uint16_t factors that follow the record header */
2079 factorsCount=inRange->variant;
2080 p=(const uint16_t *)(inRange+1);
2081 q=(uint16_t *)(outRange+1);
2082 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2084 /* swap the strings, up to the last terminating NUL */
2087 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
/* trim trailing bytes after the last NUL so they are not handed to swapInvChars() */
2088 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2091 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2095 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2097 *pErrorCode=U_UNSUPPORTED_ERROR;
2103 return headerSize+(int32_t)offset;
2107 * Hey, Emacs, please set the following:
2110 * indent-tabs-mode: nil