1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2015 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify it
5 under the terms of the GNU General Public License as published
6 by the Free Software Foundation; either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/* Number of elements of the array A, computed as its total size divided by
   the size of its first element.  A must be a real array, not a pointer.
   NOTE(review): the argument is not parenthesized in a[0]; every use visible
   in this file passes a plain array name, for which that is harmless.  */
28 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
31 /* Table of Unicode character names, derived from UnicodeData.txt.
32 This table is generated in a way to minimize the memory footprint:
33 1. its compiled size is small (less than 350 KB),
34 2. it resides entirely in the text or read-only data segment of the
35 executable or shared library: the table contains only immediate
36 integers, no pointers, and the functions don't do heap allocation.
40 static const char unicode_name_words[36303] = ...;
41 #define UNICODE_CHARNAME_NUM_WORDS 6260
42 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
43 #define UNICODE_CHARNAME_WORD_HANGUL 3902
44 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
45 #define UNICODE_CHARNAME_WORD_CJK 417
46 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
47 static const uint16_t unicode_names[68940] = ...;
48 static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
49 static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
50 #define UNICODE_CHARNAME_MAX_LENGTH 83
51 #define UNICODE_CHARNAME_MAX_WORDS 13
52 static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
55 /* Returns the word with a given index. */
/* INDEX must be smaller than UNICODE_CHARNAME_NUM_WORDS; this is asserted.
   The words are stored back to back in unicode_name_words, grouped by
   length: entry i of unicode_name_by_length gives, for the words of length
   i, the index of the first such word (ind_offset) and the position of its
   first character (extra_offset).  The result therefore points into
   unicode_name_words, at a word of i characters.
   LENGTHP is an output parameter; presumably it receives that length i
   (the store is not among the lines visible here -- TODO confirm).  */
57 unicode_name_word (unsigned int index, unsigned int *lengthp)
63 assert (index < UNICODE_CHARNAME_NUM_WORDS);
65 /* Binary search for i with
66 unicode_name_by_length[i].ind_offset <= index
68 index < unicode_name_by_length[i+1].ind_offset
72 i2 = SIZEOF (unicode_name_by_length) - 1;
75 unsigned int i = (i1 + i2) >> 1;
76 if (unicode_name_by_length[i].ind_offset <= index)
82 assert (unicode_name_by_length[i].ind_offset <= index
83 && index < unicode_name_by_length[i+1].ind_offset);
85 return &unicode_name_words[unicode_name_by_length[i].extra_offset
86 + (index-unicode_name_by_length[i].ind_offset)*i];
89 /* Looks up the index of a word. */
/* WORD points to LENGTH characters; it is searched only among the stored
   words of exactly that length.
   NOTE(review): the comparison loop and the value returned when the word is
   not found are not among the lines visible here.  */
91 unicode_name_word_lookup (const char *word, unsigned int length)
/* Entry length+1 of unicode_name_by_length is read as the upper bound of the
   search, so only lengths from 1 to SIZEOF (unicode_name_by_length) - 2 can
   be looked up.  */
93 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
95 /* Binary search among the words of given length. */
96 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
97 unsigned int i0 = unicode_name_by_length[length].ind_offset;
/* i2 is the index of the first word of the next greater length, that is,
   the exclusive upper bound of the search interval.  */
99 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
102 unsigned int i = (i1 + i2) >> 1;
/* p is the stored word number i: words of this length start at extra_offset
   and each occupies exactly length characters.  w and n are the input word
   and the number of characters to compare.  */
103 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
104 const char *w = word;
105 unsigned int n = length;
112 /* Note here: i1 < i < i2. */
118 /* Note here: i1 <= i < i2. */
/* Value returned by unicode_code_to_index for a character that is not
   covered by any entry of unicode_ranges.  */
131 #define UNINAME_INVALID_INDEX UINT16_MAX
133 /* Looks up the internal index of a Unicode character. */
/* Each entry of unicode_ranges describes a run of consecutive code points:
   the run starts at code point index + gap, contains length characters, and
   a code point C inside it has the internal index C - gap.
   Returns UNINAME_INVALID_INDEX if no entry contains C.  */
135 unicode_code_to_index (ucs4_t c)
137 /* Binary search in unicode_ranges. */
139 unsigned int i2 = SIZEOF (unicode_ranges);
143 unsigned int i = (i1 + i2) >> 1;
/* First and last code point (both inclusive) covered by entry i.  */
145 unicode_ranges[i].index + unicode_ranges[i].gap;
147 start_code + unicode_ranges[i].length - 1;
149 if (start_code <= c && c <= end_code)
150 return c - unicode_ranges[i].gap;
156 /* Note here: i1 < i < i2. */
159 else if (c < start_code)
163 /* Note here: i1 <= i < i2. */
/* The search interval became empty without finding a containing range.  */
167 return UNINAME_INVALID_INDEX;
/* Inverse of unicode_code_to_index: an entry of unicode_ranges covers the
   internal indices index .. index + length - 1, and the code point of an
   internal index inside that run is the index plus the entry's gap.
   The final return yields UNINAME_INVALID, presumably reached when no entry
   covers INDEX.  */
170 /* Looks up the codepoint of a Unicode character, from the given
173 unicode_index_to_code (uint16_t index)
175 /* Binary search in unicode_ranges. */
177 unsigned int i2 = SIZEOF (unicode_ranges);
181 unsigned int i = (i1 + i2) >> 1;
/* First and last internal index (both inclusive) covered by entry i.  */
182 uint16_t start_index = unicode_ranges[i].index;
183 uint16_t end_index = start_index + unicode_ranges[i].length - 1;
185 if (start_index <= index && index <= end_index)
186 return index + unicode_ranges[i].gap;
188 if (end_index < index)
192 /* Note here: i1 < i < i2. */
195 else if (index < start_index)
199 /* Note here: i1 <= i < i2. */
203 return UNINAME_INVALID;
207 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
208 sections 3.11 and 4.4. */
/* Short names of the 19 initial consonants, at most 2 letters each plus the
   terminating NUL.  Entry 11 is the empty name.  */
209 static const char jamo_initial_short_name[19][3] =
211 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
212 "C", "K", "T", "P", "H"
/* Short names of the 21 medial vowels, at most 3 letters each plus the
   terminating NUL.  */
214 static const char jamo_medial_short_name[21][4] =
216 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
217 "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
/* Short names of the 28 final consonants, at most 2 letters each plus the
   terminating NUL.  Entry 0 is the empty name, presumably for syllables
   without a final consonant.
   NOTE(review): entry 5 is "NI", whereas the Jamo short name that Unicode's
   Jamo.txt gives for U+11AC is "NJ" -- confirm against the Unicode data.
   The name parser in unicode_name_character accepts both 'I' and 'J' as
   letters of a final consonant.  */
219 static const char jamo_final_short_name[28][3] =
221 "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
222 "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
225 /* Looks up the name of a Unicode character, in uppercase ASCII.
226 Returns the filled buf, or NULL if the character does not have a name. */
/* Four cases are distinguished: Hangul syllables, CJK compatibility
   ideographs and variation selectors have algorithmic names that are
   generated here; every other character goes through unicode_code_to_index
   and the unicode_index_to_name table.  BUF is supplied by the caller; the
   space each case needs is stated in the respective branch.  */
228 unicode_character_name (ucs4_t c, char *buf)
230 if (c >= 0xAC00 && c <= 0xD7A3)
232 /* Special case for Hangul syllables. Keeps the tables small. */
240 /* buf needs to have at least 16 + 7 bytes here. */
241 memcpy (buf, "HANGUL SYLLABLE ", 16);
/* The syllable number decomposes as (index1 * 21 + index2) * 28 + index3;
   the final (index3) and medial (index2) parts are split off here, and the
   remaining quotient is presumably the initial part index1.  */
245 index3 = tmp % 28; tmp = tmp / 28;
246 index2 = tmp % 21; tmp = tmp / 21;
/* q walks over the short name of each of the three parts in turn.  */
249 q = jamo_initial_short_name[index1];
252 q = jamo_medial_short_name[index2];
255 q = jamo_final_short_name[index3];
261 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
262 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
264 /* Special case for CJK compatibility ideographs. Keeps the tables
269 /* buf needs to have at least 28 + 5 bytes here. */
270 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
/* Append the code point in upper case hexadecimal, most significant digit
   first: 4 digits below 0x10000, 5 digits otherwise.
   NOTE(review): i has to be of a signed type for the i >= 0 test to end the
   loop; its declaration is not among the lines visible here.  */
273 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
275 unsigned int x = (c >> i) & 0xf;
/* Digits 0..9 map to '0'..'9', digits 10..15 to 'A'..'F'.  */
276 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
281 else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
283 /* Special case for variation selectors. Keeps the tables
286 /* buf needs to have at least 19 + 3 bytes here. */
/* U+FE00..U+FE0F are numbered 1..16, U+E0100..U+E01EF are numbered 17..256.
   NOTE(review): sprintf also stores a terminating NUL after the up to
   19 + 3 characters.  The argument matching %d is computed from c; if
   ucs4_t is an unsigned type it is passed as unsigned int -- TODO confirm.  */
287 sprintf (buf, "VARIATION SELECTOR-%d",
288 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
/* General case: map the code point to its internal index, then find the
   word sequence of its name.  words stays NULL if nothing is found.  */
293 uint16_t index = unicode_code_to_index (c);
294 const uint16_t *words = NULL;
296 if (index != UNINAME_INVALID_INDEX)
298 /* Binary search in unicode_index_to_name. */
300 unsigned int i2 = SIZEOF (unicode_index_to_name);
303 unsigned int i = (i1 + i2) >> 1;
304 if (unicode_index_to_name[i].index == index)
/* The name field is the position in unicode_names where the words of this
   character's name start.  */
306 words = &unicode_names[unicode_index_to_name[i].name];
309 else if (unicode_index_to_name[i].index < index)
316 /* Note here: i1 < i < i2. */
319 else if (unicode_index_to_name[i].index > index)
326 /* Note here: i1 <= i < i2. */
333 /* Found it in unicode_index_to_name. Now concatenate the words. */
334 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. */
/* Each element of unicode_names holds a word number in its upper bits and a
   flag in its lowest bit.  A cleared flag apparently marks the last word of
   the name: unicode_name_character encodes the last word as 2 * w and the
   others as 2 * w + 1.  */
338 unsigned int wordlen;
339 const char *word = unicode_name_word (*words>>1, &wordlen);
342 while (--wordlen > 0);
343 if ((*words & 1) == 0)
355 /* Looks up the Unicode character with a given name, in upper- or lowercase
356 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
/* NAME is first copied, converted to upper case, into a local buffer.  It is
   then split at spaces into words, each word is replaced by its number
   (unicode_name_word_lookup), and the resulting sequence is searched in
   unicode_name_to_index.  Names whose last component is algorithmic (Hangul
   syllables, CJK compatibility ideographs, variation selectors) are decoded
   by dedicated branches instead.
   NOTE(review): the variation selector branch tests
   UNICODE_CHARNAME_WORD_VARIATION, which the summary of the generated tables
   near the top of this file does not list -- confirm that the generated
   header defines it.  */
358 unicode_name_character (const char *name)
360 unsigned int len = strlen (name);
/* Names of a single character or longer than the longest known name are not
   examined at all.  */
361 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
363 /* Test for "word1 word2 ..." syntax. */
/* buf has room for the longest name but not for a terminating NUL; the end
   of the copied name is tracked by ptr.  */
364 char buf[UNICODE_CHARNAME_MAX_LENGTH];
/* Characters outside the printable ASCII range ' ' .. '~' are singled out
   here; presumably the name is then rejected -- TODO confirm.  */
369 if (!(c >= ' ' && c <= '~'))
/* Store the character, with lower case letters mapped to upper case.  */
371 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
379 /* Special case for variation selector aliases. Keeps the
381 const char *p1 = buf;
382 if (ptr >= buf + 3 && *p1++ == 'V')
391 if (*p1 >= '0' && *p1 <= '9')
396 if (c >= 1 && c <= 16)
397 return c - 1 + 0xFE00;
398 else if (c >= 17 && c <= 256)
399 return c - 17 + 0xE0100;
409 /* Convert the constituents to uint16_t words. */
/* At most UNICODE_CHARNAME_MAX_WORDS word numbers are collected; wordptr
   points past the last one stored so far.  */
410 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
411 uint16_t *wordptr = words;
413 const char *p1 = buf;
/* Advance p2 to the next space or to the end of the name, so that [p1, p2)
   delimits one word.  */
419 while (p2 < ptr && *p2 != ' ')
421 word = unicode_name_word_lookup (p1, p2 - p1);
/* The words array is full.  */
424 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
431 /* Special case for Hangul syllables. Keeps the tables small. */
/* Exactly the two words HANGUL and SYLLABLE have been collected so far.  */
432 if (wordptr == &words[2]
433 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
434 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
436 /* Split the last word [p1..ptr) into three parts:
447 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
448 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
449 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
450 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
455 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
456 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
461 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
462 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
463 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
464 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
465 || *p4 == 'S' || *p4 == 'T'))
469 unsigned int n1 = p2 - p1;
470 unsigned int n2 = p3 - p2;
471 unsigned int n3 = p4 - p3;
473 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
477 for (index1 = 0; index1 < 19; index1++)
478 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
479 && jamo_initial_short_name[index1][n1] == '\0')
483 for (index2 = 0; index2 < 21; index2++)
484 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
485 && jamo_medial_short_name[index2][n2] == '\0')
489 for (index3 = 0; index3 < 28; index3++)
490 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
491 && jamo_final_short_name[index3][n3] == '\0')
493 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
502 /* Special case for CJK compatibility ideographs. Keeps the
504 if (wordptr == &words[2]
505 && words[0] == UNICODE_CHARNAME_WORD_CJK
506 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
509 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
511 const char *p2 = p1 + 10;
519 if (*p2 >= '0' && *p2 <= '9')
521 else if (*p2 >= 'A' && *p2 <= 'F')
522 c += (*p2 - 'A' + 10);
528 if ((c >= 0xF900 && c <= 0xFA2D)
529 || (c >= 0xFA30 && c <= 0xFA6A)
530 || (c >= 0xFA70 && c <= 0xFAD9)
531 || (c >= 0x2F800 && c <= 0x2FA1D))
540 /* Special case for variation selectors. Keeps the
542 if (wordptr == &words[1]
543 && words[0] == UNICODE_CHARNAME_WORD_VARIATION
546 && memcmp (p1, "SELECTOR-", 9) == 0)
548 const char *p2 = p1 + 9;
556 if (*p2 >= '0' && *p2 <= '9')
561 if (c >= 1 && c <= 16)
562 return c - 1 + 0xFE00;
563 else if (c >= 17 && c <= 256)
564 return c - 17 + 0xE0100;
577 /* Multiply by 2, to simplify later comparisons. */
578 unsigned int words_length = wordptr - words;
/* Encode the sequence the way unicode_names stores it: the last word as
   2 * w and, apparently, every earlier word as 2 * w + 1 (compare the
   low-bit test in unicode_character_name).  */
580 int i = words_length - 1;
581 words[i] = 2 * words[i];
583 words[i] = 2 * words[i] + 1;
585 /* Binary search in unicode_name_to_index. */
588 unsigned int i2 = SIZEOF (unicode_name_to_index);
591 unsigned int i = (i1 + i2) >> 1;
/* w is the encoded input, p the encoded name of table entry i, and n the
   number of input words to compare.  */
592 const uint16_t *w = words;
593 const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
594 unsigned int n = words_length;
601 /* Note here: i1 < i < i2. */
609 /* Note here: i1 <= i < i2. */
/* Translate the internal index stored with entry i into a code point.  */
615 return unicode_index_to_code (unicode_name_to_index[i].index);
623 return UNINAME_INVALID;