gettext-tools/gnulib-lib/uniname/uniname.c

   1 /* Association between Unicode characters and their names.
   2    Copyright (C) 2000-2002, 2005-2007, 2009-2015 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify it
   5    under the terms of the GNU General Public License as published
   6    by the Free Software Foundation; either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 #include <config.h>
  18
  19 /* Specification.  */
  20 #include "uniname.h"
  21
  22 #include <assert.h>
  23 #include <stdbool.h>
  24 #include <stdint.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  29
  30
  31 /* Table of Unicode character names, derived from UnicodeData.txt.
  32    This table is generated in a way to minimize the memory footprint:
  33      1. its compiled size is small (less than 350 KB),
  34      2. it resides entirely in the text or read-only data segment of the
  35         executable or shared library: the table contains only immediate
  36         integers, no pointers, and the functions don't do heap allocation.
  37  */
  38 #include "uninames.h"
  39 /* It contains:
  40   static const char unicode_name_words[36303] = ...;
  41   #define UNICODE_CHARNAME_NUM_WORDS 6260
  42   static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
  43   #define UNICODE_CHARNAME_WORD_HANGUL 3902
  44   #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
  45   #define UNICODE_CHARNAME_WORD_CJK 417
  46   #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
  47   static const uint16_t unicode_names[68940] = ...;
  48   static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
  49   static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
  50   #define UNICODE_CHARNAME_MAX_LENGTH 83
  51   #define UNICODE_CHARNAME_MAX_WORDS 13
  52   static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
  53 */
  54
  55 /* Returns the word with a given index.  */
  56 static const char *
  57 unicode_name_word (unsigned int index, unsigned int *lengthp)
  58 {
  59   unsigned int i1;
  60   unsigned int i2;
  61   unsigned int i;
  62
  63   assert (index < UNICODE_CHARNAME_NUM_WORDS);
  64
  65   /* Binary search for i with
  66        unicode_name_by_length[i].ind_offset <= index
  67      and
  68        index < unicode_name_by_length[i+1].ind_offset
  69    */
  70
  71   i1 = 0;
  72   i2 = SIZEOF (unicode_name_by_length) - 1;
  73   while (i2 - i1 > 1)
  74     {
  75       unsigned int i = (i1 + i2) >> 1;
  76       if (unicode_name_by_length[i].ind_offset <= index)
  77         i1 = i;
  78       else
  79         i2 = i;
  80     }
  81   i = i1;
  82   assert (unicode_name_by_length[i].ind_offset <= index
  83           && index < unicode_name_by_length[i+1].ind_offset);
  84   *lengthp = i;
  85   return &unicode_name_words[unicode_name_by_length[i].extra_offset
  86                              + (index-unicode_name_by_length[i].ind_offset)*i];
  87 }
  88
  89 /* Looks up the index of a word.  */
  90 static int
  91 unicode_name_word_lookup (const char *word, unsigned int length)
  92 {
  93   if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
  94     {
  95       /* Binary search among the words of given length.  */
  96       unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
  97       unsigned int i0 = unicode_name_by_length[length].ind_offset;
  98       unsigned int i1 = i0;
  99       unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
 100       while (i2 - i1 > 0)
 101         {
 102           unsigned int i = (i1 + i2) >> 1;
 103           const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
 104           const char *w = word;
 105           unsigned int n = length;
 106           for (;;)
 107             {
 108               if (*p < *w)
 109                 {
 110                   if (i1 == i)
 111                     return -1;
 112                   /* Note here: i1 < i < i2.  */
 113                   i1 = i;
 114                   break;
 115                 }
 116               if (*p > *w)
 117                 {
 118                   /* Note here: i1 <= i < i2.  */
 119                   i2 = i;
 120                   break;
 121                 }
 122               p++; w++; n--;
 123               if (n == 0)
 124                 return i;
 125             }
 126         }
 127     }
 128   return -1;
 129 }
 130
 131 #define UNINAME_INVALID_INDEX UINT16_MAX
 132
 133 /* Looks up the internal index of a Unicode character.  */
 134 static uint16_t
 135 unicode_code_to_index (ucs4_t c)
 136 {
 137   /* Binary search in unicode_ranges.  */
 138   unsigned int i1 = 0;
 139   unsigned int i2 = SIZEOF (unicode_ranges);
 140
 141   for (;;)
 142     {
 143       unsigned int i = (i1 + i2) >> 1;
 144       ucs4_t start_code =
 145         unicode_ranges[i].index + unicode_ranges[i].gap;
 146       ucs4_t end_code =
 147         start_code + unicode_ranges[i].length - 1;
 148
 149       if (start_code <= c && c <= end_code)
 150         return c - unicode_ranges[i].gap;
 151
 152       if (end_code < c)
 153         {
 154           if (i1 == i)
 155             break;
 156           /* Note here: i1 < i < i2.  */
 157           i1 = i;
 158         }
 159       else if (c < start_code)
 160         {
 161           if (i2 == i)
 162             break;
 163           /* Note here: i1 <= i < i2.  */
 164           i2 = i;
 165         }
 166     }
 167   return UNINAME_INVALID_INDEX;
 168 }
 169
 170 /* Looks up the codepoint of a Unicode character, from the given
 171    internal index.  */
 172 static ucs4_t
 173 unicode_index_to_code (uint16_t index)
 174 {
 175   /* Binary search in unicode_ranges.  */
 176   unsigned int i1 = 0;
 177   unsigned int i2 = SIZEOF (unicode_ranges);
 178
 179   for (;;)
 180     {
 181       unsigned int i = (i1 + i2) >> 1;
 182       uint16_t start_index = unicode_ranges[i].index;
 183       uint16_t end_index = start_index + unicode_ranges[i].length - 1;
 184
 185       if (start_index <= index && index <= end_index)
 186         return index + unicode_ranges[i].gap;
 187
 188       if (end_index < index)
 189         {
 190           if (i1 == i)
 191             break;
 192           /* Note here: i1 < i < i2.  */
 193           i1 = i;
 194         }
 195       else if (index < start_index)
 196         {
 197           if (i2 == i)
 198             break;
 199           /* Note here: i1 <= i < i2.  */
 200           i2 = i;
 201         }
 202     }
 203   return UNINAME_INVALID;
 204 }
 205
 206
 207 /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
 208    sections 3.11 and 4.4.  */
 209 static const char jamo_initial_short_name[19][3] =
 210 {
 211   "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ",
 212   "C", "K", "T", "P", "H"
 213 };
 214 static const char jamo_medial_short_name[21][4] =
 215 {
 216   "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO",
 217   "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I"
 218 };
 219 static const char jamo_final_short_name[28][3] =
 220 {
 221   "", "G", "GG", "GS", "N", "NI", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT",
 222   "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
 223 };
 224
 225 /* Looks up the name of a Unicode character, in uppercase ASCII.
 226    Returns the filled buf, or NULL if the character does not have a name.  */
 227 char *
 228 unicode_character_name (ucs4_t c, char *buf)
 229 {
 230   if (c >= 0xAC00 && c <= 0xD7A3)
 231     {
 232       /* Special case for Hangul syllables. Keeps the tables small.  */
 233       char *ptr;
 234       unsigned int tmp;
 235       unsigned int index1;
 236       unsigned int index2;
 237       unsigned int index3;
 238       const char *q;
 239
 240       /* buf needs to have at least 16 + 7 bytes here.  */
 241       memcpy (buf, "HANGUL SYLLABLE ", 16);
 242       ptr = buf + 16;
 243
 244       tmp = c - 0xAC00;
 245       index3 = tmp % 28; tmp = tmp / 28;
 246       index2 = tmp % 21; tmp = tmp / 21;
 247       index1 = tmp;
 248
 249       q = jamo_initial_short_name[index1];
 250       while (*q != '\0')
 251         *ptr++ = *q++;
 252       q = jamo_medial_short_name[index2];
 253       while (*q != '\0')
 254         *ptr++ = *q++;
 255       q = jamo_final_short_name[index3];
 256       while (*q != '\0')
 257         *ptr++ = *q++;
 258       *ptr = '\0';
 259       return buf;
 260     }
 261   else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
 262            || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
 263     {
 264       /* Special case for CJK compatibility ideographs. Keeps the tables
 265          small.  */
 266       char *ptr;
 267       int i;
 268
 269       /* buf needs to have at least 28 + 5 bytes here.  */
 270       memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
 271       ptr = buf + 28;
 272
 273       for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
 274         {
 275           unsigned int x = (c >> i) & 0xf;
 276           *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
 277         }
 278       *ptr = '\0';
 279       return buf;
 280     }
 281   else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
 282     {
 283       /* Special case for variation selectors. Keeps the tables
 284          small.  */
 285
 286       /* buf needs to have at least 19 + 3 bytes here.  */
 287       sprintf (buf, "VARIATION SELECTOR-%d",
 288                c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
 289       return buf;
 290     }
 291   else
 292     {
 293       uint16_t index = unicode_code_to_index (c);
 294       const uint16_t *words = NULL;
 295
 296       if (index != UNINAME_INVALID_INDEX)
 297         {
 298           /* Binary search in unicode_code_to_name.  */
 299           unsigned int i1 = 0;
 300           unsigned int i2 = SIZEOF (unicode_index_to_name);
 301           for (;;)
 302             {
 303               unsigned int i = (i1 + i2) >> 1;
 304               if (unicode_index_to_name[i].index == index)
 305                 {
 306                   words = &unicode_names[unicode_index_to_name[i].name];
 307                   break;
 308                 }
 309               else if (unicode_index_to_name[i].index < index)
 310                 {
 311                   if (i1 == i)
 312                     {
 313                       words = NULL;
 314                       break;
 315                     }
 316                   /* Note here: i1 < i < i2.  */
 317                   i1 = i;
 318                 }
 319               else if (unicode_index_to_name[i].index > index)
 320                 {
 321                   if (i2 == i)
 322                     {
 323                       words = NULL;
 324                       break;
 325                     }
 326                   /* Note here: i1 <= i < i2.  */
 327                   i2 = i;
 328                 }
 329             }
 330         }
 331       if (words != NULL)
 332         {
 333           /* Found it in unicode_index_to_name. Now concatenate the words.  */
 334           /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */
 335           char *ptr = buf;
 336           for (;;)
 337             {
 338               unsigned int wordlen;
 339               const char *word = unicode_name_word (*words>>1, &wordlen);
 340               do
 341                 *ptr++ = *word++;
 342               while (--wordlen > 0);
 343               if ((*words & 1) == 0)
 344                 break;
 345               *ptr++ = ' ';
 346               words++;
 347             }
 348           *ptr = '\0';
 349           return buf;
 350         }
 351       return NULL;
 352     }
 353 }
 354
 355 /* Looks up the Unicode character with a given name, in upper- or lowercase
 356    ASCII.  Returns the character if found, or UNINAME_INVALID if not found.  */
 357 ucs4_t
 358 unicode_name_character (const char *name)
 359 {
 360   unsigned int len = strlen (name);
 361   if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
 362     {
 363       /* Test for "word1 word2 ..." syntax.  */
 364       char buf[UNICODE_CHARNAME_MAX_LENGTH];
 365       char *ptr = buf;
 366       for (;;)
 367         {
 368           char c = *name++;
 369           if (!(c >= ' ' && c <= '~'))
 370             break;
 371           *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
 372           if (--len == 0)
 373             goto filled_buf;
 374         }
 375       if (false)
 376       filled_buf:
 377         {
 378           {
 379             /* Special case for variation selector aliases. Keeps the
 380                tables small.  */
 381             const char *p1 = buf;
 382             if (ptr >= buf + 3 && *p1++ == 'V')
 383               {
 384                 if (*p1++ == 'S')
 385                   {
 386                     if (*p1 != '0')
 387                       {
 388                         unsigned int c = 0;
 389                         for (;;)
 390                           {
 391                             if (*p1 >= '0' && *p1 <= '9')
 392                               c += (*p1 - '0');
 393                             p1++;
 394                             if (p1 == ptr)
 395                               {
 396                                 if (c >= 1 && c <= 16)
 397                                   return c - 1 + 0xFE00;
 398                                 else if (c >= 17 && c <= 256)
 399                                   return c - 17 + 0xE0100;
 400                                 else
 401                                   break;
 402                               }
 403                             c = c * 10;
 404                           }
 405                       }
 406                   }
 407               }
 408           }
 409           /* Convert the constituents to uint16_t words.  */
 410           uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
 411           uint16_t *wordptr = words;
 412           {
 413             const char *p1 = buf;
 414             for (;;)
 415               {
 416                 {
 417                   int word;
 418                   const char *p2 = p1;
 419                   while (p2 < ptr && *p2 != ' ')
 420                     p2++;
 421                   word = unicode_name_word_lookup (p1, p2 - p1);
 422                   if (word < 0)
 423                     break;
 424                   if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
 425                     break;
 426                   *wordptr++ = word;
 427                   if (p2 == ptr)
 428                     goto filled_words;
 429                   p1 = p2 + 1;
 430                 }
 431                 /* Special case for Hangul syllables. Keeps the tables small. */
 432                 if (wordptr == &words[2]
 433                     && words[0] == UNICODE_CHARNAME_WORD_HANGUL
 434                     && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
 435                   {
 436                     /* Split the last word [p1..ptr) into three parts:
 437                          1) [BCDGHJKMNPRST]
 438                          2) [AEIOUWY]
 439                          3) [BCDGHIJKLMNPST]
 440                      */
 441                     const char *p2;
 442                     const char *p3;
 443                     const char *p4;
 444
 445                     p2 = p1;
 446                     while (p2 < ptr
 447                            && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
 448                                || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
 449                                || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
 450                                || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
 451                                || *p2 == 'T'))
 452                       p2++;
 453                     p3 = p2;
 454                     while (p3 < ptr
 455                            && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
 456                                || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
 457                                || *p3 == 'Y'))
 458                       p3++;
 459                     p4 = p3;
 460                     while (p4 < ptr
 461                            && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
 462                                || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
 463                                || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
 464                                || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
 465                                || *p4 == 'S' || *p4 == 'T'))
 466                       p4++;
 467                     if (p4 == ptr)
 468                       {
 469                         unsigned int n1 = p2 - p1;
 470                         unsigned int n2 = p3 - p2;
 471                         unsigned int n3 = p4 - p3;
 472
 473                         if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
 474                           {
 475                             unsigned int index1;
 476
 477                             for (index1 = 0; index1 < 19; index1++)
 478                               if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
 479                                   && jamo_initial_short_name[index1][n1] == '\0')
 480                                 {
 481                                   unsigned int index2;
 482
 483                                   for (index2 = 0; index2 < 21; index2++)
 484                                     if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
 485                                         && jamo_medial_short_name[index2][n2] == '\0')
 486                                       {
 487                                         unsigned int index3;
 488
 489                                         for (index3 = 0; index3 < 28; index3++)
 490                                           if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
 491                                               && jamo_final_short_name[index3][n3] == '\0')
 492                                             {
 493                                               return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
 494                                             }
 495                                         break;
 496                                       }
 497                                   break;
 498                                 }
 499                           }
 500                       }
 501                   }
 502                 /* Special case for CJK compatibility ideographs. Keeps the
 503                    tables small.  */
 504                 if (wordptr == &words[2]
 505                     && words[0] == UNICODE_CHARNAME_WORD_CJK
 506                     && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
 507                     && p1 + 14 <= ptr
 508                     && p1 + 15 >= ptr
 509                     && memcmp (p1, "IDEOGRAPH-", 10) == 0)
 510                   {
 511                     const char *p2 = p1 + 10;
 512
 513                     if (*p2 != '0')
 514                       {
 515                         unsigned int c = 0;
 516
 517                         for (;;)
 518                           {
 519                             if (*p2 >= '0' && *p2 <= '9')
 520                               c += (*p2 - '0');
 521                             else if (*p2 >= 'A' && *p2 <= 'F')
 522                               c += (*p2 - 'A' + 10);
 523                             else
 524                               break;
 525                             p2++;
 526                             if (p2 == ptr)
 527                               {
 528                                 if ((c >= 0xF900 && c <= 0xFA2D)
 529                                     || (c >= 0xFA30 && c <= 0xFA6A)
 530                                     || (c >= 0xFA70 && c <= 0xFAD9)
 531                                     || (c >= 0x2F800 && c <= 0x2FA1D))
 532                                   return c;
 533                                 else
 534                                   break;
 535                               }
 536                             c = c << 4;
 537                           }
 538                       }
 539                   }
 540                 /* Special case for variation selectors. Keeps the
 541                    tables small.  */
 542                 if (wordptr == &words[1]
 543                     && words[0] == UNICODE_CHARNAME_WORD_VARIATION
 544                     && p1 + 10 <= ptr
 545                     && p1 + 12 >= ptr
 546                     && memcmp (p1, "SELECTOR-", 9) == 0)
 547                   {
 548                     const char *p2 = p1 + 9;
 549
 550                     if (*p2 != '0')
 551                       {
 552                         unsigned int c = 0;
 553
 554                         for (;;)
 555                           {
 556                             if (*p2 >= '0' && *p2 <= '9')
 557                               c += (*p2 - '0');
 558                             p2++;
 559                             if (p2 == ptr)
 560                               {
 561                                 if (c >= 1 && c <= 16)
 562                                   return c - 1 + 0xFE00;
 563                                 else if (c >= 17 && c <= 256)
 564                                   return c - 17 + 0xE0100;
 565                                 else
 566                                   break;
 567                               }
 568                             c = c * 10;
 569                           }
 570                       }
 571                   }
 572               }
 573           }
 574           if (false)
 575           filled_words:
 576             {
 577               /* Multiply by 2, to simplify later comparisons.  */
 578               unsigned int words_length = wordptr - words;
 579               {
 580                 int i = words_length - 1;
 581                 words[i] = 2 * words[i];
 582                 for (; --i >= 0; )
 583                   words[i] = 2 * words[i] + 1;
 584               }
 585               /* Binary search in unicode_name_to_index.  */
 586               {
 587                 unsigned int i1 = 0;
 588                 unsigned int i2 = SIZEOF (unicode_name_to_index);
 589                 for (;;)
 590                   {
 591                     unsigned int i = (i1 + i2) >> 1;
 592                     const uint16_t *w = words;
 593                     const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
 594                     unsigned int n = words_length;
 595                     for (;;)
 596                       {
 597                         if (*p < *w)
 598                           {
 599                             if (i1 == i)
 600                               goto name_not_found;
 601                             /* Note here: i1 < i < i2.  */
 602                             i1 = i;
 603                             break;
 604                           }
 605                         else if (*p > *w)
 606                           {
 607                             if (i2 == i)
 608                               goto name_not_found;
 609                             /* Note here: i1 <= i < i2.  */
 610                             i2 = i;
 611                             break;
 612                           }
 613                         p++; w++; n--;
 614                         if (n == 0)
 615                           return unicode_index_to_code (unicode_name_to_index[i].index);
 616                       }
 617                   }
 618               }
 619             name_not_found: ;
 620             }
 621         }
 622     }
 623   return UNINAME_INVALID;
 624 }