glib/gunidecomp.c

   1 /* decomp.c - Character decomposition.
   2  *
   3  *  Copyright (C) 1999, 2000 Tom Tromey
   4  *  Copyright 2000 Red Hat, Inc.
   5  *
   6  * The Gnome Library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public License as
   8  * published by the Free Software Foundation; either version 2 of the
   9  * License, or (at your option) any later version.
  10  *
  11  * The Gnome Library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  18  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  *   Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stdlib.h>
  25
  26 #include "glib.h"
  27 #include "gunidecomp.h"
  28 #include "gunicomp.h"
  29 #include "gunicodeprivate.h"
  30
  31
/* Combining class of the character at offset Char within page Page,
 * looked up through combining_class_table_part1.  A page entry at or
 * above G_UNICODE_MAX_TABLE_INDEX encodes one value shared by the whole
 * page (the entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry is
 * a row index into cclass_data, which is then indexed by Char. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup as CC_PART1, but through combining_class_table_part2. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of Char.  Characters up to G_UNICODE_LAST_CHAR_PART1
 * use the part 1 table, characters from 0xe0000 up to
 * G_UNICODE_LAST_CHAR use the part 2 table (with pages counted from
 * 0xe0000), and everything else yields 0.  The page is Char >> 8 and
 * the offset within the page is Char & 0xff.  Note that Char is
 * evaluated several times, so it must not have side effects. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
  48
  49 /**
  50  * g_unichar_combining_class:
  51  * @uc: a Unicode character
  52  *
  53  * Determines the canonical combining class of a Unicode character.
  54  *
  55  * Return value: the combining class of the character
  56  *
  57  * Since: 2.14
  58  **/
  59 gint
  60 g_unichar_combining_class (gunichar uc)
  61 {
  62   return COMBINING_CLASS (uc);
  63 }
  64
/* constants for hangul syllable [de]composition */
/* SBase is subtracted from a character to get its syllable index;
 * indices in [0, SCount) are treated as Hangul syllables. */
#define SBase 0xAC00
/* Bases added to the L and V indices to form the decomposed
 * characters. */
#define LBase 0x1100
#define VBase 0x1161
/* Base for the T component.  A T index of 0 (i.e. TBase itself) means
 * the syllable has no third component. */
#define TBase 0x11A7
/* Number of distinct L, V and T index values. */
#define LCount 19
#define VCount 21
#define TCount 28
/* Syllables per L value, and total number of syllables. */
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
  75
  76 /**
  77  * g_unicode_canonical_ordering:
  78  * @string: a UCS-4 encoded string.
  79  * @len: the maximum length of @string to use.
  80  *
  81  * Computes the canonical ordering of a string in-place.
  82  * This rearranges decomposed characters in the string
  83  * according to their combining classes.  See the Unicode
  84  * manual for more information.
  85  **/
  86 void
  87 g_unicode_canonical_ordering (gunichar *string,
  88                               gsize     len)
  89 {
  90   gsize i;
  91   int swap = 1;
  92
  93   while (swap)
  94     {
  95       int last;
  96       swap = 0;
  97       last = COMBINING_CLASS (string[0]);
  98       for (i = 0; i < len - 1; ++i)
  99         {
 100           int next = COMBINING_CLASS (string[i + 1]);
 101           if (next != 0 && last > next)
 102             {
 103               gsize j;
 104               /* Percolate item leftward through string.  */
 105               for (j = i + 1; j > 0; --j)
 106                 {
 107                   gunichar t;
 108                   if (COMBINING_CLASS (string[j - 1]) <= next)
 109                     break;
 110                   t = string[j];
 111                   string[j] = string[j - 1];
 112                   string[j - 1] = t;
 113                   swap = 1;
 114                 }
 115               /* We're re-entering the loop looking at the old
 116                  character again.  */
 117               next = last;
 118             }
 119           last = next;
 120         }
 121     }
 122 }
 123
 124 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
 125  * r should be null or have sufficient space. Calling with r == NULL will
 126  * only calculate the result_len; however, a buffer with space for three
 127  * characters will always be big enough. */
 128 static void
 129 decompose_hangul (gunichar s,
 130                   gunichar *r,
 131                   gsize *result_len)
 132 {
 133   gint SIndex = s - SBase;
 134
 135   /* not a hangul syllable */
 136   if (SIndex < 0 || SIndex >= SCount)
 137     {
 138       if (r)
 139         r[0] = s;
 140       *result_len = 1;
 141     }
 142   else
 143     {
 144       gunichar L = LBase + SIndex / NCount;
 145       gunichar V = VBase + (SIndex % NCount) / TCount;
 146       gunichar T = TBase + SIndex % TCount;
 147
 148       if (r)
 149         {
 150           r[0] = L;
 151           r[1] = V;
 152         }
 153
 154       if (T != TBase)
 155         {
 156           if (r)
 157             r[2] = T;
 158           *result_len = 3;
 159         }
 160       else
 161         *result_len = 2;
 162     }
 163 }
 164
 165 /* returns a pointer to a null-terminated UTF-8 string */
 166 static const gchar *
 167 find_decomposition (gunichar ch,
 168                     gboolean compat)
 169 {
 170   int start = 0;
 171   int end = G_N_ELEMENTS (decomp_table);
 172
 173   if (ch >= decomp_table[start].ch &&
 174       ch <= decomp_table[end - 1].ch)
 175     {
 176       while (TRUE)
 177         {
 178           int half = (start + end) / 2;
 179           if (ch == decomp_table[half].ch)
 180             {
 181               int offset;
 182
 183               if (compat)
 184                 {
 185                   offset = decomp_table[half].compat_offset;
 186                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 187                     offset = decomp_table[half].canon_offset;
 188                 }
 189               else
 190                 {
 191                   offset = decomp_table[half].canon_offset;
 192                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 193                     return NULL;
 194                 }
 195
 196               return &(decomp_expansion_string[offset]);
 197             }
 198           else if (half == start)
 199             break;
 200           else if (ch > decomp_table[half].ch)
 201             start = half;
 202           else
 203             end = half;
 204         }
 205     }
 206
 207   return NULL;
 208 }
 209
 210 /**
 211  * g_unicode_canonical_decomposition:
 212  * @ch: a Unicode character.
 213  * @result_len: location to store the length of the return value.
 214  *
 215  * Computes the canonical decomposition of a Unicode character.
 216  *
 217  * Return value: a newly allocated string of Unicode characters.
 218  *   @result_len is set to the resulting length of the string.
 219  **/
 220 gunichar *
 221 g_unicode_canonical_decomposition (gunichar ch,
 222                                    gsize   *result_len)
 223 {
 224   const gchar *decomp;
 225   const gchar *p;
 226   gunichar *r;
 227
 228   /* Hangul syllable */
 229   if (ch >= 0xac00 && ch <= 0xd7a3)
 230     {
 231       decompose_hangul (ch, NULL, result_len);
 232       r = g_malloc (*result_len * sizeof (gunichar));
 233       decompose_hangul (ch, r, result_len);
 234     }
 235   else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
 236     {
 237       /* Found it.  */
 238       int i;
 239
 240       *result_len = g_utf8_strlen (decomp, -1);
 241       r = g_malloc (*result_len * sizeof (gunichar));
 242
 243       for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
 244         r[i] = g_utf8_get_char (p);
 245     }
 246   else
 247     {
 248       /* Not in our table.  */
 249       r = g_malloc (sizeof (gunichar));
 250       *r = ch;
 251       *result_len = 1;
 252     }
 253
 254   /* Supposedly following the Unicode 2.1.9 table means that the
 255      decompositions come out in canonical order.  I haven't tested
 256      this, but we rely on it here.  */
 257   return r;
 258 }
 259
 260 /* L,V => LV and LV,T => LVT  */
 261 static gboolean
 262 combine_hangul (gunichar a,
 263                 gunichar b,
 264                 gunichar *result)
 265 {
 266   gint LIndex = a - LBase;
 267   gint SIndex = a - SBase;
 268
 269   gint VIndex = b - VBase;
 270   gint TIndex = b - TBase;
 271
 272   if (0 <= LIndex && LIndex < LCount
 273       && 0 <= VIndex && VIndex < VCount)
 274     {
 275       *result = SBase + (LIndex * VCount + VIndex) * TCount;
 276       return TRUE;
 277     }
 278   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
 279            && 0 < TIndex && TIndex < TCount)
 280     {
 281       *result = a + TIndex;
 282       return TRUE;
 283     }
 284
 285   return FALSE;
 286 }
 287
 288 #define CI(Page, Char) \
 289   ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 290    ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 291    : (compose_data[compose_table[Page]][Char]))
 292
 293 #define COMPOSE_INDEX(Char) \
 294      (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 295
 296 static gboolean
 297 combine (gunichar  a,
 298          gunichar  b,
 299          gunichar *result)
 300 {
 301   gushort index_a, index_b;
 302
 303   if (combine_hangul (a, b, result))
 304     return TRUE;
 305
 306   index_a = COMPOSE_INDEX(a);
 307
 308   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 309     {
 310       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 311         {
 312           *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 313           return TRUE;
 314         }
 315       else
 316         return FALSE;
 317     }
 318
 319   index_b = COMPOSE_INDEX(b);
 320
 321   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 322     {
 323       if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 324         {
 325           *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 326           return TRUE;
 327         }
 328       else
 329         return FALSE;
 330     }
 331
 332   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
 333       index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
 334     {
 335       gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
 336
 337       if (res)
 338         {
 339           *result = res;
 340           return TRUE;
 341         }
 342     }
 343
 344   return FALSE;
 345 }
 346
 347 gunichar *
 348 _g_utf8_normalize_wc (const gchar    *str,
 349                       gssize          max_len,
 350                       GNormalizeMode  mode)
 351 {
 352   gsize n_wc;
 353   gunichar *wc_buffer;
 354   const char *p;
 355   gsize last_start;
 356   gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
 357                         mode == G_NORMALIZE_NFKD);
 358   gboolean do_compose = (mode == G_NORMALIZE_NFC ||
 359                          mode == G_NORMALIZE_NFKC);
 360
 361   n_wc = 0;
 362   p = str;
 363   while ((max_len < 0 || p < str + max_len) && *p)
 364     {
 365       const gchar *decomp;
 366       gunichar wc = g_utf8_get_char (p);
 367
 368       if (wc >= 0xac00 && wc <= 0xd7a3)
 369         {
 370           gsize result_len;
 371           decompose_hangul (wc, NULL, &result_len);
 372           n_wc += result_len;
 373         }
 374       else
 375         {
 376           decomp = find_decomposition (wc, do_compat);
 377
 378           if (decomp)
 379             n_wc += g_utf8_strlen (decomp, -1);
 380           else
 381             n_wc++;
 382         }
 383
 384       p = g_utf8_next_char (p);
 385     }
 386
 387   wc_buffer = g_new (gunichar, n_wc + 1);
 388
 389   last_start = 0;
 390   n_wc = 0;
 391   p = str;
 392   while ((max_len < 0 || p < str + max_len) && *p)
 393     {
 394       gunichar wc = g_utf8_get_char (p);
 395       const gchar *decomp;
 396       int cc;
 397       gsize old_n_wc = n_wc;
 398
 399       if (wc >= 0xac00 && wc <= 0xd7a3)
 400         {
 401           gsize result_len;
 402           decompose_hangul (wc, wc_buffer + n_wc, &result_len);
 403           n_wc += result_len;
 404         }
 405       else
 406         {
 407           decomp = find_decomposition (wc, do_compat);
 408
 409           if (decomp)
 410             {
 411               const char *pd;
 412               for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
 413                 wc_buffer[n_wc++] = g_utf8_get_char (pd);
 414             }
 415           else
 416             wc_buffer[n_wc++] = wc;
 417         }
 418
 419       if (n_wc > 0)
 420         {
 421           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 422
 423           if (cc == 0)
 424             {
 425               g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
 426               last_start = old_n_wc;
 427             }
 428         }
 429
 430       p = g_utf8_next_char (p);
 431     }
 432
 433   if (n_wc > 0)
 434     {
 435       g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
 436       last_start = n_wc;
 437     }
 438
 439   wc_buffer[n_wc] = 0;
 440
 441   /* All decomposed and reordered */
 442
 443   if (do_compose && n_wc > 0)
 444     {
 445       gsize i, j;
 446       int last_cc = 0;
 447       last_start = 0;
 448
 449       for (i = 0; i < n_wc; i++)
 450         {
 451           int cc = COMBINING_CLASS (wc_buffer[i]);
 452
 453           if (i > 0 &&
 454               (last_cc == 0 || last_cc < cc) &&
 455               combine (wc_buffer[last_start], wc_buffer[i],
 456                        &wc_buffer[last_start]))
 457             {
 458               for (j = i + 1; j < n_wc; j++)
 459                 wc_buffer[j-1] = wc_buffer[j];
 460               n_wc--;
 461               i--;
 462
 463               if (i == last_start)
 464                 last_cc = 0;
 465               else
 466                 last_cc = COMBINING_CLASS (wc_buffer[i-1]);
 467
 468               continue;
 469             }
 470
 471           if (cc == 0)
 472             last_start = i;
 473
 474           last_cc = cc;
 475         }
 476     }
 477
 478   wc_buffer[n_wc] = 0;
 479
 480   return wc_buffer;
 481 }
 482
 483 /**
 484  * g_utf8_normalize:
 485  * @str: a UTF-8 encoded string.
 486  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 487  * @mode: the type of normalization to perform.
 488  *
 489  * Converts a string into canonical form, standardizing
 490  * such issues as whether a character with an accent
 491  * is represented as a base character and combining
 492  * accent or as a single precomposed character. The
 493  * string has to be valid UTF-8, otherwise %NULL is
 494  * returned. You should generally call g_utf8_normalize()
 495  * before comparing two Unicode strings.
 496  *
 497  * The normalization mode %G_NORMALIZE_DEFAULT only
 498  * standardizes differences that do not affect the
 499  * text content, such as the above-mentioned accent
 500  * representation. %G_NORMALIZE_ALL also standardizes
 501  * the "compatibility" characters in Unicode, such
 502  * as SUPERSCRIPT THREE to the standard forms
 503  * (in this case DIGIT THREE). Formatting information
 504  * may be lost but for most text operations such
 505  * characters should be considered the same.
 506  *
 507  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 508  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 509  * but returned a result with composed forms rather
 510  * than a maximally decomposed form. This is often
 511  * useful if you intend to convert the string to
 512  * a legacy encoding or pass it to a system with
 513  * less capable Unicode handling.
 514  *
 515  * Return value: a newly allocated string, that is the
 516  *   normalized form of @str, or %NULL if @str is not
 517  *   valid UTF-8.
 518  **/
 519 gchar *
 520 g_utf8_normalize (const gchar    *str,
 521                   gssize          len,
 522                   GNormalizeMode  mode)
 523 {
 524   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
 525   gchar *result;
 526
 527   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
 528   g_free (result_wc);
 529
 530   return result;
 531 }