libidn/nfkc.c

   1 /* nfkc.c       Unicode normalization utilities.
   2  * Copyright (C) 2002, 2003  Simon Josefsson
   3  *
   4  * This file is part of GNU Libidn.
   5  *
   6  * GNU Libidn is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * GNU Libidn is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with GNU Libidn; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #if HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <stdlib.h>
  25 #include <string.h>
  26
  27 #include "stringprep.h"
  28
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under the LGPL and with copyright held by:
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 */
  35
  36 /* Hacks to make syncing with GLIB code easier. */
  37 #define gboolean int
  38 #define gchar char
  39 #define guchar unsigned char
  40 #define glong long
  41 #define gint int
  42 #define guint unsigned int
  43 #define gushort unsigned short
  44 #define gint16 int16_t
  45 #define guint16 uint16_t
  46 #define gunichar uint32_t
  47 #define gsize size_t
  48 #define gssize ssize_t
  49 #define g_malloc malloc
  50 #define g_free free
  51 #define GError void
  52 #define g_set_error(a,b,c,d) ((void) 0)
  53 #define g_new(struct_type, n_structs)                                   \
  54   ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
  55 #  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
  56 #    define G_STMT_START        (void)(
  57 #    define G_STMT_END          )
  58 #  else
  59 #    if (defined (sun) || defined (__sun__))
  60 #      define G_STMT_START      if (1)
  61 #      define G_STMT_END        else (void)0
  62 #    else
  63 #      define G_STMT_START      do
  64 #      define G_STMT_END        while (0)
  65 #    endif
  66 #  endif
  67 #define g_return_val_if_fail(expr,val)          G_STMT_START{ (void)0; }G_STMT_END
  68 #define G_N_ELEMENTS(arr)               (sizeof (arr) / sizeof ((arr)[0]))
  69 #define TRUE 1
  70 #define FALSE 0
  71
  72 /* Code from GLIB gunicode.h starts here. */
  73
  74 typedef enum
  75 {
  76   G_NORMALIZE_DEFAULT,
  77   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  78   G_NORMALIZE_DEFAULT_COMPOSE,
  79   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  80   G_NORMALIZE_ALL,
  81   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  82   G_NORMALIZE_ALL_COMPOSE,
  83   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  84 }
  85 GNormalizeMode;
  86
  87 /* Code from GLIB gutf8.c starts here. */
  88
  89 #define UTF8_COMPUTE(Char, Mask, Len)           \
  90   if (Char < 128)                               \
  91     {                                           \
  92       Len = 1;                                  \
  93       Mask = 0x7f;                              \
  94     }                                           \
  95   else if ((Char & 0xe0) == 0xc0)               \
  96     {                                           \
  97       Len = 2;                                  \
  98       Mask = 0x1f;                              \
  99     }                                           \
 100   else if ((Char & 0xf0) == 0xe0)               \
 101     {                                           \
 102       Len = 3;                                  \
 103       Mask = 0x0f;                              \
 104     }                                           \
 105   else if ((Char & 0xf8) == 0xf0)               \
 106     {                                           \
 107       Len = 4;                                  \
 108       Mask = 0x07;                              \
 109     }                                           \
 110   else if ((Char & 0xfc) == 0xf8)               \
 111     {                                           \
 112       Len = 5;                                  \
 113       Mask = 0x03;                              \
 114     }                                           \
 115   else if ((Char & 0xfe) == 0xfc)               \
 116     {                                           \
 117       Len = 6;                                  \
 118       Mask = 0x01;                              \
 119     }                                           \
 120   else                                          \
 121     Len = -1;
 122
 123 #define UTF8_LENGTH(Char)                       \
 124   ((Char) < 0x80 ? 1 :                          \
 125    ((Char) < 0x800 ? 2 :                        \
 126     ((Char) < 0x10000 ? 3 :                     \
 127      ((Char) < 0x200000 ? 4 :                   \
 128       ((Char) < 0x4000000 ? 5 : 6)))))
 129
 130
 131 #define UTF8_GET(Result, Chars, Count, Mask, Len)       \
 132   (Result) = (Chars)[0] & (Mask);                       \
 133   for ((Count) = 1; (Count) < (Len); ++(Count))         \
 134     {                                                   \
 135       if (((Chars)[(Count)] & 0xc0) != 0x80)            \
 136         {                                               \
 137           (Result) = -1;                                \
 138           break;                                        \
 139         }                                               \
 140       (Result) <<= 6;                                   \
 141       (Result) |= ((Chars)[(Count)] & 0x3f);            \
 142     }
 143
 144 #define UNICODE_VALID(Char)                     \
 145   ((Char) < 0x110000 &&                         \
 146    (((Char) & 0xFFFFF800) != 0xD800) &&         \
 147    ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&      \
 148    ((Char) & 0xFFFE) != 0xFFFE)
 149
 150
 151 static const gchar utf8_skip_data[256] = {
 152   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 153   1, 1, 1, 1, 1, 1, 1,
 154   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 155   1, 1, 1, 1, 1, 1, 1,
 156   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 157   1, 1, 1, 1, 1, 1, 1,
 158   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 159   1, 1, 1, 1, 1, 1, 1,
 160   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 161   1, 1, 1, 1, 1, 1, 1,
 162   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 163   1, 1, 1, 1, 1, 1, 1,
 164   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 165   2, 2, 2, 2, 2, 2, 2,
 166   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
 167   5, 5, 5, 6, 6, 1, 1
 168 };
 169
 170 const gchar *const g_utf8_skip = utf8_skip_data;
 171
 172 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
 173
 174 /*
 175  * g_utf8_strlen:
 176  * @p: pointer to the start of a UTF-8 encoded string.
 177  * @max: the maximum number of bytes to examine. If @max
 178  *       is less than 0, then the string is assumed to be
 179  *       nul-terminated. If @max is 0, @p will not be examined and
 180  *       may be %NULL.
 181  *
 182  * Returns the length of the string in characters.
 183  *
 184  * Return value: the length of the string in characters
 185  **/
 186 static glong
 187 g_utf8_strlen (const gchar * p, gssize max)
 188 {
 189   glong len = 0;
 190   const gchar *start = p;
 191   g_return_val_if_fail (p != NULL || max == 0, 0);
 192
 193   if (max < 0)
 194     {
 195       while (*p)
 196         {
 197           p = g_utf8_next_char (p);
 198           ++len;
 199         }
 200     }
 201   else
 202     {
 203       if (max == 0 || !*p)
 204         return 0;
 205
 206       p = g_utf8_next_char (p);
 207
 208       while (p - start < max && *p)
 209         {
 210           ++len;
 211           p = g_utf8_next_char (p);
 212         }
 213
 214       /* only do the last len increment if we got a complete
 215        * char (don't count partial chars)
 216        */
 217       if (p - start == max)
 218         ++len;
 219     }
 220
 221   return len;
 222 }
 223
 224 /*
 225  * g_utf8_get_char:
 226  * @p: a pointer to Unicode character encoded as UTF-8
 227  *
 228  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 229  * If @p does not point to a valid UTF-8 encoded character, results are
 230  * undefined. If you are not sure that the bytes are complete
 231  * valid Unicode characters, you should use g_utf8_get_char_validated()
 232  * instead.
 233  *
 234  * Return value: the resulting character
 235  **/
 236 static gunichar
 237 g_utf8_get_char (const gchar * p)
 238 {
 239   int i, mask = 0, len;
 240   gunichar result;
 241   unsigned char c = (unsigned char) *p;
 242
 243   UTF8_COMPUTE (c, mask, len);
 244   if (len == -1)
 245     return (gunichar) - 1;
 246   UTF8_GET (result, p, i, mask, len);
 247
 248   return result;
 249 }
 250
 251 /*
 252  * g_unichar_to_utf8:
 253  * @c: a ISO10646 character code
 254  * @outbuf: output buffer, must have at least 6 bytes of space.
 255  *       If %NULL, the length will be computed and returned
 256  *       and nothing will be written to @outbuf.
 257  *
 258  * Converts a single character to UTF-8.
 259  *
 260  * Return value: number of bytes written
 261  **/
 262 static int
 263 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
 264 {
 265   guint len = 0;
 266   int first;
 267   int i;
 268
 269   if (c < 0x80)
 270     {
 271       first = 0;
 272       len = 1;
 273     }
 274   else if (c < 0x800)
 275     {
 276       first = 0xc0;
 277       len = 2;
 278     }
 279   else if (c < 0x10000)
 280     {
 281       first = 0xe0;
 282       len = 3;
 283     }
 284   else if (c < 0x200000)
 285     {
 286       first = 0xf0;
 287       len = 4;
 288     }
 289   else if (c < 0x4000000)
 290     {
 291       first = 0xf8;
 292       len = 5;
 293     }
 294   else
 295     {
 296       first = 0xfc;
 297       len = 6;
 298     }
 299
 300   if (outbuf)
 301     {
 302       for (i = len - 1; i > 0; --i)
 303         {
 304           outbuf[i] = (c & 0x3f) | 0x80;
 305           c >>= 6;
 306         }
 307       outbuf[0] = c | first;
 308     }
 309
 310   return len;
 311 }
 312
 313 /*
 314  * g_utf8_to_ucs4_fast:
 315  * @str: a UTF-8 encoded string
 316  * @len: the maximum length of @str to use. If @len < 0, then
 317  *       the string is nul-terminated.
 318  * @items_written: location to store the number of characters in the
 319  *                 result, or %NULL.
 320  *
 321  * Convert a string from UTF-8 to a 32-bit fixed width
 322  * representation as UCS-4, assuming valid UTF-8 input.
 323  * This function is roughly twice as fast as g_utf8_to_ucs4()
 324  * but does no error checking on the input.
 325  *
 326  * Return value: a pointer to a newly allocated UCS-4 string.
 327  *               This value must be freed with g_free().
 328  **/
 329 static gunichar *
 330 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
 331 {
 332   gint j, charlen;
 333   gunichar *result;
 334   gint n_chars, i;
 335   const gchar *p;
 336
 337   g_return_val_if_fail (str != NULL, NULL);
 338
 339   p = str;
 340   n_chars = 0;
 341   if (len < 0)
 342     {
 343       while (*p)
 344         {
 345           p = g_utf8_next_char (p);
 346           ++n_chars;
 347         }
 348     }
 349   else
 350     {
 351       while (p < str + len && *p)
 352         {
 353           p = g_utf8_next_char (p);
 354           ++n_chars;
 355         }
 356     }
 357
 358   result = g_new (gunichar, n_chars + 1);
 359   if (!result)
 360     return NULL;
 361
 362   p = str;
 363   for (i = 0; i < n_chars; i++)
 364     {
 365       gunichar wc = ((unsigned char *) p)[0];
 366
 367       if (wc < 0x80)
 368         {
 369           result[i] = wc;
 370           p++;
 371         }
 372       else
 373         {
 374           if (wc < 0xe0)
 375             {
 376               charlen = 2;
 377               wc &= 0x1f;
 378             }
 379           else if (wc < 0xf0)
 380             {
 381               charlen = 3;
 382               wc &= 0x0f;
 383             }
 384           else if (wc < 0xf8)
 385             {
 386               charlen = 4;
 387               wc &= 0x07;
 388             }
 389           else if (wc < 0xfc)
 390             {
 391               charlen = 5;
 392               wc &= 0x03;
 393             }
 394           else
 395             {
 396               charlen = 6;
 397               wc &= 0x01;
 398             }
 399
 400           for (j = 1; j < charlen; j++)
 401             {
 402               wc <<= 6;
 403               wc |= ((unsigned char *) p)[j] & 0x3f;
 404             }
 405
 406           result[i] = wc;
 407           p += charlen;
 408         }
 409     }
 410   result[i] = 0;
 411
 412   if (items_written)
 413     *items_written = i;
 414
 415   return result;
 416 }
 417
 418 /*
 419  * g_ucs4_to_utf8:
 420  * @str: a UCS-4 encoded string
 421  * @len: the maximum length of @str to use. If @len < 0, then
 422  *       the string is terminated with a 0 character.
 423  * @items_read: location to store number of characters read read, or %NULL.
 424  * @items_written: location to store number of bytes written or %NULL.
 425  *                 The value here stored does not include the trailing 0
 426  *                 byte.
 427  * @error: location to store the error occuring, or %NULL to ignore
 428  *         errors. Any of the errors in #GConvertError other than
 429  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 430  *
 431  * Convert a string from a 32-bit fixed width representation as UCS-4.
 432  * to UTF-8. The result will be terminated with a 0 byte.
 433  *
 434  * Return value: a pointer to a newly allocated UTF-8 string.
 435  *               This value must be freed with g_free(). If an
 436  *               error occurs, %NULL will be returned and
 437  *               @error set.
 438  **/
 439 static gchar *
 440 g_ucs4_to_utf8 (const gunichar * str,
 441                 glong len,
 442                 glong * items_read, glong * items_written, GError ** error)
 443 {
 444   gint result_length;
 445   gchar *result = NULL;
 446   gchar *p;
 447   gint i;
 448
 449   result_length = 0;
 450   for (i = 0; len < 0 || i < len; i++)
 451     {
 452       if (!str[i])
 453         break;
 454
 455       if (str[i] >= 0x80000000)
 456         {
 457           if (items_read)
 458             *items_read = i;
 459
 460           g_set_error (error, G_CONVERT_ERROR,
 461                        G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 462                        _("Character out of range for UTF-8"));
 463           goto err_out;
 464         }
 465
 466       result_length += UTF8_LENGTH (str[i]);
 467     }
 468
 469   result = g_malloc (result_length + 1);
 470   if (!result)
 471     return NULL;
 472   p = result;
 473
 474   i = 0;
 475   while (p < result + result_length)
 476     p += g_unichar_to_utf8 (str[i++], p);
 477
 478   *p = '\0';
 479
 480   if (items_written)
 481     *items_written = p - result;
 482
 483 err_out:
 484   if (items_read)
 485     *items_read = i;
 486
 487   return result;
 488 }
 489
 490 /* Code from GLIB gunidecomp.c starts here. */
 491
 492 #include "gunidecomp.h"
 493 #include "gunicomp.h"
 494
 495 #define CC_PART1(Page, Char) \
 496   ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 497    ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 498    : (cclass_data[combining_class_table_part1[Page]][Char]))
 499
 500 #define CC_PART2(Page, Char) \
 501   ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 502    ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 503    : (cclass_data[combining_class_table_part2[Page]][Char]))
 504
 505 #define COMBINING_CLASS(Char) \
 506   (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
 507    ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
 508    : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
 509       ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
 510       : 0))
 511
 512 /* constants for hangul syllable [de]composition */
 513 #define SBase 0xAC00
 514 #define LBase 0x1100
 515 #define VBase 0x1161
 516 #define TBase 0x11A7
 517 #define LCount 19
 518 #define VCount 21
 519 #define TCount 28
 520 #define NCount (VCount * TCount)
 521 #define SCount (LCount * NCount)
 522
 523 /*
 524  * g_unicode_canonical_ordering:
 525  * @string: a UCS-4 encoded string.
 526  * @len: the maximum length of @string to use.
 527  *
 528  * Computes the canonical ordering of a string in-place.
 529  * This rearranges decomposed characters in the string
 530  * according to their combining classes.  See the Unicode
 531  * manual for more information.
 532  **/
 533 static void
 534 g_unicode_canonical_ordering (gunichar * string, gsize len)
 535 {
 536   gsize i;
 537   int swap = 1;
 538
 539   while (swap)
 540     {
 541       int last;
 542       swap = 0;
 543       last = COMBINING_CLASS (string[0]);
 544       for (i = 0; i < len - 1; ++i)
 545         {
 546           int next = COMBINING_CLASS (string[i + 1]);
 547           if (next != 0 && last > next)
 548             {
 549               gsize j;
 550               /* Percolate item leftward through string.  */
 551               for (j = i + 1; j > 0; --j)
 552                 {
 553                   gunichar t;
 554                   if (COMBINING_CLASS (string[j - 1]) <= next)
 555                     break;
 556                   t = string[j];
 557                   string[j] = string[j - 1];
 558                   string[j - 1] = t;
 559                   swap = 1;
 560                 }
 561               /* We're re-entering the loop looking at the old
 562                  character again.  */
 563               next = last;
 564             }
 565           last = next;
 566         }
 567     }
 568 }
 569
 570 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
 571  * r should be null or have sufficient space. Calling with r == NULL will
 572  * only calculate the result_len; however, a buffer with space for three
 573  * characters will always be big enough. */
 574 static void
 575 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
 576 {
 577   gint SIndex = s - SBase;
 578
 579   /* not a hangul syllable */
 580   if (SIndex < 0 || SIndex >= SCount)
 581     {
 582       if (r)
 583         r[0] = s;
 584       *result_len = 1;
 585     }
 586   else
 587     {
 588       gunichar L = LBase + SIndex / NCount;
 589       gunichar V = VBase + (SIndex % NCount) / TCount;
 590       gunichar T = TBase + SIndex % TCount;
 591
 592       if (r)
 593         {
 594           r[0] = L;
 595           r[1] = V;
 596         }
 597
 598       if (T != TBase)
 599         {
 600           if (r)
 601             r[2] = T;
 602           *result_len = 3;
 603         }
 604       else
 605         *result_len = 2;
 606     }
 607 }
 608
 609 /* returns a pointer to a null-terminated UTF-8 string */
 610 static const gchar *
 611 find_decomposition (gunichar ch, gboolean compat)
 612 {
 613   int start = 0;
 614   int end = G_N_ELEMENTS (decomp_table);
 615
 616   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 617     {
 618       while (TRUE)
 619         {
 620           int half = (start + end) / 2;
 621           if (ch == decomp_table[half].ch)
 622             {
 623               int offset;
 624
 625               if (compat)
 626                 {
 627                   offset = decomp_table[half].compat_offset;
 628                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 629                     offset = decomp_table[half].canon_offset;
 630                 }
 631               else
 632                 {
 633                   offset = decomp_table[half].canon_offset;
 634                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 635                     return NULL;
 636                 }
 637
 638               return &(decomp_expansion_string[offset]);
 639             }
 640           else if (half == start)
 641             break;
 642           else if (ch > decomp_table[half].ch)
 643             start = half;
 644           else
 645             end = half;
 646         }
 647     }
 648
 649   return NULL;
 650 }
 651
 652 /* L,V => LV and LV,T => LVT  */
 653 static gboolean
 654 combine_hangul (gunichar a, gunichar b, gunichar * result)
 655 {
 656   gint LIndex = a - LBase;
 657   gint SIndex = a - SBase;
 658
 659   gint VIndex = b - VBase;
 660   gint TIndex = b - TBase;
 661
 662   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
 663     {
 664       *result = SBase + (LIndex * VCount + VIndex) * TCount;
 665       return TRUE;
 666     }
 667   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
 668            && 0 <= TIndex && TIndex <= TCount)
 669     {
 670       *result = a + TIndex;
 671       return TRUE;
 672     }
 673
 674   return FALSE;
 675 }
 676
 677 #define CI(Page, Char) \
 678   ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
 679    ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
 680    : (compose_data[compose_table[Page]][Char]))
 681
 682 #define COMPOSE_INDEX(Char) \
 683      ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 684
 685 static gboolean
 686 combine (gunichar a, gunichar b, gunichar * result)
 687 {
 688   gushort index_a, index_b;
 689
 690   if (combine_hangul (a, b, result))
 691     return TRUE;
 692
 693   index_a = COMPOSE_INDEX (a);
 694
 695   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 696     {
 697       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 698         {
 699           *result =
 700             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 701           return TRUE;
 702         }
 703       else
 704         return FALSE;
 705     }
 706
 707   index_b = COMPOSE_INDEX (b);
 708
 709   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 710     {
 711       if (a ==
 712           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 713         {
 714           *result =
 715             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 716           return TRUE;
 717         }
 718       else
 719         return FALSE;
 720     }
 721
 722   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 723       && index_b >= COMPOSE_SECOND_START
 724       && index_b < COMPOSE_SECOND_SINGLE_START)
 725     {
 726       gunichar res =
 727         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 728                                                      COMPOSE_SECOND_START];
 729
 730       if (res)
 731         {
 732           *result = res;
 733           return TRUE;
 734         }
 735     }
 736
 737   return FALSE;
 738 }
 739
 740 static gunichar *
 741 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
 742 {
 743   gsize n_wc;
 744   gunichar *wc_buffer;
 745   const char *p;
 746   gsize last_start;
 747   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
 748   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
 749
 750   n_wc = 0;
 751   p = str;
 752   while ((max_len < 0 || p < str + max_len) && *p)
 753     {
 754       const gchar *decomp;
 755       gunichar wc = g_utf8_get_char (p);
 756
 757       if (wc >= 0xac00 && wc <= 0xd7af)
 758         {
 759           gsize result_len;
 760           decompose_hangul (wc, NULL, &result_len);
 761           n_wc += result_len;
 762         }
 763       else
 764         {
 765           decomp = find_decomposition (wc, do_compat);
 766
 767           if (decomp)
 768             n_wc += g_utf8_strlen (decomp, -1);
 769           else
 770             n_wc++;
 771         }
 772
 773       p = g_utf8_next_char (p);
 774     }
 775
 776   wc_buffer = g_new (gunichar, n_wc + 1);
 777   if (!wc_buffer)
 778     return NULL;
 779
 780   last_start = 0;
 781   n_wc = 0;
 782   p = str;
 783   while ((max_len < 0 || p < str + max_len) && *p)
 784     {
 785       gunichar wc = g_utf8_get_char (p);
 786       const gchar *decomp;
 787       int cc;
 788       gsize old_n_wc = n_wc;
 789
 790       if (wc >= 0xac00 && wc <= 0xd7af)
 791         {
 792           gsize result_len;
 793           decompose_hangul (wc, wc_buffer + n_wc, &result_len);
 794           n_wc += result_len;
 795         }
 796       else
 797         {
 798           decomp = find_decomposition (wc, do_compat);
 799
 800           if (decomp)
 801             {
 802               const char *pd;
 803               for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
 804                 wc_buffer[n_wc++] = g_utf8_get_char (pd);
 805             }
 806           else
 807             wc_buffer[n_wc++] = wc;
 808         }
 809
 810       if (n_wc > 0)
 811         {
 812           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 813
 814           if (cc == 0)
 815             {
 816               g_unicode_canonical_ordering (wc_buffer + last_start,
 817                                             n_wc - last_start);
 818               last_start = old_n_wc;
 819             }
 820         }
 821
 822       p = g_utf8_next_char (p);
 823     }
 824
 825   if (n_wc > 0)
 826     {
 827       g_unicode_canonical_ordering (wc_buffer + last_start,
 828                                     n_wc - last_start);
 829       last_start = n_wc;
 830     }
 831
 832   wc_buffer[n_wc] = 0;
 833
 834   /* All decomposed and reordered */
 835
 836   if (do_compose && n_wc > 0)
 837     {
 838       gsize i, j;
 839       int last_cc = 0;
 840       last_start = 0;
 841
 842       for (i = 0; i < n_wc; i++)
 843         {
 844           int cc = COMBINING_CLASS (wc_buffer[i]);
 845
 846           if (i > 0 &&
 847               (last_cc == 0 || last_cc != cc) &&
 848               combine (wc_buffer[last_start], wc_buffer[i],
 849                        &wc_buffer[last_start]))
 850             {
 851               for (j = i + 1; j < n_wc; j++)
 852                 wc_buffer[j - 1] = wc_buffer[j];
 853               n_wc--;
 854               i--;
 855
 856               if (i == last_start)
 857                 last_cc = 0;
 858               else
 859                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
 860
 861               continue;
 862             }
 863
 864           if (cc == 0)
 865             last_start = i;
 866
 867           last_cc = cc;
 868         }
 869     }
 870
 871   wc_buffer[n_wc] = 0;
 872
 873   return wc_buffer;
 874 }
 875
 876 /*
 877  * g_utf8_normalize:
 878  * @str: a UTF-8 encoded string.
 879  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 880  * @mode: the type of normalization to perform.
 881  *
 882  * Converts a string into canonical form, standardizing
 883  * such issues as whether a character with an accent
 884  * is represented as a base character and combining
 885  * accent or as a single precomposed character. You
 886  * should generally call g_utf8_normalize() before
 887  * comparing two Unicode strings.
 888  *
 889  * The normalization mode %G_NORMALIZE_DEFAULT only
 890  * standardizes differences that do not affect the
 891  * text content, such as the above-mentioned accent
 892  * representation. %G_NORMALIZE_ALL also standardizes
 893  * the "compatibility" characters in Unicode, such
 894  * as SUPERSCRIPT THREE to the standard forms
 895  * (in this case DIGIT THREE). Formatting information
 896  * may be lost but for most text operations such
 897  * characters should be considered the same.
 898  * For example, g_utf8_collate() normalizes
 899  * with %G_NORMALIZE_ALL as its first step.
 900  *
 901  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 902  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 903  * but returned a result with composed forms rather
 904  * than a maximally decomposed form. This is often
 905  * useful if you intend to convert the string to
 906  * a legacy encoding or pass it to a system with
 907  * less capable Unicode handling.
 908  *
 909  * Return value: a newly allocated string, that is the
 910  *   normalized form of @str.
 911  **/
 912 static gchar *
 913 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
 914 {
 915   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
 916   gchar *result;
 917
 918   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
 919   g_free (result_wc);
 920
 921   return result;
 922 }
 923
 924 /* Public Libidn API starts here. */
 925
 926 /**
 927  * stringprep_utf8_to_unichar:
 928  * @p: a pointer to Unicode character encoded as UTF-8
 929  *
 930  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 931  * If @p does not point to a valid UTF-8 encoded character, results are
 932  * undefined.
 933  *
 934  * Return value: the resulting character.
 935  **/
 936 uint32_t
 937 stringprep_utf8_to_unichar (const char *p)
 938 {
 939   return g_utf8_get_char (p);
 940 }
 941
 942 /**
 943  * stringprep_unichar_to_utf8:
 944  * @c: a ISO10646 character code
 945  * @outbuf: output buffer, must have at least 6 bytes of space.
 946  *       If %NULL, the length will be computed and returned
 947  *       and nothing will be written to @outbuf.
 948  *
 949  * Converts a single character to UTF-8.
 950  *
 951  * Return value: number of bytes written.
 952  **/
 953 int
 954 stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
 955 {
 956   return g_unichar_to_utf8 (c, outbuf);
 957 }
 958
 959 /**
 960  * stringprep_utf8_to_ucs4:
 961  * @str: a UTF-8 encoded string
 962  * @len: the maximum length of @str to use. If @len < 0, then
 963  *       the string is nul-terminated.
 964  * @items_written: location to store the number of characters in the
 965  *                 result, or %NULL.
 966  *
 967  * Convert a string from UTF-8 to a 32-bit fixed width
 968  * representation as UCS-4, assuming valid UTF-8 input.
 969  * This function does no error checking on the input.
 970  *
 971  * Return value: a pointer to a newly allocated UCS-4 string.
 972  *               This value must be freed with free().
 973  **/
 974 uint32_t *
 975 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
 976 {
 977   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
 978 }
 979
 980 /**
 981  * stringprep_ucs4_to_utf8:
 982  * @str: a UCS-4 encoded string
 983  * @len: the maximum length of @str to use. If @len < 0, then
 984  *       the string is terminated with a 0 character.
 985  * @items_read: location to store number of characters read read, or %NULL.
 986  * @items_written: location to store number of bytes written or %NULL.
 987  *                 The value here stored does not include the trailing 0
 988  *                 byte.
 989  *
 990  * Convert a string from a 32-bit fixed width representation as UCS-4.
 991  * to UTF-8. The result will be terminated with a 0 byte.
 992  *
 993  * Return value: a pointer to a newly allocated UTF-8 string.
 994  *               This value must be freed with free(). If an
 995  *               error occurs, %NULL will be returned and
 996  *               @error set.
 997  **/
 998 char *
 999 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1000                          size_t * items_read, size_t * items_written)
1001 {
1002   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1003                          (glong *) items_written, NULL);
1004 }
1005
1006 /**
1007  * stringprep_utf8_nfkc_normalize:
1008  * @str: a UTF-8 encoded string.
1009  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1010  *
1011  * Converts a string into canonical form, standardizing
1012  * such issues as whether a character with an accent
1013  * is represented as a base character and combining
1014  * accent or as a single precomposed character.
1015  *
1016  * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
1017  * differences that do not affect the text content, such as the
1018  * above-mentioned accent representation. It standardizes the
1019  * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1020  * the standard forms (in this case DIGIT THREE). Formatting
1021  * information may be lost but for most text operations such
1022  * characters should be considered the same. It returns a result with
1023  * composed forms rather than a maximally decomposed form.
1024  *
1025  * Return value: a newly allocated string, that is the
1026  *   NFKC normalized form of @str.
1027  **/
1028 char *
1029 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1030 {
1031   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1032 }
1033
1034 /**
1035  * stringprep_ucs4_nfkc_normalize:
1036  * @str: a Unicode string.
1037  * @len: length of @str array, or -1 if @str is nul-terminated.
1038  *
1039  * Converts UCS4 string into UTF-8 and runs
1040  * stringprep_utf8_nfkc_normalize().
1041  *
1042  * Return value: a newly allocated Unicode string, that is the NFKC
1043  *   normalized form of @str.
1044  **/
1045 uint32_t *
1046 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1047 {
1048   char *p;
1049   uint32_t *result_wc;
1050
1051   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1052   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1053   free (p);
1054
1055   return result_wc;
1056 }