glib/gutf8.c

   1 /* gutf8.c - Operations on UTF-8 strings.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include <config.h>
  23
  24 #include <stdlib.h>
  25 #ifdef HAVE_CODESET
  26 #include <langinfo.h>
  27 #endif
  28 #include <string.h>
  29
  30 #include "glib.h"
  31
  32 #ifdef G_PLATFORM_WIN32
  33 #include <stdio.h>
  34 #define STRICT
  35 #include <windows.h>
  36 #undef STRICT
  37 #endif
  38
  39 #include "glibintl.h"
  40
  41 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  42   if (Char < 128)                                                             \
  43     {                                                                         \
  44       Len = 1;                                                                \
  45       Mask = 0x7f;                                                            \
  46     }                                                                         \
  47   else if ((Char & 0xe0) == 0xc0)                                             \
  48     {                                                                         \
  49       Len = 2;                                                                \
  50       Mask = 0x1f;                                                            \
  51     }                                                                         \
  52   else if ((Char & 0xf0) == 0xe0)                                             \
  53     {                                                                         \
  54       Len = 3;                                                                \
  55       Mask = 0x0f;                                                            \
  56     }                                                                         \
  57   else if ((Char & 0xf8) == 0xf0)                                             \
  58     {                                                                         \
  59       Len = 4;                                                                \
  60       Mask = 0x07;                                                            \
  61     }                                                                         \
  62   else if ((Char & 0xfc) == 0xf8)                                             \
  63     {                                                                         \
  64       Len = 5;                                                                \
  65       Mask = 0x03;                                                            \
  66     }                                                                         \
  67   else if ((Char & 0xfe) == 0xfc)                                             \
  68     {                                                                         \
  69       Len = 6;                                                                \
  70       Mask = 0x01;                                                            \
  71     }                                                                         \
  72   else                                                                        \
  73     Len = -1;
  74
  75 #define UTF8_LENGTH(Char)              \
  76   ((Char) < 0x80 ? 1 :                 \
  77    ((Char) < 0x800 ? 2 :               \
  78     ((Char) < 0x10000 ? 3 :            \
  79      ((Char) < 0x200000 ? 4 :          \
  80       ((Char) < 0x4000000 ? 5 : 6)))))
  81
  82
  83 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  84   (Result) = (Chars)[0] & (Mask);                                             \
  85   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
  86     {                                                                         \
  87       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
  88         {                                                                     \
  89           (Result) = -1;                                                      \
  90           break;                                                              \
  91         }                                                                     \
  92       (Result) <<= 6;                                                         \
  93       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
  94     }
  95
  96 #define UNICODE_VALID(Char)                   \
  97     ((Char) < 0x110000 &&                     \
  98      ((Char) < 0xD800 || (Char) >= 0xE000) && \
  99      (Char) != 0xFFFE && (Char) != 0xFFFF)
 100
 101
 102 gchar g_utf8_skip[256] = {
 103   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 104   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 105   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 106   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 107   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 108   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 109   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 110   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0
 111 };
 112
 113 /**
 114  * g_utf8_find_prev_char:
 115  * @str: pointer to the beginning of a UTF-8 string
 116  * @p: pointer to some position within @str
 117  *
 118  * Given a position @p with a UTF-8 encoded string @str, find the start
 119  * of the previous UTF-8 character starting before @p. Returns %NULL if no
 120  * UTF-8 characters are present in @p before @str.
 121  *
 122  * @p does not have to be at the beginning of a UTF-8 chracter. No check
 123  * is made to see if the character found is actually valid other than
 124  * it starts with an appropriate byte.
 125  *
 126  * Return value: a pointer to the found character or %NULL.
 127  **/
 128 gchar *
 129 g_utf8_find_prev_char (const char *str,
 130                        const char *p)
 131 {
 132   for (--p; p > str; --p)
 133     {
 134       if ((*p & 0xc0) != 0x80)
 135         return (gchar *)p;
 136     }
 137   return NULL;
 138 }
 139
 140 /**
 141  * g_utf8_find_next_char:
 142  * @p: a pointer to a position within a UTF-8 encoded string
 143  * @end: a pointer to the end of the string, or %NULL to indicate
 144  *        that the string is NULL terminated, in which case
 145  *        the returned value will be
 146  *
 147  * Find the start of the next utf-8 character in the string after @p
 148  *
 149  * @p does not have to be at the beginning of a UTF-8 chracter. No check
 150  * is made to see if the character found is actually valid other than
 151  * it starts with an appropriate byte.
 152  *
 153  * Return value: a pointer to the found character or %NULL
 154  **/
 155 gchar *
 156 g_utf8_find_next_char (const gchar *p,
 157                        const gchar *end)
 158 {
 159   if (*p)
 160     {
 161       if (end)
 162         for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
 163           ;
 164       else
 165         for (++p; (*p & 0xc0) == 0x80; ++p)
 166           ;
 167     }
 168   return (p == end) ? NULL : (gchar *)p;
 169 }
 170
 171 /**
 172  * g_utf8_prev_char:
 173  * @p: a pointer to a position within a UTF-8 encoded string
 174  *
 175  * Find the previous UTF-8 character in the string before @p
 176  *
 177  * @p does not have to be at the beginning of a UTF-8 character. No check
 178  * is made to see if the character found is actually valid other than
 179  * it starts with an appropriate byte. If @p might be the first
 180  * character of the string, you must use g_utf8_find_prev_char instead.
 181  *
 182  * Return value: a pointer to the found character.
 183  **/
 184 gchar *
 185 g_utf8_prev_char (const gchar *p)
 186 {
 187   while (TRUE)
 188     {
 189       p--;
 190       if ((*p & 0xc0) != 0x80)
 191         return (gchar *)p;
 192     }
 193 }
 194
 195 /**
 196  * g_utf8_strlen:
 197  * @p: pointer to the start of a UTF-8 string.
 198  * @max: the maximum number of bytes to examine. If @max
 199  *       is less than 0, then the string is assumed to be
 200  *       nul-terminated.
 201  *
 202  * Return value: the length of the string in characters
 203  **/
 204 gint
 205 g_utf8_strlen (const gchar *p, gint max)
 206 {
 207   int len = 0;
 208   const gchar *start = p;
 209
 210   if (max < 0)
 211     {
 212       while (*p)
 213         {
 214           p = g_utf8_next_char (p);
 215           ++len;
 216         }
 217     }
 218   else
 219     {
 220       if (max == 0 || !*p)
 221         return 0;
 222
 223       p = g_utf8_next_char (p);
 224
 225       while (p - start < max && *p)
 226         {
 227           ++len;
 228           p = g_utf8_next_char (p);
 229         }
 230
 231       /* only do the last len increment if we got a complete
 232        * char (don't count partial chars)
 233        */
 234       if (p - start == max)
 235         ++len;
 236     }
 237
 238   return len;
 239 }
 240
 241 /**
 242  * g_utf8_get_char:
 243  * @p: a pointer to unicode character encoded as UTF-8
 244  *
 245  * Convert a sequence of bytes encoded as UTF-8 to a unicode character.
 246  *
 247  * Return value: the resulting character or (gunichar)-1 if @p does
 248  *               not point to a valid UTF-8 encoded unicode character
 249  **/
 250 gunichar
 251 g_utf8_get_char (const gchar *p)
 252 {
 253   int i, mask = 0, len;
 254   gunichar result;
 255   unsigned char c = (unsigned char) *p;
 256
 257   UTF8_COMPUTE (c, mask, len);
 258   if (len == -1)
 259     return (gunichar)-1;
 260   UTF8_GET (result, p, i, mask, len);
 261
 262   return result;
 263 }
 264
 265 /**
 266  * g_utf8_offset_to_pointer:
 267  * @str: a UTF-8 encoded string
 268  * @offset: a character offset within the string.
 269  *
 270  * Converts from an integer character offset to a pointer to a position
 271  * within the string.
 272  *
 273  * Return value: the resulting pointer
 274  **/
 275 gchar *
 276 g_utf8_offset_to_pointer  (const gchar *str,
 277                            gint         offset)
 278 {
 279   const gchar *s = str;
 280   while (offset--)
 281     s = g_utf8_next_char (s);
 282
 283   return (gchar *)s;
 284 }
 285
 286 /**
 287  * g_utf8_pointer_to_offset:
 288  * @str: a UTF-8 encoded string
 289  * @pos: a pointer to a position within @str
 290  *
 291  * Converts from a pointer to position within a string to a integer
 292  * character offset
 293  *
 294  * Return value: the resulting character offset
 295  **/
 296 gint
 297 g_utf8_pointer_to_offset (const gchar *str,
 298                           const gchar *pos)
 299 {
 300   const gchar *s = str;
 301   gint offset = 0;
 302
 303   while (s < pos)
 304     {
 305       s = g_utf8_next_char (s);
 306       offset++;
 307     }
 308
 309   return offset;
 310 }
 311
 312
 313 /**
 314  * g_utf8_strncpy:
 315  * @dest: buffer to fill with characters from @src
 316  * @src: UTF-8 string
 317  * @n: character count
 318  *
 319  * Like the standard C strncpy() function, but copies a given number
 320  * of characters instead of a given number of bytes. The @src string
 321  * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
 322  * text before trying to use UTF-8 utility functions with it.)
 323  *
 324  * Return value: @dest
 325  **/
 326 gchar *
 327 g_utf8_strncpy (gchar *dest, const gchar *src, size_t n)
 328 {
 329   const gchar *s = src;
 330   while (n && *s)
 331     {
 332       s = g_utf8_next_char(s);
 333       n--;
 334     }
 335   strncpy(dest, src, s - src);
 336   dest[s - src] = 0;
 337   return dest;
 338 }
 339
 340 static gboolean
 341 g_utf8_get_charset_internal (char **a)
 342 {
 343   char *charset = getenv("CHARSET");
 344
 345   if (charset && a && ! *a)
 346     *a = charset;
 347
 348   if (charset && strstr (charset, "UTF-8"))
 349       return TRUE;
 350
 351 #ifdef HAVE_CODESET
 352   charset = nl_langinfo(CODESET);
 353   if (charset)
 354     {
 355       if (a && ! *a)
 356         *a = charset;
 357       if (strcmp (charset, "UTF-8") == 0)
 358         return TRUE;
 359     }
 360 #endif
 361
 362 #if 0 /* #ifdef _NL_CTYPE_CODESET_NAME */
 363   charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
 364   if (charset)
 365     {
 366       if (a && ! *a)
 367         *a = charset;
 368       if (strcmp (charset, "UTF-8") == 0)
 369         return TRUE;
 370     }
 371 #endif
 372
 373 #ifdef G_PLATFORM_WIN32
 374   if (a && ! *a)
 375     {
 376       static char codepage[10];
 377
 378       sprintf (codepage, "CP%d", GetACP ());
 379       *a = codepage;
 380       /* What about codepage 1200? Is that UTF-8? */
 381       return FALSE;
 382     }
 383 #else
 384   if (a && ! *a)
 385     *a = "US-ASCII";
 386 #endif
 387
 388   /* Assume this for compatibility at present.  */
 389   return FALSE;
 390 }
 391
 392 static int utf8_locale_cache = -1;
 393 static char *utf8_charset_cache = NULL;
 394
 395 /**
 396  * g_get_charset:
 397  * @charset: return location for character set name
 398  *
 399  * Obtains the character set for the current locale; you might use
 400  * this character set as an argument to g_convert(), to convert from
 401  * the current locale's encoding to some other encoding. (Frequently
 402  * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts,
 403  * though.)
 404  *
 405  * The return value is %TRUE if the locale's encoding is UTF-8, in that
 406  * case you can perhaps avoid calling g_convert().
 407  *
 408  * The string returned in @charset is not allocated, and should not be
 409  * freed.
 410  *
 411  * Return value: %TRUE if the returned charset is UTF-8
 412  **/
 413 gboolean
 414 g_get_charset (char **charset)
 415 {
 416   if (utf8_locale_cache != -1)
 417     {
 418       if (charset)
 419         *charset = utf8_charset_cache;
 420       return utf8_locale_cache;
 421     }
 422   utf8_locale_cache = g_utf8_get_charset_internal (&utf8_charset_cache);
 423   if (charset)
 424     *charset = utf8_charset_cache;
 425   return utf8_locale_cache;
 426 }
 427
 428 /* unicode_strchr */
 429
 430 /**
 431  * g_unichar_to_utf8:
 432  * @c: a ISO10646 character code
 433  * @outbuf: output buffer, must have at least 6 bytes of space.
 434  *       If %NULL, the length will be computed and returned
 435  *       and nothing will be written to @out.
 436  *
 437  * Convert a single character to utf8
 438  *
 439  * Return value: number of bytes written
 440  **/
 441 int
 442 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
 443 {
 444   size_t len = 0;
 445   int first;
 446   int i;
 447
 448   if (c < 0x80)
 449     {
 450       first = 0;
 451       len = 1;
 452     }
 453   else if (c < 0x800)
 454     {
 455       first = 0xc0;
 456       len = 2;
 457     }
 458   else if (c < 0x10000)
 459     {
 460       first = 0xe0;
 461       len = 3;
 462     }
 463    else if (c < 0x200000)
 464     {
 465       first = 0xf0;
 466       len = 4;
 467     }
 468   else if (c < 0x4000000)
 469     {
 470       first = 0xf8;
 471       len = 5;
 472     }
 473   else
 474     {
 475       first = 0xfc;
 476       len = 6;
 477     }
 478
 479   if (outbuf)
 480     {
 481       for (i = len - 1; i > 0; --i)
 482         {
 483           outbuf[i] = (c & 0x3f) | 0x80;
 484           c >>= 6;
 485         }
 486       outbuf[0] = c | first;
 487     }
 488
 489   return len;
 490 }
 491
 492 /**
 493  * g_utf8_strchr:
 494  * @p: a nul-terminated utf-8 string
 495  * @c: a iso-10646 character/
 496  *
 497  * Find the leftmost occurence of the given iso-10646 character
 498  * in a UTF-8 string.
 499  *
 500  * Return value: NULL if the string does not contain the character, otherwise, a
 501  *               a pointer to the start of the leftmost of the character in the string.
 502  **/
 503 gchar *
 504 g_utf8_strchr (const char *p, gunichar c)
 505 {
 506   gchar ch[10];
 507
 508   gint len = g_unichar_to_utf8 (c, ch);
 509   ch[len] = '\0';
 510
 511   return strstr(p, ch);
 512 }
 513
 514 #if 0
 515 /**
 516  * g_utf8_strrchr:
 517  * @p: a nul-terminated utf-8 string
 518  * @c: a iso-10646 character/
 519  *
 520  * Find the rightmost occurence of the given iso-10646 character
 521  * in a UTF-8 string.
 522  *
 523  * Return value: NULL if the string does not contain the character, otherwise, a
 524  *               a pointer to the start of the rightmost of the character in the string.
 525  **/
 526
 527 /* This is ifdefed out atm as there is no strrstr function in libc.
 528  */
 529 gchar *
 530 unicode_strrchr (const char *p, gunichar c)
 531 {
 532   gchar ch[10];
 533
 534   len = g_unichar_to_utf8 (c, ch);
 535   ch[len] = '\0';
 536
 537   return strrstr(p, ch);
 538 }
 539 #endif
 540
 541
 542 /* Like g_utf8_get_char, but take a maximum length
 543  * and return (gunichar)-2 on incomplete trailing character
 544  */
 545 static inline gunichar
 546 g_utf8_get_char_extended (const gchar *p, int max_len)
 547 {
 548   gint i, len;
 549   gunichar wc = (guchar) *p;
 550
 551   if (wc < 0x80)
 552     {
 553       return wc;
 554     }
 555   else if (wc < 0xc0)
 556     {
 557       return (gunichar)-1;
 558     }
 559   else if (wc < 0xe0)
 560     {
 561       len = 2;
 562       wc &= 0x1f;
 563     }
 564   else if (wc < 0xf0)
 565     {
 566       len = 3;
 567       wc &= 0x0f;
 568     }
 569   else if (wc < 0xf8)
 570     {
 571       len = 4;
 572       wc &= 0x07;
 573     }
 574   else if (wc < 0xfc)
 575     {
 576       len = 5;
 577       wc &= 0x03;
 578     }
 579   else if (wc < 0xfe)
 580     {
 581       len = 6;
 582       wc &= 0x01;
 583     }
 584   else
 585     {
 586       return (gunichar)-1;
 587     }
 588
 589   if (len == -1)
 590     return (gunichar)-1;
 591   if (max_len >= 0 && len > max_len)
 592     {
 593       for (i = 1; i < max_len; i++)
 594         {
 595           if ((((guchar *)p)[i] & 0xc0) != 0x80)
 596             return (gunichar)-1;
 597         }
 598       return (gunichar)-2;
 599     }
 600
 601   for (i = 1; i < len; ++i)
 602     {
 603       gunichar ch = ((guchar *)p)[i];
 604
 605       if ((ch & 0xc0) != 0x80)
 606         {
 607           if (ch)
 608             return (gunichar)-1;
 609           else
 610             return (gunichar)-2;
 611         }
 612
 613       wc <<= 6;
 614       wc |= (ch & 0x3f);
 615     }
 616
 617   if (UTF8_LENGTH(wc) != len)
 618     return (gunichar)-1;
 619
 620   return wc;
 621 }
 622
 623 /**
 624  * g_utf8_to_ucs4_fast:
 625  * @str: a UTF-8 encoded string
 626  * @len: the maximum length of @str to use. If < 0, then
 627  *       the string is %NULL terminated.
 628  * @items_written: location to store the number of characters in the
 629  *                 result, or %NULL.
 630  *
 631  * Convert a string from UTF-8 to a 32-bit fixed width
 632  * representation as UCS-4, assuming valid UTF-8 input.
 633  * This function is roughly twice as fast as g_utf8_to_ucs4()
 634  * but does no error checking on the input.
 635  *
 636  * Return value: a pointer to a newly allocated UCS-4 string.
 637  *               This value must be freed with g_free()
 638  **/
 639 gunichar *
 640 g_utf8_to_ucs4_fast (const gchar *str,
 641                      gint         len,
 642                      gint        *items_written)
 643 {
 644   gint j, charlen;
 645   gunichar *result;
 646   gint n_chars, i;
 647   const gchar *p;
 648
 649   g_return_val_if_fail (str != NULL, NULL);
 650
 651   p = str;
 652   n_chars = 0;
 653   if (len < 0)
 654     {
 655       while (*p)
 656         {
 657           p = g_utf8_next_char (p);
 658           ++n_chars;
 659         }
 660     }
 661   else
 662     {
 663       while (*p && p < str + len)
 664         {
 665           p = g_utf8_next_char (p);
 666           ++n_chars;
 667         }
 668     }
 669
 670   result = g_new (gunichar, n_chars + 1);
 671
 672   p = str;
 673   for (i=0; i < n_chars; i++)
 674     {
 675       gunichar wc = ((unsigned char *)p)[0];
 676
 677       if (wc < 0x80)
 678         {
 679           result[i] = wc;
 680           p++;
 681         }
 682       else
 683         {
 684           if (wc < 0xe0)
 685             {
 686               charlen = 2;
 687               wc &= 0x1f;
 688             }
 689           else if (wc < 0xf0)
 690             {
 691               charlen = 3;
 692               wc &= 0x0f;
 693             }
 694           else if (wc < 0xf8)
 695             {
 696               charlen = 4;
 697               wc &= 0x07;
 698             }
 699           else if (wc < 0xfc)
 700             {
 701               charlen = 5;
 702               wc &= 0x03;
 703             }
 704           else
 705             {
 706               charlen = 6;
 707               wc &= 0x01;
 708             }
 709
 710           for (j = 1; j < charlen; j++)
 711             {
 712               wc <<= 6;
 713               wc |= ((unsigned char *)p)[j] & 0x3f;
 714             }
 715
 716           result[i] = wc;
 717           p += charlen;
 718         }
 719     }
 720   result[i] = 0;
 721
 722   if (items_written)
 723     *items_written = i;
 724
 725   return result;
 726 }
 727
 728 /**
 729  * g_utf8_to_ucs4:
 730  * @str: a UTF-8 encoded string
 731  * @len: the maximum length of @str to use. If < 0, then
 732  *       the string is %NULL terminated.
 733  * @items_read: location to store number of bytes read, or %NULL.
 734  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 735  *              returned in case @str contains a trailing partial
 736  *              character. If an error occurs then the index of the
 737  *              invalid input is stored here.
 738  * @items_written: location to store number of characters written or %NULL.
 739  *                 The value here stored does not include the trailing 0
 740  *                 character.
 741  * @error: location to store the error occuring, or %NULL to ignore
 742  *         errors. Any of the errors in #GConvertError other than
 743  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 744  *
 745  * Convert a string from UTF-8 to a 32-bit fixed width
 746  * representation as UCS-4. A trailing 0 will be added to the
 747  * string after the converted text.
 748  *
 749  * Return value: a pointer to a newly allocated UCS-4 string.
 750  *               This value must be freed with g_free(). If an
 751  *               error occurs, %NULL will be returned and
 752  *               @error set.
 753  **/
 754 gunichar *
 755 g_utf8_to_ucs4 (const gchar *str,
 756                 gint         len,
 757                 gint        *items_read,
 758                 gint        *items_written,
 759                 GError     **error)
 760 {
 761   gunichar *result = NULL;
 762   gint n_chars, i;
 763   const gchar *in;
 764
 765   in = str;
 766   n_chars = 0;
 767   while ((len < 0 || str + len - in > 0) && *in)
 768     {
 769       gunichar wc = g_utf8_get_char_extended (in, str + len - in);
 770       if (wc & 0x80000000)
 771         {
 772           if (wc == (gunichar)-2)
 773             {
 774               if (items_read)
 775                 break;
 776               else
 777                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 778                              _("Partial character sequence at end of input"));
 779             }
 780           else
 781             g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 782                          _("Invalid byte sequence in conversion input"));
 783
 784           goto err_out;
 785         }
 786
 787       n_chars++;
 788
 789       in = g_utf8_next_char (in);
 790     }
 791
 792   result = g_new (gunichar, n_chars + 1);
 793
 794   in = str;
 795   for (i=0; i < n_chars; i++)
 796     {
 797       result[i] = g_utf8_get_char (in);
 798       in = g_utf8_next_char (in);
 799     }
 800   result[i] = 0;
 801
 802   if (items_written)
 803     *items_written = n_chars;
 804
 805  err_out:
 806   if (items_read)
 807     *items_read = in - str;
 808
 809   return result;
 810 }
 811
 812 /**
 813  * g_ucs4_to_utf8:
 814  * @str: a UCS-4 encoded string
 815  * @len: the maximum length of @str to use. If < 0, then
 816  *       the string is %NULL terminated.
 817  * @items_read: location to store number of characters read read, or %NULL.
 818  * @items_written: location to store number of bytes written or %NULL.
 819  *                 The value here stored does not include the trailing 0
 820  *                 byte.
 821  * @error: location to store the error occuring, or %NULL to ignore
 822  *         errors. Any of the errors in #GConvertError other than
 823  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 824  *
 825  * Convert a string from a 32-bit fixed width representation as UCS-4.
 826  * to UTF-8. The result will be terminated with a 0 byte.
 827  *
 828  * Return value: a pointer to a newly allocated UTF-8 string.
 829  *               This value must be freed with g_free(). If an
 830  *               error occurs, %NULL will be returned and
 831  *               @error set.
 832  **/
 833 gchar *
 834 g_ucs4_to_utf8 (const gunichar *str,
 835                 gint            len,
 836                 gint           *items_read,
 837                 gint           *items_written,
 838                 GError        **error)
 839 {
 840   gint result_length;
 841   gchar *result = NULL;
 842   gchar *p;
 843   gint i;
 844
 845   result_length = 0;
 846   for (i = 0; len < 0 || i < len ; i++)
 847     {
 848       if (!str[i])
 849         break;
 850
 851       if (str[i] >= 0x80000000)
 852         {
 853           if (items_read)
 854             *items_read = i;
 855
 856           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 857                        _("Character out of range for UTF-8"));
 858           goto err_out;
 859         }
 860
 861       result_length += UTF8_LENGTH (str[i]);
 862     }
 863
 864   result = g_malloc (result_length + 1);
 865   p = result;
 866
 867   i = 0;
 868   while (p < result + result_length)
 869     p += g_unichar_to_utf8 (str[i++], p);
 870
 871   *p = '\0';
 872
 873   if (items_written)
 874     *items_written = p - result;
 875
 876  err_out:
 877   if (items_read)
 878     *items_read = i;
 879
 880   return result;
 881 }
 882
 883 #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
 884
/**
 * g_utf16_to_utf8:
 * @str: a UTF-16 encoded string
 * @len: the maximum length of @str to use, in 16-bit words. If < 0, then
 *       the string is terminated with a 0 character.
 * @items_read: location to store number of words read, or %NULL.
 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 *              returned in case @str contains a trailing partial
 *              character. If an error occurs then the index of the
 *              invalid input is stored here.
 * @items_written: location to store number of bytes written, or %NULL.
 *                 The value stored here does not include the trailing
 *                 0 byte.
 * @error: location to store the error occurring, or %NULL to ignore
 *         errors. Any of the errors in #GConvertError other than
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 *
 * Convert a string from UTF-16 to UTF-8. The result will be
 * terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               This value must be freed with g_free(). If an
 *               error occurs, %NULL will be returned and
 *               @error set.
 **/
gchar *
g_utf16_to_utf8 (const gunichar2  *str,
                 gint              len,
                 gint             *items_read,
                 gint             *items_written,
                 GError          **error)
{
  /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
   * are marked.
   */
  const gunichar2 *in;
  gchar *out;
  gchar *result = NULL;
  gint n_bytes;
  gunichar high_surrogate;

  g_return_val_if_fail (str != 0, NULL);

  /* First pass: validate surrogate pairing and add up the number of
   * UTF-8 bytes needed. high_surrogate holds a pending high surrogate,
   * or 0 if none is pending.
   */
  n_bytes = 0;
  in = str;
  high_surrogate = 0;
  while ((len < 0 || in - str < len) && *in)
    {
      gunichar2 c = *in;
      gunichar wc;

      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
        {
          if (high_surrogate)
            {
              wc = SURROGATE_VALUE (high_surrogate, c);
              high_surrogate = 0;
            }
          else
            {
              /* Low surrogate without a preceding high surrogate. */
              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                           _("Invalid sequence in conversion input"));
              goto err_out;
            }
        }
      else
        {
          if (high_surrogate)
            {
              /* High surrogate not followed by a low surrogate. */
              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                           _("Invalid sequence in conversion input"));
              goto err_out;
            }

          if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
            {
              /* Remember it and wait for the low half; nothing to count yet. */
              high_surrogate = c;
              goto next1;
            }
          else
            wc = c;
        }

      /********** DIFFERENT for UTF8/UCS4 **********/
      n_bytes += UTF8_LENGTH (wc);

    next1:
      in++;
    }

  /* A high surrogate left pending at the end of the input is an error
   * only when the caller has no @items_read to learn where we stopped.
   */
  if (high_surrogate && !items_read)
    {
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
                   _("Partial character sequence at end of input"));
      goto err_out;
    }

  /* At this point, everything is valid, and we just need to convert
   */
  /********** DIFFERENT for UTF8/UCS4 **********/
  result = g_malloc (n_bytes + 1);

  /* Second pass: runs until n_bytes bytes have been produced, so a
   * trailing unpaired high surrogate is never consumed and @in ends up
   * just past the last complete character.
   */
  high_surrogate = 0;
  out = result;
  in = str;
  while (out < result + n_bytes)
    {
      gunichar2 c = *in;
      gunichar wc;

      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
        {
          wc = SURROGATE_VALUE (high_surrogate, c);
          high_surrogate = 0;
        }
      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
        {
          high_surrogate = c;
          goto next2;
        }
      else
        wc = c;

      /********** DIFFERENT for UTF8/UCS4 **********/
      out += g_unichar_to_utf8 (wc, out);

    next2:
      in++;
    }

  /********** DIFFERENT for UTF8/UCS4 **********/
  *out = '\0';

  if (items_written)
    /********** DIFFERENT for UTF8/UCS4 **********/
    *items_written = out - result;

 err_out:
  /* On success: number of words converted. On error: index of the
   * offending word.
   */
  if (items_read)
    *items_read = in - str;

  return result;
}
1028
1029 /**
1030  * g_utf16_to_ucs4:
1031  * @str: a UTF-16 encoded string
1032  * @len: the maximum length of @str to use. If < 0, then
1033  *       the string is terminated with a 0 character.
1034  * @items_read: location to store number of words read, or %NULL.
1035  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1036  *              returned in case @str contains a trailing partial
1037  *              character. If an error occurs then the index of the
1038  *              invalid input is stored here.
1039  * @items_written: location to store number of characters written, or %NULL.
1040  *                 The value stored here does not include the trailing
1041  *                 0 character.
1042  * @error: location to store the error occuring, or %NULL to ignore
1043  *         errors. Any of the errors in #GConvertError other than
1044  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1045  *
1046  * Convert a string from UTF-16 to UCS-4. The result will be
1047  * terminated with a 0 character.
1048  *
1049  * Return value: a pointer to a newly allocated UCS-4 string.
1050  *               This value must be freed with g_free(). If an
1051  *               error occurs, %NULL will be returned and
1052  *               @error set.
1053  **/
1054 gunichar *
1055 g_utf16_to_ucs4 (const gunichar2  *str,
1056                  gint              len,
1057                  gint             *items_read,
1058                  gint             *items_written,
1059                  GError          **error)
1060 {
1061   const gunichar2 *in;
1062   gchar *out;
1063   gchar *result = NULL;
1064   gint n_bytes;
1065   gunichar high_surrogate;
1066
1067   g_return_val_if_fail (str != 0, NULL);
1068
1069   n_bytes = 0;
1070   in = str;
1071   high_surrogate = 0;
1072   while ((len < 0 || in - str < len) && *in)
1073     {
1074       gunichar2 c = *in;
1075       gunichar wc;
1076
1077       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1078         {
1079           if (high_surrogate)
1080             {
1081               wc = SURROGATE_VALUE (high_surrogate, c);
1082               high_surrogate = 0;
1083             }
1084           else
1085             {
1086               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1087                            _("Invalid sequence in conversion input"));
1088               goto err_out;
1089             }
1090         }
1091       else
1092         {
1093           if (high_surrogate)
1094             {
1095               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1096                            _("Invalid sequence in conversion input"));
1097               goto err_out;
1098             }
1099
1100           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1101             {
1102               high_surrogate = c;
1103               goto next1;
1104             }
1105           else
1106             wc = c;
1107         }
1108
1109       /********** DIFFERENT for UTF8/UCS4 **********/
1110       n_bytes += sizeof (gunichar);
1111
1112     next1:
1113       in++;
1114     }
1115
1116   if (high_surrogate && !items_read)
1117     {
1118       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1119                    _("Partial character sequence at end of input"));
1120       goto err_out;
1121     }
1122
1123   /* At this point, everything is valid, and we just need to convert
1124    */
1125   /********** DIFFERENT for UTF8/UCS4 **********/
1126   result = g_malloc (n_bytes + 4);
1127
1128   high_surrogate = 0;
1129   out = result;
1130   in = str;
1131   while (out < result + n_bytes)
1132     {
1133       gunichar2 c = *in;
1134       gunichar wc;
1135
1136       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1137         {
1138           wc = SURROGATE_VALUE (high_surrogate, c);
1139           high_surrogate = 0;
1140         }
1141       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1142         {
1143           high_surrogate = c;
1144           goto next2;
1145         }
1146       else
1147         wc = c;
1148
1149       /********** DIFFERENT for UTF8/UCS4 **********/
1150       *(gunichar *)out = wc;
1151       out += sizeof (gunichar);
1152
1153     next2:
1154       in++;
1155     }
1156
1157   /********** DIFFERENT for UTF8/UCS4 **********/
1158   *(gunichar *)out = 0;
1159
1160   if (items_written)
1161     /********** DIFFERENT for UTF8/UCS4 **********/
1162     *items_written = (out - result) / sizeof (gunichar);
1163
1164  err_out:
1165   if (items_read)
1166     *items_read = in - str;
1167
1168   return (gunichar *)result;
1169 }
1170
1171 /**
1172  * g_utf8_to_utf16:
1173  * @str: a UTF-8 encoded string
1174  * @len: the maximum length of @str to use. If < 0, then
1175  *       the string is %NULL terminated.
1176
1177  * @items_read: location to store number of bytes read, or %NULL.
1178  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1179  *              returned in case @str contains a trailing partial
1180  *              character. If an error occurs then the index of the
1181  *              invalid input is stored here.
1182  * @items_written: location to store number of words written, or %NULL.
1183  *                 The value stored here does not include the trailing
1184  *                 0 word.
1185  * @error: location to store the error occuring, or %NULL to ignore
1186  *         errors. Any of the errors in #GConvertError other than
1187  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1188  *
1189  * Convert a string from UTF-8 to UTF-16. A 0 word will be
1190  * added to the result after the converted text.
1191  *
1192  * Return value: a pointer to a newly allocated UTF-16 string.
1193  *               This value must be freed with g_free(). If an
1194  *               error occurs, %NULL will be returned and
1195  *               @error set.
1196  **/
1197 gunichar2 *
1198 g_utf8_to_utf16 (const gchar *str,
1199                  gint         len,
1200                  gint        *items_read,
1201                  gint        *items_written,
1202                  GError     **error)
1203 {
1204   gunichar2 *result = NULL;
1205   gint n16;
1206   const gchar *in;
1207   gint i;
1208
1209   g_return_val_if_fail (str != NULL, NULL);
1210
1211   in = str;
1212   n16 = 0;
1213   while ((len < 0 || str + len - in > 0) && *in)
1214     {
1215       gunichar wc = g_utf8_get_char_extended (in, str + len - in);
1216       if (wc & 0x80000000)
1217         {
1218           if (wc == (gunichar)-2)
1219             {
1220               if (items_read)
1221                 break;
1222               else
1223                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1224                              _("Partial character sequence at end of input"));
1225             }
1226           else
1227             g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1228                          _("Invalid byte sequence in conversion input"));
1229
1230           goto err_out;
1231         }
1232
1233       if (wc < 0xd800)
1234         n16 += 1;
1235       else if (wc < 0xe000)
1236         {
1237           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1238                        _("Invalid sequence in conversion input"));
1239
1240           goto err_out;
1241         }
1242       else if (wc < 0x10000)
1243         n16 += 1;
1244       else if (wc < 0x110000)
1245         n16 += 2;
1246       else
1247         {
1248           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1249                        _("Character out of range for UTF-16"));
1250
1251           goto err_out;
1252         }
1253
1254       in = g_utf8_next_char (in);
1255     }
1256
1257   result = g_new (gunichar2, n16 + 1);
1258
1259   in = str;
1260   for (i = 0; i < n16;)
1261     {
1262       gunichar wc = g_utf8_get_char (in);
1263
1264       if (wc < 0x10000)
1265         {
1266           result[i++] = wc;
1267         }
1268       else
1269         {
1270           result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1271           result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1272         }
1273
1274       in = g_utf8_next_char (in);
1275     }
1276
1277   result[i] = 0;
1278
1279   if (items_written)
1280     *items_written = n16;
1281
1282  err_out:
1283   if (items_read)
1284     *items_read = in - str;
1285
1286   return result;
1287 }
1288
1289 /**
1290  * g_ucs4_to_utf16:
1291  * @str: a UCS-4 encoded string
1292  * @len: the maximum length of @str to use. If < 0, then
1293  *       the string is terminated with a zero character.
1294  * @items_read: location to store number of bytes read, or %NULL.
1295  *              If an error occurs then the index of the invalid input
1296  *              is stored here.
1297  * @items_written: location to store number of words written, or %NULL.
1298  *                 The value stored here does not include the trailing
1299  *                 0 word.
1300  * @error: location to store the error occuring, or %NULL to ignore
1301  *         errors. Any of the errors in #GConvertError other than
1302  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1303  *
1304  * Convert a string from UCS-4 to UTF-16. A 0 word will be
1305  * added to the result after the converted text.
1306  *
1307  * Return value: a pointer to a newly allocated UTF-16 string.
1308  *               This value must be freed with g_free(). If an
1309  *               error occurs, %NULL will be returned and
1310  *               @error set.
1311  **/
1312 gunichar2 *
1313 g_ucs4_to_utf16 (const gunichar  *str,
1314                  gint             len,
1315                  gint            *items_read,
1316                  gint            *items_written,
1317                  GError         **error)
1318 {
1319   gunichar2 *result = NULL;
1320   gint n16;
1321   gint i, j;
1322
1323   n16 = 0;
1324   i = 0;
1325   while ((len < 0 || i < len) && str[i])
1326     {
1327       gunichar wc = str[i];
1328
1329       if (wc < 0xd800)
1330         n16 += 1;
1331       else if (wc < 0xe000)
1332         {
1333           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1334                        _("Invalid sequence in conversion input"));
1335
1336           goto err_out;
1337         }
1338       else if (wc < 0x10000)
1339         n16 += 1;
1340       else if (wc < 0x110000)
1341         n16 += 2;
1342       else
1343         {
1344           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1345                        _("Character out of range for UTF-16"));
1346
1347           goto err_out;
1348         }
1349
1350       i++;
1351     }
1352
1353   result = g_new (gunichar2, n16 + 1);
1354
1355   for (i = 0, j = 0; j < n16; i++)
1356     {
1357       gunichar wc = str[i];
1358
1359       if (wc < 0x10000)
1360         {
1361           result[j++] = wc;
1362         }
1363       else
1364         {
1365           result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1366           result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1367         }
1368     }
1369   result[j] = 0;
1370
1371   if (items_written)
1372     *items_written = n16;
1373
1374  err_out:
1375   if (items_read)
1376     *items_read = i;
1377
1378   return result;
1379 }
1380
1381 /**
1382  * g_utf8_validate:
1383  * @str: a pointer to character data
1384  * @max_len: max bytes to validate, or -1 to go until nul
1385  * @end: return location for end of valid data
1386  *
1387  * Validates UTF-8 encoded text. @str is the text to validate;
1388  * if @str is nul-terminated, then @max_len can be -1, otherwise
1389  * @max_len should be the number of bytes to validate.
1390  * If @end is non-NULL, then the end of the valid range
1391  * will be stored there (i.e. the address of the first invalid byte
1392  * if some bytes were invalid, or the end of the text being validated
1393  * otherwise).
1394  *
1395  * Returns TRUE if all of @str was valid. Many GLib and GTK+
1396  * routines <emphasis>require</emphasis> valid UTF8 as input;
1397  * so data read from a file or the network should be checked
1398  * with g_utf8_validate() before doing anything else with it.
1399  *
1400  * Return value: TRUE if the text was valid UTF-8.
1401  **/
1402 gboolean
1403 g_utf8_validate (const gchar  *str,
1404                  gint          max_len,
1405                  const gchar **end)
1406 {
1407
1408   const gchar *p;
1409
1410   g_return_val_if_fail (str != NULL, FALSE);
1411
1412   if (end)
1413     *end = str;
1414
1415   p = str;
1416
1417   while ((max_len < 0 || (p - str) < max_len) && *p)
1418     {
1419       int i, mask = 0, len;
1420       gunichar result;
1421       unsigned char c = (unsigned char) *p;
1422
1423       UTF8_COMPUTE (c, mask, len);
1424
1425       if (len == -1)
1426         break;
1427
1428       /* check that the expected number of bytes exists in str */
1429       if (max_len >= 0 &&
1430           ((max_len - (p - str)) < len))
1431         break;
1432
1433       UTF8_GET (result, p, i, mask, len);
1434
1435       if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
1436         break;
1437
1438       if (result == (gunichar)-1)
1439         break;
1440
1441       if (!UNICODE_VALID (result))
1442         break;
1443
1444       p += len;
1445     }
1446
1447   if (end)
1448     *end = p;
1449
1450   /* See that we covered the entire length if a length was
1451    * passed in, or that we ended on a nul if not
1452    */
1453   if (max_len >= 0 &&
1454       p != (str + max_len))
1455     return FALSE;
1456   else if (max_len < 0 &&
1457            *p != '\0')
1458     return FALSE;
1459   else
1460     return TRUE;
1461 }
1462
1463 /**
1464  * g_unichar_validate:
1465  * @ch: a Unicode character
1466  *
1467  * Checks whether @ch is a valid Unicode character. Some possible
1468  * integer values of @ch will not be valid. 0 is considered a valid
1469  * character, though it's normally a string terminator.
1470  *
1471  * Return value: %TRUE if @ch is a valid Unicode character
1472  **/
1473 gboolean
1474 g_unichar_validate (gunichar ch)
1475 {
1476   return UNICODE_VALID (ch);
1477 }