glib/gutf8.c

   1 /* gutf8.c - Operations on UTF-8 strings.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stdlib.h>
  25 #ifdef HAVE_CODESET
  26 #include <langinfo.h>
  27 #endif
  28 #include <string.h>
  29
  30 #ifdef G_PLATFORM_WIN32
  31 #include <stdio.h>
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "gconvert.h"
  38 #include "ghash.h"
  39 #include "gstrfuncs.h"
  40 #include "gtestutils.h"
  41 #include "gtypes.h"
  42 #include "gthread.h"
  43 #include "glibintl.h"
  44
  45 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  46   if (Char < 128)                                                             \
  47     {                                                                         \
  48       Len = 1;                                                                \
  49       Mask = 0x7f;                                                            \
  50     }                                                                         \
  51   else if ((Char & 0xe0) == 0xc0)                                             \
  52     {                                                                         \
  53       Len = 2;                                                                \
  54       Mask = 0x1f;                                                            \
  55     }                                                                         \
  56   else if ((Char & 0xf0) == 0xe0)                                             \
  57     {                                                                         \
  58       Len = 3;                                                                \
  59       Mask = 0x0f;                                                            \
  60     }                                                                         \
  61   else if ((Char & 0xf8) == 0xf0)                                             \
  62     {                                                                         \
  63       Len = 4;                                                                \
  64       Mask = 0x07;                                                            \
  65     }                                                                         \
  66   else if ((Char & 0xfc) == 0xf8)                                             \
  67     {                                                                         \
  68       Len = 5;                                                                \
  69       Mask = 0x03;                                                            \
  70     }                                                                         \
  71   else if ((Char & 0xfe) == 0xfc)                                             \
  72     {                                                                         \
  73       Len = 6;                                                                \
  74       Mask = 0x01;                                                            \
  75     }                                                                         \
  76   else                                                                        \
  77     Len = -1;
  78
  79 #define UTF8_LENGTH(Char)              \
  80   ((Char) < 0x80 ? 1 :                 \
  81    ((Char) < 0x800 ? 2 :               \
  82     ((Char) < 0x10000 ? 3 :            \
  83      ((Char) < 0x200000 ? 4 :          \
  84       ((Char) < 0x4000000 ? 5 : 6)))))
  85
  86
  87 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  88   (Result) = (Chars)[0] & (Mask);                                             \
  89   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
  90     {                                                                         \
  91       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
  92         {                                                                     \
  93           (Result) = -1;                                                      \
  94           break;                                                              \
  95         }                                                                     \
  96       (Result) <<= 6;                                                         \
  97       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
  98     }
  99
 100 /*
 101  * Check whether a Unicode (5.2) char is in a valid range.
 102  *
 103  * The first check comes from the Unicode guarantee to never encode
 104  * a point above 0x0010ffff, since UTF-16 couldn't represent it.
 105  *
 106  * The second check covers surrogate pairs (category Cs).
 107  *
 108  * @param Char the character
 109  */
 110 #define UNICODE_VALID(Char)                   \
 111     ((Char) < 0x110000 &&                     \
 112      (((Char) & 0xFFFFF800) != 0xD800))
 113
 114
 115 static const gchar utf8_skip_data[256] = {
 116   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 117   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 118   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 119   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 120   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 121   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 122   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 123   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
 124 };
 125
 126 const gchar * const g_utf8_skip = utf8_skip_data;
 127
 128 /**
 129  * g_utf8_find_prev_char:
 130  * @str: pointer to the beginning of a UTF-8 encoded string
 131  * @p: pointer to some position within @str
 132  *
 133  * Given a position @p with a UTF-8 encoded string @str, find the start
 134  * of the previous UTF-8 character starting before @p. Returns %NULL if no
 135  * UTF-8 characters are present in @str before @p.
 136  *
 137  * @p does not have to be at the beginning of a UTF-8 character. No check
 138  * is made to see if the character found is actually valid other than
 139  * it starts with an appropriate byte.
 140  *
 141  * Return value: a pointer to the found character or %NULL.
 142  **/
 143 gchar *
 144 g_utf8_find_prev_char (const char *str,
 145                        const char *p)
 146 {
 147   for (--p; p >= str; --p)
 148     {
 149       if ((*p & 0xc0) != 0x80)
 150         return (gchar *)p;
 151     }
 152   return NULL;
 153 }
 154
 155 /**
 156  * g_utf8_find_next_char:
 157  * @p: a pointer to a position within a UTF-8 encoded string
 158  * @end: a pointer to the byte following the end of the string,
 159  * or %NULL to indicate that the string is nul-terminated.
 160  *
 161  * Finds the start of the next UTF-8 character in the string after @p.
 162  *
 163  * @p does not have to be at the beginning of a UTF-8 character. No check
 164  * is made to see if the character found is actually valid other than
 165  * it starts with an appropriate byte.
 166  *
 167  * Return value: a pointer to the found character or %NULL
 168  **/
 169 gchar *
 170 g_utf8_find_next_char (const gchar *p,
 171                        const gchar *end)
 172 {
 173   if (*p)
 174     {
 175       if (end)
 176         for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
 177           ;
 178       else
 179         for (++p; (*p & 0xc0) == 0x80; ++p)
 180           ;
 181     }
 182   return (p == end) ? NULL : (gchar *)p;
 183 }
 184
 185 /**
 186  * g_utf8_prev_char:
 187  * @p: a pointer to a position within a UTF-8 encoded string
 188  *
 189  * Finds the previous UTF-8 character in the string before @p.
 190  *
 191  * @p does not have to be at the beginning of a UTF-8 character. No check
 192  * is made to see if the character found is actually valid other than
 193  * it starts with an appropriate byte. If @p might be the first
 194  * character of the string, you must use g_utf8_find_prev_char() instead.
 195  *
 196  * Return value: a pointer to the found character.
 197  **/
 198 gchar *
 199 g_utf8_prev_char (const gchar *p)
 200 {
 201   while (TRUE)
 202     {
 203       p--;
 204       if ((*p & 0xc0) != 0x80)
 205         return (gchar *)p;
 206     }
 207 }
 208
 209 /**
 210  * g_utf8_strlen:
 211  * @p: pointer to the start of a UTF-8 encoded string
 212  * @max: the maximum number of bytes to examine. If @max
 213  *       is less than 0, then the string is assumed to be
 214  *       nul-terminated. If @max is 0, @p will not be examined and
 215  *       may be %NULL. If @max is greater than 0, up to @max
 216  *       bytes are examined
 217  *
 218  * Computes the length of the string in characters, not including
 219  * the terminating nul character. If the @max'th byte falls in the
 220  * middle of a character, the last (partial) character is not counted.
 221  *
 222  * Return value: the length of the string in characters
 223  **/
 224 glong
 225 g_utf8_strlen (const gchar *p,
 226                gssize       max)
 227 {
 228   glong len = 0;
 229   const gchar *start = p;
 230   g_return_val_if_fail (p != NULL || max == 0, 0);
 231
 232   if (max < 0)
 233     {
 234       while (*p)
 235         {
 236           p = g_utf8_next_char (p);
 237           ++len;
 238         }
 239     }
 240   else
 241     {
 242       if (max == 0 || !*p)
 243         return 0;
 244
 245       p = g_utf8_next_char (p);
 246
 247       while (p - start < max && *p)
 248         {
 249           ++len;
 250           p = g_utf8_next_char (p);
 251         }
 252
 253       /* only do the last len increment if we got a complete
 254        * char (don't count partial chars)
 255        */
 256       if (p - start <= max)
 257         ++len;
 258     }
 259
 260   return len;
 261 }
 262
 263 /**
 264  * g_utf8_substring:
 265  * @str: a UTF-8 encoded string
 266  * @start_pos: a character offset within @str
 267  * @end_pos: another character offset within @str
 268  *
 269  * Copies a substring out of a UTF-8 encoded string.
 270  * The substring will contain @end_pos - @start_pos
 271  * characters.
 272  *
 273  * Returns: a newly allocated copy of the requested
 274  *     substring. Free with g_free() when no longer needed.
 275  *
 276  * Since: 2.30
 277  */
 278 gchar *
 279 g_utf8_substring (const gchar *str,
 280                   glong        start_pos,
 281                   glong        end_pos)
 282 {
 283   gchar *start, *end, *out;
 284
 285   start = g_utf8_offset_to_pointer (str, start_pos);
 286   end = g_utf8_offset_to_pointer (start, end_pos - start_pos);
 287
 288   out = g_malloc (end - start + 1);
 289   memcpy (out, start, end - start);
 290   out[end - start] = 0;
 291
 292   return out;
 293 }
 294
 295 /**
 296  * g_utf8_get_char:
 297  * @p: a pointer to Unicode character encoded as UTF-8
 298  *
 299  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 300  * If @p does not point to a valid UTF-8 encoded character, results are
 301  * undefined. If you are not sure that the bytes are complete
 302  * valid Unicode characters, you should use g_utf8_get_char_validated()
 303  * instead.
 304  *
 305  * Return value: the resulting character
 306  **/
 307 gunichar
 308 g_utf8_get_char (const gchar *p)
 309 {
 310   int i, mask = 0, len;
 311   gunichar result;
 312   unsigned char c = (unsigned char) *p;
 313
 314   UTF8_COMPUTE (c, mask, len);
 315   if (len == -1)
 316     return (gunichar)-1;
 317   UTF8_GET (result, p, i, mask, len);
 318
 319   return result;
 320 }
 321
 322 /**
 323  * g_utf8_offset_to_pointer:
 324  * @str: a UTF-8 encoded string
 325  * @offset: a character offset within @str
 326  *
 327  * Converts from an integer character offset to a pointer to a position
 328  * within the string.
 329  *
 330  * Since 2.10, this function allows to pass a negative @offset to
 331  * step backwards. It is usually worth stepping backwards from the end
 332  * instead of forwards if @offset is in the last fourth of the string,
 333  * since moving forward is about 3 times faster than moving backward.
 334  *
 335  * <note><para>
 336  * This function doesn't abort when reaching the end of @str. Therefore
 337  * you should be sure that @offset is within string boundaries before
 338  * calling that function. Call g_utf8_strlen() when unsure.
 339  *
 340  * This limitation exists as this function is called frequently during
 341  * text rendering and therefore has to be as fast as possible.
 342  * </para></note>
 343  *
 344  * Return value: the resulting pointer
 345  **/
 346 gchar *
 347 g_utf8_offset_to_pointer  (const gchar *str,
 348                            glong        offset)
 349 {
 350   const gchar *s = str;
 351
 352   if (offset > 0)
 353     while (offset--)
 354       s = g_utf8_next_char (s);
 355   else
 356     {
 357       const char *s1;
 358
 359       /* This nice technique for fast backwards stepping
 360        * through a UTF-8 string was dubbed "stutter stepping"
 361        * by its inventor, Larry Ewing.
 362        */
 363       while (offset)
 364         {
 365           s1 = s;
 366           s += offset;
 367           while ((*s & 0xc0) == 0x80)
 368             s--;
 369
 370           offset += g_utf8_pointer_to_offset (s, s1);
 371         }
 372     }
 373
 374   return (gchar *)s;
 375 }
 376
 377 /**
 378  * g_utf8_pointer_to_offset:
 379  * @str: a UTF-8 encoded string
 380  * @pos: a pointer to a position within @str
 381  *
 382  * Converts from a pointer to position within a string to a integer
 383  * character offset.
 384  *
 385  * Since 2.10, this function allows @pos to be before @str, and returns
 386  * a negative offset in this case.
 387  *
 388  * Return value: the resulting character offset
 389  **/
 390 glong
 391 g_utf8_pointer_to_offset (const gchar *str,
 392                           const gchar *pos)
 393 {
 394   const gchar *s = str;
 395   glong offset = 0;
 396
 397   if (pos < str)
 398     offset = - g_utf8_pointer_to_offset (pos, str);
 399   else
 400     while (s < pos)
 401       {
 402         s = g_utf8_next_char (s);
 403         offset++;
 404       }
 405
 406   return offset;
 407 }
 408
 409
 410 /**
 411  * g_utf8_strncpy:
 412  * @dest: buffer to fill with characters from @src
 413  * @src: UTF-8 encoded string
 414  * @n: character count
 415  *
 416  * Like the standard C strncpy() function, but
 417  * copies a given number of characters instead of a given number of
 418  * bytes. The @src string must be valid UTF-8 encoded text.
 419  * (Use g_utf8_validate() on all text before trying to use UTF-8
 420  * utility functions with it.)
 421  *
 422  * Return value: @dest
 423  **/
 424 gchar *
 425 g_utf8_strncpy (gchar       *dest,
 426                 const gchar *src,
 427                 gsize        n)
 428 {
 429   const gchar *s = src;
 430   while (n && *s)
 431     {
 432       s = g_utf8_next_char(s);
 433       n--;
 434     }
 435   strncpy(dest, src, s - src);
 436   dest[s - src] = 0;
 437   return dest;
 438 }
 439
 440 /* unicode_strchr */
 441
 442 /**
 443  * g_unichar_to_utf8:
 444  * @c: a Unicode character code
 445  * @outbuf: output buffer, must have at least 6 bytes of space.
 446  *       If %NULL, the length will be computed and returned
 447  *       and nothing will be written to @outbuf.
 448  *
 449  * Converts a single character to UTF-8.
 450  *
 451  * Return value: number of bytes written
 452  **/
 453 int
 454 g_unichar_to_utf8 (gunichar c,
 455                    gchar   *outbuf)
 456 {
 457   /* If this gets modified, also update the copy in g_string_insert_unichar() */
 458   guint len = 0;
 459   int first;
 460   int i;
 461
 462   if (c < 0x80)
 463     {
 464       first = 0;
 465       len = 1;
 466     }
 467   else if (c < 0x800)
 468     {
 469       first = 0xc0;
 470       len = 2;
 471     }
 472   else if (c < 0x10000)
 473     {
 474       first = 0xe0;
 475       len = 3;
 476     }
 477    else if (c < 0x200000)
 478     {
 479       first = 0xf0;
 480       len = 4;
 481     }
 482   else if (c < 0x4000000)
 483     {
 484       first = 0xf8;
 485       len = 5;
 486     }
 487   else
 488     {
 489       first = 0xfc;
 490       len = 6;
 491     }
 492
 493   if (outbuf)
 494     {
 495       for (i = len - 1; i > 0; --i)
 496         {
 497           outbuf[i] = (c & 0x3f) | 0x80;
 498           c >>= 6;
 499         }
 500       outbuf[0] = c | first;
 501     }
 502
 503   return len;
 504 }
 505
 506 /**
 507  * g_utf8_strchr:
 508  * @p: a nul-terminated UTF-8 encoded string
 509  * @len: the maximum length of @p
 510  * @c: a Unicode character
 511  *
 512  * Finds the leftmost occurrence of the given Unicode character
 513  * in a UTF-8 encoded string, while limiting the search to @len bytes.
 514  * If @len is -1, allow unbounded search.
 515  *
 516  * Return value: %NULL if the string does not contain the character,
 517  *   otherwise, a pointer to the start of the leftmost occurrence of
 518  *   the character in the string.
 519  **/
 520 gchar *
 521 g_utf8_strchr (const char *p,
 522                gssize      len,
 523                gunichar    c)
 524 {
 525   gchar ch[10];
 526
 527   gint charlen = g_unichar_to_utf8 (c, ch);
 528   ch[charlen] = '\0';
 529
 530   return g_strstr_len (p, len, ch);
 531 }
 532
 533
 534 /**
 535  * g_utf8_strrchr:
 536  * @p: a nul-terminated UTF-8 encoded string
 537  * @len: the maximum length of @p
 538  * @c: a Unicode character
 539  *
 540  * Find the rightmost occurrence of the given Unicode character
 541  * in a UTF-8 encoded string, while limiting the search to @len bytes.
 542  * If @len is -1, allow unbounded search.
 543  *
 544  * Return value: %NULL if the string does not contain the character,
 545  *   otherwise, a pointer to the start of the rightmost occurrence of the
 546  *   character in the string.
 547  **/
 548 gchar *
 549 g_utf8_strrchr (const char *p,
 550                 gssize      len,
 551                 gunichar    c)
 552 {
 553   gchar ch[10];
 554
 555   gint charlen = g_unichar_to_utf8 (c, ch);
 556   ch[charlen] = '\0';
 557
 558   return g_strrstr_len (p, len, ch);
 559 }
 560
 561
 562 /* Like g_utf8_get_char, but take a maximum length
 563  * and return (gunichar)-2 on incomplete trailing character;
 564  * also check for malformed or overlong sequences
 565  * and return (gunichar)-1 in this case.
 566  */
 567 static inline gunichar
 568 g_utf8_get_char_extended (const  gchar *p,
 569                           gssize max_len)
 570 {
 571   guint i, len;
 572   gunichar min_code;
 573   gunichar wc = (guchar) *p;
 574
 575   if (wc < 0x80)
 576     {
 577       return wc;
 578     }
 579   else if (G_UNLIKELY (wc < 0xc0))
 580     {
 581       return (gunichar)-1;
 582     }
 583   else if (wc < 0xe0)
 584     {
 585       len = 2;
 586       wc &= 0x1f;
 587       min_code = 1 << 7;
 588     }
 589   else if (wc < 0xf0)
 590     {
 591       len = 3;
 592       wc &= 0x0f;
 593       min_code = 1 << 11;
 594     }
 595   else if (wc < 0xf8)
 596     {
 597       len = 4;
 598       wc &= 0x07;
 599       min_code = 1 << 16;
 600     }
 601   else if (wc < 0xfc)
 602     {
 603       len = 5;
 604       wc &= 0x03;
 605       min_code = 1 << 21;
 606     }
 607   else if (wc < 0xfe)
 608     {
 609       len = 6;
 610       wc &= 0x01;
 611       min_code = 1 << 26;
 612     }
 613   else
 614     {
 615       return (gunichar)-1;
 616     }
 617
 618   if (G_UNLIKELY (max_len >= 0 && len > max_len))
 619     {
 620       for (i = 1; i < max_len; i++)
 621         {
 622           if ((((guchar *)p)[i] & 0xc0) != 0x80)
 623             return (gunichar)-1;
 624         }
 625       return (gunichar)-2;
 626     }
 627
 628   for (i = 1; i < len; ++i)
 629     {
 630       gunichar ch = ((guchar *)p)[i];
 631
 632       if (G_UNLIKELY ((ch & 0xc0) != 0x80))
 633         {
 634           if (ch)
 635             return (gunichar)-1;
 636           else
 637             return (gunichar)-2;
 638         }
 639
 640       wc <<= 6;
 641       wc |= (ch & 0x3f);
 642     }
 643
 644   if (G_UNLIKELY (wc < min_code))
 645     return (gunichar)-1;
 646
 647   return wc;
 648 }
 649
 650 /**
 651  * g_utf8_get_char_validated:
 652  * @p: a pointer to Unicode character encoded as UTF-8
 653  * @max_len: the maximum number of bytes to read, or -1, for no maximum or
 654  *           if @p is nul-terminated
 655  *
 656  * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
 657  * This function checks for incomplete characters, for invalid characters
 658  * such as characters that are out of the range of Unicode, and for
 659  * overlong encodings of valid characters.
 660  *
 661  * Return value: the resulting character. If @p points to a partial
 662  *    sequence at the end of a string that could begin a valid
 663  *    character (or if @max_len is zero), returns (gunichar)-2;
 664  *    otherwise, if @p does not point to a valid UTF-8 encoded
 665  *    Unicode character, returns (gunichar)-1.
 666  **/
 667 gunichar
 668 g_utf8_get_char_validated (const  gchar *p,
 669                            gssize max_len)
 670 {
 671   gunichar result;
 672
 673   if (max_len == 0)
 674     return (gunichar)-2;
 675
 676   result = g_utf8_get_char_extended (p, max_len);
 677
 678   if (result & 0x80000000)
 679     return result;
 680   else if (!UNICODE_VALID (result))
 681     return (gunichar)-1;
 682   else
 683     return result;
 684 }
 685
 686 /**
 687  * g_utf8_to_ucs4_fast:
 688  * @str: a UTF-8 encoded string
 689  * @len: the maximum length of @str to use, in bytes. If @len < 0,
 690  *       then the string is nul-terminated.
 691  * @items_written: (allow-none): location to store the number of characters in the
 692  *                 result, or %NULL.
 693  *
 694  * Convert a string from UTF-8 to a 32-bit fixed width
 695  * representation as UCS-4, assuming valid UTF-8 input.
 696  * This function is roughly twice as fast as g_utf8_to_ucs4()
 697  * but does no error checking on the input. A trailing 0 character
 698  * will be added to the string after the converted text.
 699  *
 700  * Return value: a pointer to a newly allocated UCS-4 string.
 701  *               This value must be freed with g_free().
 702  **/
 703 gunichar *
 704 g_utf8_to_ucs4_fast (const gchar *str,
 705                      glong        len,
 706                      glong       *items_written)
 707 {
 708   gunichar *result;
 709   gint n_chars, i;
 710   const gchar *p;
 711
 712   g_return_val_if_fail (str != NULL, NULL);
 713
 714   p = str;
 715   n_chars = 0;
 716   if (len < 0)
 717     {
 718       while (*p)
 719         {
 720           p = g_utf8_next_char (p);
 721           ++n_chars;
 722         }
 723     }
 724   else
 725     {
 726       while (p < str + len && *p)
 727         {
 728           p = g_utf8_next_char (p);
 729           ++n_chars;
 730         }
 731     }
 732
 733   result = g_new (gunichar, n_chars + 1);
 734
 735   p = str;
 736   for (i=0; i < n_chars; i++)
 737     {
 738       gunichar wc = (guchar)*p++;
 739
 740       if (wc < 0x80)
 741         {
 742           result[i] = wc;
 743         }
 744       else
 745         {
 746           gunichar mask = 0x40;
 747
 748           if (G_UNLIKELY ((wc & mask) == 0))
 749             {
 750               /* It's an out-of-sequence 10xxxxxxx byte.
 751                * Rather than making an ugly hash of this and the next byte
 752                * and overrunning the buffer, it's more useful to treat it
 753                * with a replacement character */
 754               result[i] = 0xfffd;
 755               continue;
 756             }
 757
 758           do
 759             {
 760               wc <<= 6;
 761               wc |= (guchar)(*p++) & 0x3f;
 762               mask <<= 5;
 763             }
 764           while((wc & mask) != 0);
 765
 766           wc &= mask - 1;
 767
 768           result[i] = wc;
 769         }
 770     }
 771   result[i] = 0;
 772
 773   if (items_written)
 774     *items_written = i;
 775
 776   return result;
 777 }
 778
 779 /**
 780  * g_utf8_to_ucs4:
 781  * @str: a UTF-8 encoded string
 782  * @len: the maximum length of @str to use, in bytes. If @len < 0,
 783  *       then the string is nul-terminated.
 784  * @items_read: (allow-none): location to store number of bytes read, or %NULL.
 785  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 786  *              returned in case @str contains a trailing partial
 787  *              character. If an error occurs then the index of the
 788  *              invalid input is stored here.
 789  * @items_written: (allow-none): location to store number of characters written or %NULL.
 790  *                 The value here stored does not include the trailing 0
 791  *                 character.
 792  * @error: location to store the error occurring, or %NULL to ignore
 793  *         errors. Any of the errors in #GConvertError other than
 794  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 795  *
 796  * Convert a string from UTF-8 to a 32-bit fixed width
 797  * representation as UCS-4. A trailing 0 character will be added to the
 798  * string after the converted text.
 799  *
 800  * Return value: a pointer to a newly allocated UCS-4 string.
 801  *               This value must be freed with g_free(). If an
 802  *               error occurs, %NULL will be returned and
 803  *               @error set.
 804  **/
 805 gunichar *
 806 g_utf8_to_ucs4 (const gchar *str,
 807                 glong        len,
 808                 glong       *items_read,
 809                 glong       *items_written,
 810                 GError     **error)
 811 {
 812   gunichar *result = NULL;
 813   gint n_chars, i;
 814   const gchar *in;
 815
 816   in = str;
 817   n_chars = 0;
 818   while ((len < 0 || str + len - in > 0) && *in)
 819     {
 820       gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
 821       if (wc & 0x80000000)
 822         {
 823           if (wc == (gunichar)-2)
 824             {
 825               if (items_read)
 826                 break;
 827               else
 828                 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 829                                      _("Partial character sequence at end of input"));
 830             }
 831           else
 832             g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 833                                  _("Invalid byte sequence in conversion input"));
 834
 835           goto err_out;
 836         }
 837
 838       n_chars++;
 839
 840       in = g_utf8_next_char (in);
 841     }
 842
 843   result = g_new (gunichar, n_chars + 1);
 844
 845   in = str;
 846   for (i=0; i < n_chars; i++)
 847     {
 848       result[i] = g_utf8_get_char (in);
 849       in = g_utf8_next_char (in);
 850     }
 851   result[i] = 0;
 852
 853   if (items_written)
 854     *items_written = n_chars;
 855
 856  err_out:
 857   if (items_read)
 858     *items_read = in - str;
 859
 860   return result;
 861 }
 862
 863 /**
 864  * g_ucs4_to_utf8:
 865  * @str: a UCS-4 encoded string
 866  * @len: the maximum length (number of characters) of @str to use.
 867  *       If @len < 0, then the string is nul-terminated.
 868  * @items_read: (allow-none): location to store number of characters read, or %NULL.
 869  * @items_written: (allow-none): location to store number of bytes written or %NULL.
 870  *                 The value here stored does not include the trailing 0
 871  *                 byte.
 872  * @error: location to store the error occurring, or %NULL to ignore
 873  *         errors. Any of the errors in #GConvertError other than
 874  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 875  *
 876  * Convert a string from a 32-bit fixed width representation as UCS-4.
 877  * to UTF-8. The result will be terminated with a 0 byte.
 878  *
 879  * Return value: a pointer to a newly allocated UTF-8 string.
 880  *               This value must be freed with g_free(). If an
 881  *               error occurs, %NULL will be returned and
 882  *               @error set. In that case, @items_read will be
 883  *               set to the position of the first invalid input
 884  *               character.
 885  **/
 886 gchar *
 887 g_ucs4_to_utf8 (const gunichar *str,
 888                 glong           len,
 889                 glong          *items_read,
 890                 glong          *items_written,
 891                 GError        **error)
 892 {
 893   gint result_length;
 894   gchar *result = NULL;
 895   gchar *p;
 896   gint i;
 897
 898   result_length = 0;
 899   for (i = 0; len < 0 || i < len ; i++)
 900     {
 901       if (!str[i])
 902         break;
 903
 904       if (str[i] >= 0x80000000)
 905         {
 906           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 907                                _("Character out of range for UTF-8"));
 908           goto err_out;
 909         }
 910
 911       result_length += UTF8_LENGTH (str[i]);
 912     }
 913
 914   result = g_malloc (result_length + 1);
 915   p = result;
 916
 917   i = 0;
 918   while (p < result + result_length)
 919     p += g_unichar_to_utf8 (str[i++], p);
 920
 921   *p = '\0';
 922
 923   if (items_written)
 924     *items_written = p - result;
 925
 926  err_out:
 927   if (items_read)
 928     *items_read = i;
 929
 930   return result;
 931 }
 932
 933 #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
 934
 935 /**
 936  * g_utf16_to_utf8:
 937  * @str: a UTF-16 encoded string
 938  * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
 939  *       If @len < 0, then the string is nul-terminated.
 940  * @items_read: (allow-none): location to store number of words read, or %NULL.
 941  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 942  *              returned in case @str contains a trailing partial
 943  *              character. If an error occurs then the index of the
 944  *              invalid input is stored here.
 945  * @items_written: (allow-none): location to store number of bytes written, or %NULL.
 946  *                 The value stored here does not include the trailing
 947  *                 0 byte.
 948  * @error: location to store the error occurring, or %NULL to ignore
 949  *         errors. Any of the errors in #GConvertError other than
 950  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 951  *
 952  * Convert a string from UTF-16 to UTF-8. The result will be
 953  * terminated with a 0 byte.
 954  *
 955  * Note that the input is expected to be already in native endianness,
 956  * an initial byte-order-mark character is not handled specially.
 957  * g_convert() can be used to convert a byte buffer of UTF-16 data of
 958  * ambiguous endianess.
 959  *
 960  * Further note that this function does not validate the result
 961  * string; it may e.g. include embedded NUL characters. The only
 962  * validation done by this function is to ensure that the input can
 963  * be correctly interpreted as UTF-16, i.e. it doesn't contain
 964  * things unpaired surrogates.
 965  *
 966  * Return value: a pointer to a newly allocated UTF-8 string.
 967  *               This value must be freed with g_free(). If an
 968  *               error occurs, %NULL will be returned and
 969  *               @error set.
 970  **/
 971 gchar *
 972 g_utf16_to_utf8 (const gunichar2  *str,
 973                  glong             len,
 974                  glong            *items_read,
 975                  glong            *items_written,
 976                  GError          **error)
 977 {
 978   /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
 979    * are marked.
 980    */
 981   const gunichar2 *in;
 982   gchar *out;
 983   gchar *result = NULL;
 984   gint n_bytes;
 985   gunichar high_surrogate;
 986
 987   g_return_val_if_fail (str != NULL, NULL);
 988
 989   n_bytes = 0;
 990   in = str;
 991   high_surrogate = 0;
 992   while ((len < 0 || in - str < len) && *in)
 993     {
 994       gunichar2 c = *in;
 995       gunichar wc;
 996
 997       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
 998         {
 999           if (high_surrogate)
1000             {
1001               wc = SURROGATE_VALUE (high_surrogate, c);
1002               high_surrogate = 0;
1003             }
1004           else
1005             {
1006               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1007                                    _("Invalid sequence in conversion input"));
1008               goto err_out;
1009             }
1010         }
1011       else
1012         {
1013           if (high_surrogate)
1014             {
1015               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1016                                    _("Invalid sequence in conversion input"));
1017               goto err_out;
1018             }
1019
1020           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1021             {
1022               high_surrogate = c;
1023               goto next1;
1024             }
1025           else
1026             wc = c;
1027         }
1028
1029       /********** DIFFERENT for UTF8/UCS4 **********/
1030       n_bytes += UTF8_LENGTH (wc);
1031
1032     next1:
1033       in++;
1034     }
1035
1036   if (high_surrogate && !items_read)
1037     {
1038       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1039                            _("Partial character sequence at end of input"));
1040       goto err_out;
1041     }
1042
1043   /* At this point, everything is valid, and we just need to convert
1044    */
1045   /********** DIFFERENT for UTF8/UCS4 **********/
1046   result = g_malloc (n_bytes + 1);
1047
1048   high_surrogate = 0;
1049   out = result;
1050   in = str;
1051   while (out < result + n_bytes)
1052     {
1053       gunichar2 c = *in;
1054       gunichar wc;
1055
1056       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1057         {
1058           wc = SURROGATE_VALUE (high_surrogate, c);
1059           high_surrogate = 0;
1060         }
1061       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1062         {
1063           high_surrogate = c;
1064           goto next2;
1065         }
1066       else
1067         wc = c;
1068
1069       /********** DIFFERENT for UTF8/UCS4 **********/
1070       out += g_unichar_to_utf8 (wc, out);
1071
1072     next2:
1073       in++;
1074     }
1075
1076   /********** DIFFERENT for UTF8/UCS4 **********/
1077   *out = '\0';
1078
1079   if (items_written)
1080     /********** DIFFERENT for UTF8/UCS4 **********/
1081     *items_written = out - result;
1082
1083  err_out:
1084   if (items_read)
1085     *items_read = in - str;
1086
1087   return result;
1088 }
1089
1090 /**
1091  * g_utf16_to_ucs4:
1092  * @str: a UTF-16 encoded string
1093  * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1094  *       If @len < 0, then the string is nul-terminated.
1095  * @items_read: (allow-none): location to store number of words read, or %NULL.
1096  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1097  *              returned in case @str contains a trailing partial
1098  *              character. If an error occurs then the index of the
1099  *              invalid input is stored here.
1100  * @items_written: (allow-none): location to store number of characters written, or %NULL.
1101  *                 The value stored here does not include the trailing
1102  *                 0 character.
1103  * @error: location to store the error occurring, or %NULL to ignore
1104  *         errors. Any of the errors in #GConvertError other than
1105  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1106  *
1107  * Convert a string from UTF-16 to UCS-4. The result will be
1108  * nul-terminated.
1109  *
1110  * Return value: a pointer to a newly allocated UCS-4 string.
1111  *               This value must be freed with g_free(). If an
1112  *               error occurs, %NULL will be returned and
1113  *               @error set.
1114  **/
1115 gunichar *
1116 g_utf16_to_ucs4 (const gunichar2  *str,
1117                  glong             len,
1118                  glong            *items_read,
1119                  glong            *items_written,
1120                  GError          **error)
1121 {
1122   const gunichar2 *in;
1123   gchar *out;
1124   gchar *result = NULL;
1125   gint n_bytes;
1126   gunichar high_surrogate;
1127
1128   g_return_val_if_fail (str != NULL, NULL);
1129
1130   n_bytes = 0;
1131   in = str;
1132   high_surrogate = 0;
1133   while ((len < 0 || in - str < len) && *in)
1134     {
1135       gunichar2 c = *in;
1136
1137       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1138         {
1139           if (high_surrogate)
1140             {
1141               high_surrogate = 0;
1142             }
1143           else
1144             {
1145               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1146                                    _("Invalid sequence in conversion input"));
1147               goto err_out;
1148             }
1149         }
1150       else
1151         {
1152           if (high_surrogate)
1153             {
1154               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1155                                    _("Invalid sequence in conversion input"));
1156               goto err_out;
1157             }
1158
1159           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1160             {
1161               high_surrogate = c;
1162               goto next1;
1163             }
1164         }
1165
1166       /********** DIFFERENT for UTF8/UCS4 **********/
1167       n_bytes += sizeof (gunichar);
1168
1169     next1:
1170       in++;
1171     }
1172
1173   if (high_surrogate && !items_read)
1174     {
1175       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1176                            _("Partial character sequence at end of input"));
1177       goto err_out;
1178     }
1179
1180   /* At this point, everything is valid, and we just need to convert
1181    */
1182   /********** DIFFERENT for UTF8/UCS4 **********/
1183   result = g_malloc (n_bytes + 4);
1184
1185   high_surrogate = 0;
1186   out = result;
1187   in = str;
1188   while (out < result + n_bytes)
1189     {
1190       gunichar2 c = *in;
1191       gunichar wc;
1192
1193       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1194         {
1195           wc = SURROGATE_VALUE (high_surrogate, c);
1196           high_surrogate = 0;
1197         }
1198       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1199         {
1200           high_surrogate = c;
1201           goto next2;
1202         }
1203       else
1204         wc = c;
1205
1206       /********** DIFFERENT for UTF8/UCS4 **********/
1207       *(gunichar *)out = wc;
1208       out += sizeof (gunichar);
1209
1210     next2:
1211       in++;
1212     }
1213
1214   /********** DIFFERENT for UTF8/UCS4 **********/
1215   *(gunichar *)out = 0;
1216
1217   if (items_written)
1218     /********** DIFFERENT for UTF8/UCS4 **********/
1219     *items_written = (out - result) / sizeof (gunichar);
1220
1221  err_out:
1222   if (items_read)
1223     *items_read = in - str;
1224
1225   return (gunichar *)result;
1226 }
1227
1228 /**
1229  * g_utf8_to_utf16:
1230  * @str: a UTF-8 encoded string
1231  * @len: the maximum length (number of bytes) of @str to use.
1232  *       If @len < 0, then the string is nul-terminated.
1233  * @items_read: (allow-none): location to store number of bytes read, or %NULL.
1234  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1235  *              returned in case @str contains a trailing partial
1236  *              character. If an error occurs then the index of the
1237  *              invalid input is stored here.
1238  * @items_written: (allow-none): location to store number of <type>gunichar2</type> written,
1239  *                 or %NULL.
1240  *                 The value stored here does not include the trailing 0.
1241  * @error: location to store the error occurring, or %NULL to ignore
1242  *         errors. Any of the errors in #GConvertError other than
1243  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1244  *
1245  * Convert a string from UTF-8 to UTF-16. A 0 character will be
1246  * added to the result after the converted text.
1247  *
1248  * Return value: a pointer to a newly allocated UTF-16 string.
1249  *               This value must be freed with g_free(). If an
1250  *               error occurs, %NULL will be returned and
1251  *               @error set.
1252  **/
1253 gunichar2 *
1254 g_utf8_to_utf16 (const gchar *str,
1255                  glong        len,
1256                  glong       *items_read,
1257                  glong       *items_written,
1258                  GError     **error)
1259 {
1260   gunichar2 *result = NULL;
1261   gint n16;
1262   const gchar *in;
1263   gint i;
1264
1265   g_return_val_if_fail (str != NULL, NULL);
1266
1267   in = str;
1268   n16 = 0;
1269   while ((len < 0 || str + len - in > 0) && *in)
1270     {
1271       gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1272       if (wc & 0x80000000)
1273         {
1274           if (wc == (gunichar)-2)
1275             {
1276               if (items_read)
1277                 break;
1278               else
1279                 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1280                                      _("Partial character sequence at end of input"));
1281             }
1282           else
1283             g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1284                                  _("Invalid byte sequence in conversion input"));
1285
1286           goto err_out;
1287         }
1288
1289       if (wc < 0xd800)
1290         n16 += 1;
1291       else if (wc < 0xe000)
1292         {
1293           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1294                                _("Invalid sequence in conversion input"));
1295
1296           goto err_out;
1297         }
1298       else if (wc < 0x10000)
1299         n16 += 1;
1300       else if (wc < 0x110000)
1301         n16 += 2;
1302       else
1303         {
1304           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1305                                _("Character out of range for UTF-16"));
1306
1307           goto err_out;
1308         }
1309
1310       in = g_utf8_next_char (in);
1311     }
1312
1313   result = g_new (gunichar2, n16 + 1);
1314
1315   in = str;
1316   for (i = 0; i < n16;)
1317     {
1318       gunichar wc = g_utf8_get_char (in);
1319
1320       if (wc < 0x10000)
1321         {
1322           result[i++] = wc;
1323         }
1324       else
1325         {
1326           result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1327           result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1328         }
1329
1330       in = g_utf8_next_char (in);
1331     }
1332
1333   result[i] = 0;
1334
1335   if (items_written)
1336     *items_written = n16;
1337
1338  err_out:
1339   if (items_read)
1340     *items_read = in - str;
1341
1342   return result;
1343 }
1344
1345 /**
1346  * g_ucs4_to_utf16:
1347  * @str: a UCS-4 encoded string
1348  * @len: the maximum length (number of characters) of @str to use.
1349  *       If @len < 0, then the string is nul-terminated.
1350  * @items_read: (allow-none): location to store number of bytes read, or %NULL.
1351  *              If an error occurs then the index of the invalid input
1352  *              is stored here.
1353  * @items_written: (allow-none): location to store number of <type>gunichar2</type>
1354  *                 written, or %NULL. The value stored here does not
1355  *                 include the trailing 0.
1356  * @error: location to store the error occurring, or %NULL to ignore
1357  *         errors. Any of the errors in #GConvertError other than
1358  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1359  *
1360  * Convert a string from UCS-4 to UTF-16. A 0 character will be
1361  * added to the result after the converted text.
1362  *
1363  * Return value: a pointer to a newly allocated UTF-16 string.
1364  *               This value must be freed with g_free(). If an
1365  *               error occurs, %NULL will be returned and
1366  *               @error set.
1367  **/
1368 gunichar2 *
1369 g_ucs4_to_utf16 (const gunichar  *str,
1370                  glong            len,
1371                  glong           *items_read,
1372                  glong           *items_written,
1373                  GError         **error)
1374 {
1375   gunichar2 *result = NULL;
1376   gint n16;
1377   gint i, j;
1378
1379   n16 = 0;
1380   i = 0;
1381   while ((len < 0 || i < len) && str[i])
1382     {
1383       gunichar wc = str[i];
1384
1385       if (wc < 0xd800)
1386         n16 += 1;
1387       else if (wc < 0xe000)
1388         {
1389           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1390                                _("Invalid sequence in conversion input"));
1391
1392           goto err_out;
1393         }
1394       else if (wc < 0x10000)
1395         n16 += 1;
1396       else if (wc < 0x110000)
1397         n16 += 2;
1398       else
1399         {
1400           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1401                                _("Character out of range for UTF-16"));
1402
1403           goto err_out;
1404         }
1405
1406       i++;
1407     }
1408
1409   result = g_new (gunichar2, n16 + 1);
1410
1411   for (i = 0, j = 0; j < n16; i++)
1412     {
1413       gunichar wc = str[i];
1414
1415       if (wc < 0x10000)
1416         {
1417           result[j++] = wc;
1418         }
1419       else
1420         {
1421           result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1422           result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1423         }
1424     }
1425   result[j] = 0;
1426
1427   if (items_written)
1428     *items_written = n16;
1429
1430  err_out:
1431   if (items_read)
1432     *items_read = i;
1433
1434   return result;
1435 }
1436
/* Consume the continuation byte at p: jump to the enclosing `error'
 * label unless it has the form 10xxxxxx, otherwise shift its six
 * payload bits into val.  Expects `p', `val' and an `error' label
 * to exist at the point of use.
 */
#define CONTINUATION_CHAR                                  \
 G_STMT_START {                                            \
  const guchar cont_byte_ = *(const guchar *)p;            \
  if ((cont_byte_ & 0xc0) != 0x80) /* 10xxxxxx */          \
    goto error;                                            \
  val = (val << 6) | (cont_byte_ & 0x3f);                  \
 } G_STMT_END
1444
1445 static const gchar *
1446 fast_validate (const char *str)
1447
1448 {
1449   gunichar val = 0;
1450   gunichar min = 0;
1451   const gchar *p;
1452
1453   for (p = str; *p; p++)
1454     {
1455       if (*(guchar *)p < 128)
1456         /* done */;
1457       else
1458         {
1459           const gchar *last;
1460
1461           last = p;
1462           if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1463             {
1464               if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1465                 goto error;
1466               p++;
1467               if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1468                 goto error;
1469             }
1470           else
1471             {
1472               if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1473                 {
1474                   min = (1 << 11);
1475                   val = *(guchar *)p & 0x0f;
1476                   goto TWO_REMAINING;
1477                 }
1478               else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1479                 {
1480                   min = (1 << 16);
1481                   val = *(guchar *)p & 0x07;
1482                 }
1483               else
1484                 goto error;
1485
1486               p++;
1487               CONTINUATION_CHAR;
1488             TWO_REMAINING:
1489               p++;
1490               CONTINUATION_CHAR;
1491               p++;
1492               CONTINUATION_CHAR;
1493
1494               if (G_UNLIKELY (val < min))
1495                 goto error;
1496
1497               if (G_UNLIKELY (!UNICODE_VALID(val)))
1498                 goto error;
1499             }
1500
1501           continue;
1502
1503         error:
1504           return last;
1505         }
1506     }
1507
1508   return p;
1509 }
1510
1511 static const gchar *
1512 fast_validate_len (const char *str,
1513                    gssize      max_len)
1514
1515 {
1516   gunichar val = 0;
1517   gunichar min = 0;
1518   const gchar *p;
1519
1520   g_assert (max_len >= 0);
1521
1522   for (p = str; ((p - str) < max_len) && *p; p++)
1523     {
1524       if (*(guchar *)p < 128)
1525         /* done */;
1526       else
1527         {
1528           const gchar *last;
1529
1530           last = p;
1531           if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1532             {
1533               if (G_UNLIKELY (max_len - (p - str) < 2))
1534                 goto error;
1535
1536               if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1537                 goto error;
1538               p++;
1539               if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1540                 goto error;
1541             }
1542           else
1543             {
1544               if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1545                 {
1546                   if (G_UNLIKELY (max_len - (p - str) < 3))
1547                     goto error;
1548
1549                   min = (1 << 11);
1550                   val = *(guchar *)p & 0x0f;
1551                   goto TWO_REMAINING;
1552                 }
1553               else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1554                 {
1555                   if (G_UNLIKELY (max_len - (p - str) < 4))
1556                     goto error;
1557
1558                   min = (1 << 16);
1559                   val = *(guchar *)p & 0x07;
1560                 }
1561               else
1562                 goto error;
1563
1564               p++;
1565               CONTINUATION_CHAR;
1566             TWO_REMAINING:
1567               p++;
1568               CONTINUATION_CHAR;
1569               p++;
1570               CONTINUATION_CHAR;
1571
1572               if (G_UNLIKELY (val < min))
1573                 goto error;
1574               if (G_UNLIKELY (!UNICODE_VALID(val)))
1575                 goto error;
1576             }
1577
1578           continue;
1579
1580         error:
1581           return last;
1582         }
1583     }
1584
1585   return p;
1586 }
1587
1588 /**
1589  * g_utf8_validate:
1590  * @str: (array length=max_len) (element-type guint8): a pointer to character data
1591  * @max_len: max bytes to validate, or -1 to go until NUL
1592  * @end: (allow-none) (out) (transfer none): return location for end of valid data
1593  *
1594  * Validates UTF-8 encoded text. @str is the text to validate;
1595  * if @str is nul-terminated, then @max_len can be -1, otherwise
1596  * @max_len should be the number of bytes to validate.
1597  * If @end is non-%NULL, then the end of the valid range
1598  * will be stored there (i.e. the start of the first invalid
1599  * character if some bytes were invalid, or the end of the text
1600  * being validated otherwise).
1601  *
1602  * Note that g_utf8_validate() returns %FALSE if @max_len is
1603  * positive and any of the @max_len bytes are NUL.
1604  *
1605  * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1606  * routines <emphasis>require</emphasis> valid UTF-8 as input;
1607  * so data read from a file or the network should be checked
1608  * with g_utf8_validate() before doing anything else with it.
1609  *
1610  * Return value: %TRUE if the text was valid UTF-8
1611  **/
1612 gboolean
1613 g_utf8_validate (const char   *str,
1614                  gssize        max_len,
1615                  const gchar **end)
1616
1617 {
1618   const gchar *p;
1619
1620   if (max_len < 0)
1621     p = fast_validate (str);
1622   else
1623     p = fast_validate_len (str, max_len);
1624
1625   if (end)
1626     *end = p;
1627
1628   if ((max_len >= 0 && p != str + max_len) ||
1629       (max_len < 0 && *p != '\0'))
1630     return FALSE;
1631   else
1632     return TRUE;
1633 }
1634
1635 /**
1636  * g_unichar_validate:
1637  * @ch: a Unicode character
1638  *
1639  * Checks whether @ch is a valid Unicode character. Some possible
1640  * integer values of @ch will not be valid. 0 is considered a valid
1641  * character, though it's normally a string terminator.
1642  *
1643  * Return value: %TRUE if @ch is a valid Unicode character
1644  **/
1645 gboolean
1646 g_unichar_validate (gunichar ch)
1647 {
1648   return UNICODE_VALID (ch);
1649 }
1650
1651 /**
1652  * g_utf8_strreverse:
1653  * @str: a UTF-8 encoded string
1654  * @len: the maximum length of @str to use, in bytes. If @len < 0,
1655  *       then the string is nul-terminated.
1656  *
1657  * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1658  * (Use g_utf8_validate() on all text before trying to use UTF-8
1659  * utility functions with it.)
1660  *
1661  * This function is intended for programmatic uses of reversed strings.
1662  * It pays no attention to decomposed characters, combining marks, byte
1663  * order marks, directional indicators (LRM, LRO, etc) and similar
1664  * characters which might need special handling when reversing a string
1665  * for display purposes.
1666  *
1667  * Note that unlike g_strreverse(), this function returns
1668  * newly-allocated memory, which should be freed with g_free() when
1669  * no longer needed.
1670  *
1671  * Returns: a newly-allocated string which is the reverse of @str.
1672  *
1673  * Since: 2.2
1674  */
1675 gchar *
1676 g_utf8_strreverse (const gchar *str,
1677                    gssize       len)
1678 {
1679   gchar *r, *result;
1680   const gchar *p;
1681
1682   if (len < 0)
1683     len = strlen (str);
1684
1685   result = g_new (gchar, len + 1);
1686   r = result + len;
1687   p = str;
1688   while (r > result)
1689     {
1690       gchar *m, skip = g_utf8_skip[*(guchar*) p];
1691       r -= skip;
1692       for (m = r; skip; skip--)
1693         *m++ = *p++;
1694     }
1695   result[len] = 0;
1696
1697   return result;
1698 }
1699
1700
1701 gchar *
1702 _g_utf8_make_valid (const gchar *name)
1703 {
1704   GString *string;
1705   const gchar *remainder, *invalid;
1706   gint remaining_bytes, valid_bytes;
1707
1708   g_return_val_if_fail (name != NULL, NULL);
1709
1710   string = NULL;
1711   remainder = name;
1712   remaining_bytes = strlen (name);
1713
1714   while (remaining_bytes != 0)
1715     {
1716       if (g_utf8_validate (remainder, remaining_bytes, &invalid))
1717         break;
1718       valid_bytes = invalid - remainder;
1719
1720       if (string == NULL)
1721         string = g_string_sized_new (remaining_bytes);
1722
1723       g_string_append_len (string, remainder, valid_bytes);
1724       /* append U+FFFD REPLACEMENT CHARACTER */
1725       g_string_append (string, "\357\277\275");
1726
1727       remaining_bytes -= valid_bytes + 1;
1728       remainder = invalid + 1;
1729     }
1730
1731   if (string == NULL)
1732     return g_strdup (name);
1733
1734   g_string_append (string, remainder);
1735
1736   g_assert (g_utf8_validate (string->str, -1, NULL));
1737
1738   return g_string_free (string, FALSE);
1739 }