glib/gutf8.c

   1 /* gutf8.c - Operations on UTF-8 strings.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stdlib.h>
  25 #ifdef HAVE_CODESET
  26 #include <langinfo.h>
  27 #endif
  28 #include <string.h>
  29
  30 #ifdef G_PLATFORM_WIN32
  31 #include <stdio.h>
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "gconvert.h"
  38 #include "ghash.h"
  39 #include "gstrfuncs.h"
  40 #include "gtestutils.h"
  41 #include "gtypes.h"
  42 #include "gthread.h"
  43 #include "glibintl.h"
  44
  45 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  46   if (Char < 128)                                                             \
  47     {                                                                         \
  48       Len = 1;                                                                \
  49       Mask = 0x7f;                                                            \
  50     }                                                                         \
  51   else if ((Char & 0xe0) == 0xc0)                                             \
  52     {                                                                         \
  53       Len = 2;                                                                \
  54       Mask = 0x1f;                                                            \
  55     }                                                                         \
  56   else if ((Char & 0xf0) == 0xe0)                                             \
  57     {                                                                         \
  58       Len = 3;                                                                \
  59       Mask = 0x0f;                                                            \
  60     }                                                                         \
  61   else if ((Char & 0xf8) == 0xf0)                                             \
  62     {                                                                         \
  63       Len = 4;                                                                \
  64       Mask = 0x07;                                                            \
  65     }                                                                         \
  66   else if ((Char & 0xfc) == 0xf8)                                             \
  67     {                                                                         \
  68       Len = 5;                                                                \
  69       Mask = 0x03;                                                            \
  70     }                                                                         \
  71   else if ((Char & 0xfe) == 0xfc)                                             \
  72     {                                                                         \
  73       Len = 6;                                                                \
  74       Mask = 0x01;                                                            \
  75     }                                                                         \
  76   else                                                                        \
  77     Len = -1;
  78
  79 #define UTF8_LENGTH(Char)              \
  80   ((Char) < 0x80 ? 1 :                 \
  81    ((Char) < 0x800 ? 2 :               \
  82     ((Char) < 0x10000 ? 3 :            \
  83      ((Char) < 0x200000 ? 4 :          \
  84       ((Char) < 0x4000000 ? 5 : 6)))))
  85
  86
  87 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  88   (Result) = (Chars)[0] & (Mask);                                             \
  89   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
  90     {                                                                         \
  91       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
  92         {                                                                     \
  93           (Result) = -1;                                                      \
  94           break;                                                              \
  95         }                                                                     \
  96       (Result) <<= 6;                                                         \
  97       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
  98     }
  99
/*
 * Check whether a Unicode (5.2) code point is in a valid range.
 *
 * The first check comes from the Unicode guarantee to never encode
 * a point above 0x0010ffff, since UTF-16 couldn't represent it.
 *
 * The second check rejects the surrogate code points 0xD800..0xDFFF
 * (category Cs): clearing the low 11 bits of any value in that range
 * leaves exactly 0xD800.
 *
 * Char is evaluated twice, so it must not have side effects.
 *
 * @param Char the character
 */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800))
 113
 114
/* Skip table indexed by the value of a byte (0..255).
 *
 * Layout of the 256 entries:
 *   0x00..0xbf -> 1  (first six rows)
 *   0xc0..0xdf -> 2  (seventh row)
 *   0xe0..0xef -> 3, 0xf0..0xf7 -> 4, 0xf8..0xfb -> 5,
 *   0xfc..0xfd -> 6, 0xfe..0xff -> 1  (last row)
 *
 * NOTE(review): presumably each entry is the number of bytes to advance
 * from a character's first byte, as consumed by g_utf8_next_char(); that
 * macro is defined outside this file — confirm there.
 */
static const gchar utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Exported read-only alias of the table above. */
const gchar * const g_utf8_skip = utf8_skip_data;
 127
 128 /**
 129  * g_utf8_find_prev_char:
 130  * @str: pointer to the beginning of a UTF-8 encoded string
 131  * @p: pointer to some position within @str
 132  *
 133  * Given a position @p with a UTF-8 encoded string @str, find the start
 134  * of the previous UTF-8 character starting before @p. Returns %NULL if no
 135  * UTF-8 characters are present in @str before @p.
 136  *
 137  * @p does not have to be at the beginning of a UTF-8 character. No check
 138  * is made to see if the character found is actually valid other than
 139  * it starts with an appropriate byte.
 140  *
 141  * Return value: a pointer to the found character or %NULL.
 142  **/
 143 gchar *
 144 g_utf8_find_prev_char (const char *str,
 145                        const char *p)
 146 {
 147   for (--p; p >= str; --p)
 148     {
 149       if ((*p & 0xc0) != 0x80)
 150         return (gchar *)p;
 151     }
 152   return NULL;
 153 }
 154
 155 /**
 156  * g_utf8_find_next_char:
 157  * @p: a pointer to a position within a UTF-8 encoded string
 158  * @end: a pointer to the byte following the end of the string,
 159  * or %NULL to indicate that the string is nul-terminated.
 160  *
 161  * Finds the start of the next UTF-8 character in the string after @p.
 162  *
 163  * @p does not have to be at the beginning of a UTF-8 character. No check
 164  * is made to see if the character found is actually valid other than
 165  * it starts with an appropriate byte.
 166  *
 167  * Return value: a pointer to the found character or %NULL
 168  **/
 169 gchar *
 170 g_utf8_find_next_char (const gchar *p,
 171                        const gchar *end)
 172 {
 173   if (*p)
 174     {
 175       if (end)
 176         for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
 177           ;
 178       else
 179         for (++p; (*p & 0xc0) == 0x80; ++p)
 180           ;
 181     }
 182   return (p == end) ? NULL : (gchar *)p;
 183 }
 184
 185 /**
 186  * g_utf8_prev_char:
 187  * @p: a pointer to a position within a UTF-8 encoded string
 188  *
 189  * Finds the previous UTF-8 character in the string before @p.
 190  *
 191  * @p does not have to be at the beginning of a UTF-8 character. No check
 192  * is made to see if the character found is actually valid other than
 193  * it starts with an appropriate byte. If @p might be the first
 194  * character of the string, you must use g_utf8_find_prev_char() instead.
 195  *
 196  * Return value: a pointer to the found character.
 197  **/
 198 gchar *
 199 g_utf8_prev_char (const gchar *p)
 200 {
 201   while (TRUE)
 202     {
 203       p--;
 204       if ((*p & 0xc0) != 0x80)
 205         return (gchar *)p;
 206     }
 207 }
 208
 209 /**
 210  * g_utf8_strlen:
 211  * @p: pointer to the start of a UTF-8 encoded string
 212  * @max: the maximum number of bytes to examine. If @max
 213  *       is less than 0, then the string is assumed to be
 214  *       nul-terminated. If @max is 0, @p will not be examined and
 215  *       may be %NULL. If @max is greater than 0, up to @max
 216  *       bytes are examined
 217  *
 218  * Computes the length of the string in characters, not including
 219  * the terminating nul character. If the @max'th byte falls in the
 220  * middle of a character, the last (partial) character is not counted.
 221  *
 222  * Return value: the length of the string in characters
 223  **/
 224 glong
 225 g_utf8_strlen (const gchar *p,
 226                gssize       max)
 227 {
 228   glong len = 0;
 229   const gchar *start = p;
 230   g_return_val_if_fail (p != NULL || max == 0, 0);
 231
 232   if (max < 0)
 233     {
 234       while (*p)
 235         {
 236           p = g_utf8_next_char (p);
 237           ++len;
 238         }
 239     }
 240   else
 241     {
 242       if (max == 0 || !*p)
 243         return 0;
 244
 245       p = g_utf8_next_char (p);
 246
 247       while (p - start < max && *p)
 248         {
 249           ++len;
 250           p = g_utf8_next_char (p);
 251         }
 252
 253       /* only do the last len increment if we got a complete
 254        * char (don't count partial chars)
 255        */
 256       if (p - start <= max)
 257         ++len;
 258     }
 259
 260   return len;
 261 }
 262
 263 /**
 264  * g_utf8_substring:
 265  * @str: a UTF-8 encoded string
 266  * @start_pos: a character offset within @str
 267  * @end_pos: another character offset within @str
 268  *
 269  * Copies a substring out of a UTF-8 encoded string.
 270  * The substring will contain @end_pos - @start_pos
 271  * characters.
 272  *
 273  * Returns: a newly allocated copy of the requested
 274  *     substring. Free with g_free() when no longer needed.
 275  *
 276  * Since: 2.30
 277  */
 278 gchar *
 279 g_utf8_substring (const gchar *str,
 280                   glong        start_pos,
 281                   glong        end_pos)
 282 {
 283   gchar *start, *end, *out;
 284
 285   start = g_utf8_offset_to_pointer (str, start_pos);
 286   end = g_utf8_offset_to_pointer (start, end_pos - start_pos);
 287
 288   out = g_malloc (end - start + 1);
 289   memcpy (out, start, end - start);
 290   out[end - start] = 0;
 291
 292   return out;
 293 }
 294
 295 /**
 296  * g_utf8_get_char:
 297  * @p: a pointer to Unicode character encoded as UTF-8
 298  *
 299  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 300  * If @p does not point to a valid UTF-8 encoded character, results are
 301  * undefined. If you are not sure that the bytes are complete
 302  * valid Unicode characters, you should use g_utf8_get_char_validated()
 303  * instead.
 304  *
 305  * Return value: the resulting character
 306  **/
 307 gunichar
 308 g_utf8_get_char (const gchar *p)
 309 {
 310   int i, mask = 0, len;
 311   gunichar result;
 312   unsigned char c = (unsigned char) *p;
 313
 314   UTF8_COMPUTE (c, mask, len);
 315   if (len == -1)
 316     return (gunichar)-1;
 317   UTF8_GET (result, p, i, mask, len);
 318
 319   return result;
 320 }
 321
 322 /**
 323  * g_utf8_offset_to_pointer:
 324  * @str: a UTF-8 encoded string
 325  * @offset: a character offset within @str
 326  *
 327  * Converts from an integer character offset to a pointer to a position
 328  * within the string.
 329  *
 330  * Since 2.10, this function allows to pass a negative @offset to
 331  * step backwards. It is usually worth stepping backwards from the end
 332  * instead of forwards if @offset is in the last fourth of the string,
 333  * since moving forward is about 3 times faster than moving backward.
 334  *
 335  * <note><para>
 336  * This function doesn't abort when reaching the end of @str. Therefore
 337  * you should be sure that @offset is within string boundaries before
 338  * calling that function. Call g_utf8_strlen() when unsure.
 339  *
 340  * This limitation exists as this function is called frequently during
 341  * text rendering and therefore has to be as fast as possible.
 342  * </para></note>
 343  *
 344  * Return value: the resulting pointer
 345  **/
 346 gchar *
 347 g_utf8_offset_to_pointer  (const gchar *str,
 348                            glong        offset)
 349 {
 350   const gchar *s = str;
 351
 352   if (offset > 0)
 353     while (offset--)
 354       s = g_utf8_next_char (s);
 355   else
 356     {
 357       const char *s1;
 358
 359       /* This nice technique for fast backwards stepping
 360        * through a UTF-8 string was dubbed "stutter stepping"
 361        * by its inventor, Larry Ewing.
 362        */
 363       while (offset)
 364         {
 365           s1 = s;
 366           s += offset;
 367           while ((*s & 0xc0) == 0x80)
 368             s--;
 369
 370           offset += g_utf8_pointer_to_offset (s, s1);
 371         }
 372     }
 373
 374   return (gchar *)s;
 375 }
 376
 377 /**
 378  * g_utf8_pointer_to_offset:
 379  * @str: a UTF-8 encoded string
 380  * @pos: a pointer to a position within @str
 381  *
 382  * Converts from a pointer to position within a string to a integer
 383  * character offset.
 384  *
 385  * Since 2.10, this function allows @pos to be before @str, and returns
 386  * a negative offset in this case.
 387  *
 388  * Return value: the resulting character offset
 389  **/
 390 glong
 391 g_utf8_pointer_to_offset (const gchar *str,
 392                           const gchar *pos)
 393 {
 394   const gchar *s = str;
 395   glong offset = 0;
 396
 397   if (pos < str)
 398     offset = - g_utf8_pointer_to_offset (pos, str);
 399   else
 400     while (s < pos)
 401       {
 402         s = g_utf8_next_char (s);
 403         offset++;
 404       }
 405
 406   return offset;
 407 }
 408
 409
 410 /**
 411  * g_utf8_strncpy:
 412  * @dest: buffer to fill with characters from @src
 413  * @src: UTF-8 encoded string
 414  * @n: character count
 415  *
 416  * Like the standard C strncpy() function, but
 417  * copies a given number of characters instead of a given number of
 418  * bytes. The @src string must be valid UTF-8 encoded text.
 419  * (Use g_utf8_validate() on all text before trying to use UTF-8
 420  * utility functions with it.)
 421  *
 422  * Return value: @dest
 423  **/
 424 gchar *
 425 g_utf8_strncpy (gchar       *dest,
 426                 const gchar *src,
 427                 gsize        n)
 428 {
 429   const gchar *s = src;
 430   while (n && *s)
 431     {
 432       s = g_utf8_next_char(s);
 433       n--;
 434     }
 435   strncpy(dest, src, s - src);
 436   dest[s - src] = 0;
 437   return dest;
 438 }
 439
 440 /* unicode_strchr */
 441
 442 /**
 443  * g_unichar_to_utf8:
 444  * @c: a Unicode character code
 445  * @outbuf: output buffer, must have at least 6 bytes of space.
 446  *       If %NULL, the length will be computed and returned
 447  *       and nothing will be written to @outbuf.
 448  *
 449  * Converts a single character to UTF-8.
 450  *
 451  * Return value: number of bytes written
 452  **/
 453 int
 454 g_unichar_to_utf8 (gunichar c,
 455                    gchar   *outbuf)
 456 {
 457   /* If this gets modified, also update the copy in g_string_insert_unichar() */
 458   guint len = 0;
 459   int first;
 460   int i;
 461
 462   if (c < 0x80)
 463     {
 464       first = 0;
 465       len = 1;
 466     }
 467   else if (c < 0x800)
 468     {
 469       first = 0xc0;
 470       len = 2;
 471     }
 472   else if (c < 0x10000)
 473     {
 474       first = 0xe0;
 475       len = 3;
 476     }
 477    else if (c < 0x200000)
 478     {
 479       first = 0xf0;
 480       len = 4;
 481     }
 482   else if (c < 0x4000000)
 483     {
 484       first = 0xf8;
 485       len = 5;
 486     }
 487   else
 488     {
 489       first = 0xfc;
 490       len = 6;
 491     }
 492
 493   if (outbuf)
 494     {
 495       for (i = len - 1; i > 0; --i)
 496         {
 497           outbuf[i] = (c & 0x3f) | 0x80;
 498           c >>= 6;
 499         }
 500       outbuf[0] = c | first;
 501     }
 502
 503   return len;
 504 }
 505
 506 /**
 507  * g_utf8_strchr:
 508  * @p: a nul-terminated UTF-8 encoded string
 509  * @len: the maximum length of @p
 510  * @c: a Unicode character
 511  *
 512  * Finds the leftmost occurrence of the given Unicode character
 513  * in a UTF-8 encoded string, while limiting the search to @len bytes.
 514  * If @len is -1, allow unbounded search.
 515  *
 516  * Return value: %NULL if the string does not contain the character,
 517  *   otherwise, a pointer to the start of the leftmost occurrence of
 518  *   the character in the string.
 519  **/
 520 gchar *
 521 g_utf8_strchr (const char *p,
 522                gssize      len,
 523                gunichar    c)
 524 {
 525   gchar ch[10];
 526
 527   gint charlen = g_unichar_to_utf8 (c, ch);
 528   ch[charlen] = '\0';
 529
 530   return g_strstr_len (p, len, ch);
 531 }
 532
 533
 534 /**
 535  * g_utf8_strrchr:
 536  * @p: a nul-terminated UTF-8 encoded string
 537  * @len: the maximum length of @p
 538  * @c: a Unicode character
 539  *
 540  * Find the rightmost occurrence of the given Unicode character
 541  * in a UTF-8 encoded string, while limiting the search to @len bytes.
 542  * If @len is -1, allow unbounded search.
 543  *
 544  * Return value: %NULL if the string does not contain the character,
 545  *   otherwise, a pointer to the start of the rightmost occurrence of the
 546  *   character in the string.
 547  **/
 548 gchar *
 549 g_utf8_strrchr (const char *p,
 550                 gssize      len,
 551                 gunichar    c)
 552 {
 553   gchar ch[10];
 554
 555   gint charlen = g_unichar_to_utf8 (c, ch);
 556   ch[charlen] = '\0';
 557
 558   return g_strrstr_len (p, len, ch);
 559 }
 560
 561
 562 /* Like g_utf8_get_char, but take a maximum length
 563  * and return (gunichar)-2 on incomplete trailing character;
 564  * also check for malformed or overlong sequences
 565  * and return (gunichar)-1 in this case.
 566  */
 567 static inline gunichar
 568 g_utf8_get_char_extended (const  gchar *p,
 569                           gssize max_len)
 570 {
 571   guint i, len;
 572   gunichar min_code;
 573   gunichar wc = (guchar) *p;
 574
 575   if (wc < 0x80)
 576     {
 577       return wc;
 578     }
 579   else if (G_UNLIKELY (wc < 0xc0))
 580     {
 581       return (gunichar)-1;
 582     }
 583   else if (wc < 0xe0)
 584     {
 585       len = 2;
 586       wc &= 0x1f;
 587       min_code = 1 << 7;
 588     }
 589   else if (wc < 0xf0)
 590     {
 591       len = 3;
 592       wc &= 0x0f;
 593       min_code = 1 << 11;
 594     }
 595   else if (wc < 0xf8)
 596     {
 597       len = 4;
 598       wc &= 0x07;
 599       min_code = 1 << 16;
 600     }
 601   else if (wc < 0xfc)
 602     {
 603       len = 5;
 604       wc &= 0x03;
 605       min_code = 1 << 21;
 606     }
 607   else if (wc < 0xfe)
 608     {
 609       len = 6;
 610       wc &= 0x01;
 611       min_code = 1 << 26;
 612     }
 613   else
 614     {
 615       return (gunichar)-1;
 616     }
 617
 618   if (G_UNLIKELY (max_len >= 0 && len > max_len))
 619     {
 620       for (i = 1; i < max_len; i++)
 621         {
 622           if ((((guchar *)p)[i] & 0xc0) != 0x80)
 623             return (gunichar)-1;
 624         }
 625       return (gunichar)-2;
 626     }
 627
 628   for (i = 1; i < len; ++i)
 629     {
 630       gunichar ch = ((guchar *)p)[i];
 631
 632       if (G_UNLIKELY ((ch & 0xc0) != 0x80))
 633         {
 634           if (ch)
 635             return (gunichar)-1;
 636           else
 637             return (gunichar)-2;
 638         }
 639
 640       wc <<= 6;
 641       wc |= (ch & 0x3f);
 642     }
 643
 644   if (G_UNLIKELY (wc < min_code))
 645     return (gunichar)-1;
 646
 647   return wc;
 648 }
 649
 650 /**
 651  * g_utf8_get_char_validated:
 652  * @p: a pointer to Unicode character encoded as UTF-8
 653  * @max_len: the maximum number of bytes to read, or -1, for no maximum or
 654  *           if @p is nul-terminated
 655  *
 656  * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
 657  * This function checks for incomplete characters, for invalid characters
 658  * such as characters that are out of the range of Unicode, and for
 659  * overlong encodings of valid characters.
 660  *
 661  * Return value: the resulting character. If @p points to a partial
 662  *    sequence at the end of a string that could begin a valid
 663  *    character (or if @max_len is zero), returns (gunichar)-2;
 664  *    otherwise, if @p does not point to a valid UTF-8 encoded
 665  *    Unicode character, returns (gunichar)-1.
 666  **/
 667 gunichar
 668 g_utf8_get_char_validated (const  gchar *p,
 669                            gssize max_len)
 670 {
 671   gunichar result;
 672
 673   if (max_len == 0)
 674     return (gunichar)-2;
 675
 676   result = g_utf8_get_char_extended (p, max_len);
 677
 678   if (result & 0x80000000)
 679     return result;
 680   else if (!UNICODE_VALID (result))
 681     return (gunichar)-1;
 682   else
 683     return result;
 684 }
 685
 686 /**
 687  * g_utf8_to_ucs4_fast:
 688  * @str: a UTF-8 encoded string
 689  * @len: the maximum length of @str to use, in bytes. If @len < 0,
 690  *       then the string is nul-terminated.
 691  * @items_written: (allow-none): location to store the number of characters in the
 692  *                 result, or %NULL.
 693  *
 694  * Convert a string from UTF-8 to a 32-bit fixed width
 695  * representation as UCS-4, assuming valid UTF-8 input.
 696  * This function is roughly twice as fast as g_utf8_to_ucs4()
 697  * but does no error checking on the input. A trailing 0 character
 698  * will be added to the string after the converted text.
 699  *
 700  * Return value: a pointer to a newly allocated UCS-4 string.
 701  *               This value must be freed with g_free().
 702  **/
 703 gunichar *
 704 g_utf8_to_ucs4_fast (const gchar *str,
 705                      glong        len,
 706                      glong       *items_written)
 707 {
 708   gunichar *result;
 709   gint n_chars, i;
 710   const gchar *p;
 711
 712   g_return_val_if_fail (str != NULL, NULL);
 713
 714   p = str;
 715   n_chars = 0;
 716   if (len < 0)
 717     {
 718       while (*p)
 719         {
 720           p = g_utf8_next_char (p);
 721           ++n_chars;
 722         }
 723     }
 724   else
 725     {
 726       while (p < str + len && *p)
 727         {
 728           p = g_utf8_next_char (p);
 729           ++n_chars;
 730         }
 731     }
 732
 733   result = g_new (gunichar, n_chars + 1);
 734
 735   p = str;
 736   for (i=0; i < n_chars; i++)
 737     {
 738       gunichar wc = (guchar)*p++;
 739
 740       if (wc < 0x80)
 741         {
 742           result[i] = wc;
 743         }
 744       else
 745         {
 746           gunichar mask = 0x40;
 747
 748           if (G_UNLIKELY ((wc & mask) == 0))
 749             {
 750               /* It's an out-of-sequence 10xxxxxxx byte.
 751                * Rather than making an ugly hash of this and the next byte
 752                * and overrunning the buffer, it's more useful to treat it
 753                * with a replacement character */
 754               result[i] = 0xfffd;
 755               continue;
 756             }
 757
 758           do
 759             {
 760               wc <<= 6;
 761               wc |= (guchar)(*p++) & 0x3f;
 762               mask <<= 5;
 763             }
 764           while((wc & mask) != 0);
 765
 766           wc &= mask - 1;
 767
 768           result[i] = wc;
 769         }
 770     }
 771   result[i] = 0;
 772
 773   if (items_written)
 774     *items_written = i;
 775
 776   return result;
 777 }
 778
 779 static gpointer
 780 try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error)
 781 {
 782     gpointer ptr = g_try_malloc_n (n_blocks, n_block_bytes);
 783     if (ptr == NULL)
 784       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY,
 785                            _("Failed to allocate memory"));
 786     return ptr;
 787 }
 788
 789 /**
 790  * g_utf8_to_ucs4:
 791  * @str: a UTF-8 encoded string
 792  * @len: the maximum length of @str to use, in bytes. If @len < 0,
 793  *       then the string is nul-terminated.
 794  * @items_read: (allow-none): location to store number of bytes read, or %NULL.
 795  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 796  *              returned in case @str contains a trailing partial
 797  *              character. If an error occurs then the index of the
 798  *              invalid input is stored here.
 799  * @items_written: (allow-none): location to store number of characters written or %NULL.
 800  *                 The value here stored does not include the trailing 0
 801  *                 character.
 802  * @error: location to store the error occurring, or %NULL to ignore
 803  *         errors. Any of the errors in #GConvertError other than
 804  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 805  *
 806  * Convert a string from UTF-8 to a 32-bit fixed width
 807  * representation as UCS-4. A trailing 0 character will be added to the
 808  * string after the converted text.
 809  *
 810  * Return value: a pointer to a newly allocated UCS-4 string.
 811  *               This value must be freed with g_free(). If an
 812  *               error occurs, %NULL will be returned and
 813  *               @error set.
 814  **/
 815 gunichar *
 816 g_utf8_to_ucs4 (const gchar *str,
 817                 glong        len,
 818                 glong       *items_read,
 819                 glong       *items_written,
 820                 GError     **error)
 821 {
 822   gunichar *result = NULL;
 823   gint n_chars, i;
 824   const gchar *in;
 825
 826   in = str;
 827   n_chars = 0;
 828   while ((len < 0 || str + len - in > 0) && *in)
 829     {
 830       gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
 831       if (wc & 0x80000000)
 832         {
 833           if (wc == (gunichar)-2)
 834             {
 835               if (items_read)
 836                 break;
 837               else
 838                 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 839                                      _("Partial character sequence at end of input"));
 840             }
 841           else
 842             g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 843                                  _("Invalid byte sequence in conversion input"));
 844
 845           goto err_out;
 846         }
 847
 848       n_chars++;
 849
 850       in = g_utf8_next_char (in);
 851     }
 852
 853   result = try_malloc_n (n_chars + 1, sizeof (gunichar), error);
 854   if (result == NULL)
 855       goto err_out;
 856
 857   in = str;
 858   for (i=0; i < n_chars; i++)
 859     {
 860       result[i] = g_utf8_get_char (in);
 861       in = g_utf8_next_char (in);
 862     }
 863   result[i] = 0;
 864
 865   if (items_written)
 866     *items_written = n_chars;
 867
 868  err_out:
 869   if (items_read)
 870     *items_read = in - str;
 871
 872   return result;
 873 }
 874
 875 /**
 876  * g_ucs4_to_utf8:
 877  * @str: a UCS-4 encoded string
 878  * @len: the maximum length (number of characters) of @str to use.
 879  *       If @len < 0, then the string is nul-terminated.
 880  * @items_read: (allow-none): location to store number of characters read, or %NULL.
 881  * @items_written: (allow-none): location to store number of bytes written or %NULL.
 882  *                 The value here stored does not include the trailing 0
 883  *                 byte.
 884  * @error: location to store the error occurring, or %NULL to ignore
 885  *         errors. Any of the errors in #GConvertError other than
 886  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 887  *
 * Convert a string from a 32-bit fixed width representation as UCS-4
 * to UTF-8. The result will be terminated with a 0 byte.
 890  *
 891  * Return value: a pointer to a newly allocated UTF-8 string.
 892  *               This value must be freed with g_free(). If an
 893  *               error occurs, %NULL will be returned and
 894  *               @error set. In that case, @items_read will be
 895  *               set to the position of the first invalid input
 896  *               character.
 897  **/
 898 gchar *
 899 g_ucs4_to_utf8 (const gunichar *str,
 900                 glong           len,
 901                 glong          *items_read,
 902                 glong          *items_written,
 903                 GError        **error)
 904 {
 905   gint result_length;
 906   gchar *result = NULL;
 907   gchar *p;
 908   gint i;
 909
 910   result_length = 0;
 911   for (i = 0; len < 0 || i < len ; i++)
 912     {
 913       if (!str[i])
 914         break;
 915
 916       if (str[i] >= 0x80000000)
 917         {
 918           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 919                                _("Character out of range for UTF-8"));
 920           goto err_out;
 921         }
 922
 923       result_length += UTF8_LENGTH (str[i]);
 924     }
 925
 926   result = try_malloc_n (result_length + 1, 1, error);
 927   if (result == NULL)
 928       goto err_out;
 929
 930   p = result;
 931
 932   i = 0;
 933   while (p < result + result_length)
 934     p += g_unichar_to_utf8 (str[i++], p);
 935
 936   *p = '\0';
 937
 938   if (items_written)
 939     *items_written = p - result;
 940
 941  err_out:
 942   if (items_read)
 943     *items_read = i;
 944
 945   return result;
 946 }
 947
 948 #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
 949
 950 /**
 951  * g_utf16_to_utf8:
 952  * @str: a UTF-16 encoded string
 953  * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
 954  *       If @len < 0, then the string is nul-terminated.
 955  * @items_read: (allow-none): location to store number of words read, or %NULL.
 956  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 957  *              returned in case @str contains a trailing partial
 958  *              character. If an error occurs then the index of the
 959  *              invalid input is stored here.
 960  * @items_written: (allow-none): location to store number of bytes written, or %NULL.
 961  *                 The value stored here does not include the trailing
 962  *                 0 byte.
 963  * @error: location to store the error occurring, or %NULL to ignore
 964  *         errors. Any of the errors in #GConvertError other than
 965  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 966  *
 967  * Convert a string from UTF-16 to UTF-8. The result will be
 968  * terminated with a 0 byte.
 969  *
 970  * Note that the input is expected to be already in native endianness,
 971  * an initial byte-order-mark character is not handled specially.
 972  * g_convert() can be used to convert a byte buffer of UTF-16 data of
 973  * ambiguous endianess.
 974  *
 975  * Further note that this function does not validate the result
 976  * string; it may e.g. include embedded NUL characters. The only
 977  * validation done by this function is to ensure that the input can
 978  * be correctly interpreted as UTF-16, i.e. it doesn't contain
 979  * things unpaired surrogates.
 980  *
 981  * Return value: a pointer to a newly allocated UTF-8 string.
 982  *               This value must be freed with g_free(). If an
 983  *               error occurs, %NULL will be returned and
 984  *               @error set.
 985  **/
 986 gchar *
 987 g_utf16_to_utf8 (const gunichar2  *str,
 988                  glong             len,
 989                  glong            *items_read,
 990                  glong            *items_written,
 991                  GError          **error)
 992 {
 993   /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
 994    * are marked.
 995    */
 996   const gunichar2 *in;
 997   gchar *out;
 998   gchar *result = NULL;
 999   gint n_bytes;
1000   gunichar high_surrogate;
1001
1002   g_return_val_if_fail (str != NULL, NULL);
1003
1004   n_bytes = 0;
1005   in = str;
1006   high_surrogate = 0;
1007   while ((len < 0 || in - str < len) && *in)
1008     {
1009       gunichar2 c = *in;
1010       gunichar wc;
1011
1012       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1013         {
1014           if (high_surrogate)
1015             {
1016               wc = SURROGATE_VALUE (high_surrogate, c);
1017               high_surrogate = 0;
1018             }
1019           else
1020             {
1021               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1022                                    _("Invalid sequence in conversion input"));
1023               goto err_out;
1024             }
1025         }
1026       else
1027         {
1028           if (high_surrogate)
1029             {
1030               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1031                                    _("Invalid sequence in conversion input"));
1032               goto err_out;
1033             }
1034
1035           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1036             {
1037               high_surrogate = c;
1038               goto next1;
1039             }
1040           else
1041             wc = c;
1042         }
1043
1044       /********** DIFFERENT for UTF8/UCS4 **********/
1045       n_bytes += UTF8_LENGTH (wc);
1046
1047     next1:
1048       in++;
1049     }
1050
1051   if (high_surrogate && !items_read)
1052     {
1053       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1054                            _("Partial character sequence at end of input"));
1055       goto err_out;
1056     }
1057
1058   /* At this point, everything is valid, and we just need to convert
1059    */
1060   /********** DIFFERENT for UTF8/UCS4 **********/
1061   result = try_malloc_n (n_bytes + 1, 1, error);
1062   if (result == NULL)
1063       goto err_out;
1064
1065   high_surrogate = 0;
1066   out = result;
1067   in = str;
1068   while (out < result + n_bytes)
1069     {
1070       gunichar2 c = *in;
1071       gunichar wc;
1072
1073       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1074         {
1075           wc = SURROGATE_VALUE (high_surrogate, c);
1076           high_surrogate = 0;
1077         }
1078       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1079         {
1080           high_surrogate = c;
1081           goto next2;
1082         }
1083       else
1084         wc = c;
1085
1086       /********** DIFFERENT for UTF8/UCS4 **********/
1087       out += g_unichar_to_utf8 (wc, out);
1088
1089     next2:
1090       in++;
1091     }
1092
1093   /********** DIFFERENT for UTF8/UCS4 **********/
1094   *out = '\0';
1095
1096   if (items_written)
1097     /********** DIFFERENT for UTF8/UCS4 **********/
1098     *items_written = out - result;
1099
1100  err_out:
1101   if (items_read)
1102     *items_read = in - str;
1103
1104   return result;
1105 }
1106
1107 /**
1108  * g_utf16_to_ucs4:
1109  * @str: a UTF-16 encoded string
1110  * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1111  *       If @len < 0, then the string is nul-terminated.
1112  * @items_read: (allow-none): location to store number of words read, or %NULL.
1113  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1114  *              returned in case @str contains a trailing partial
1115  *              character. If an error occurs then the index of the
1116  *              invalid input is stored here.
1117  * @items_written: (allow-none): location to store number of characters written, or %NULL.
1118  *                 The value stored here does not include the trailing
1119  *                 0 character.
1120  * @error: location to store the error occurring, or %NULL to ignore
1121  *         errors. Any of the errors in #GConvertError other than
1122  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1123  *
1124  * Convert a string from UTF-16 to UCS-4. The result will be
1125  * nul-terminated.
1126  *
1127  * Return value: a pointer to a newly allocated UCS-4 string.
1128  *               This value must be freed with g_free(). If an
1129  *               error occurs, %NULL will be returned and
1130  *               @error set.
1131  **/
1132 gunichar *
1133 g_utf16_to_ucs4 (const gunichar2  *str,
1134                  glong             len,
1135                  glong            *items_read,
1136                  glong            *items_written,
1137                  GError          **error)
1138 {
1139   const gunichar2 *in;
1140   gchar *out;
1141   gchar *result = NULL;
1142   gint n_bytes;
1143   gunichar high_surrogate;
1144
1145   g_return_val_if_fail (str != NULL, NULL);
1146
1147   n_bytes = 0;
1148   in = str;
1149   high_surrogate = 0;
1150   while ((len < 0 || in - str < len) && *in)
1151     {
1152       gunichar2 c = *in;
1153
1154       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1155         {
1156           if (high_surrogate)
1157             {
1158               high_surrogate = 0;
1159             }
1160           else
1161             {
1162               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1163                                    _("Invalid sequence in conversion input"));
1164               goto err_out;
1165             }
1166         }
1167       else
1168         {
1169           if (high_surrogate)
1170             {
1171               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1172                                    _("Invalid sequence in conversion input"));
1173               goto err_out;
1174             }
1175
1176           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1177             {
1178               high_surrogate = c;
1179               goto next1;
1180             }
1181         }
1182
1183       /********** DIFFERENT for UTF8/UCS4 **********/
1184       n_bytes += sizeof (gunichar);
1185
1186     next1:
1187       in++;
1188     }
1189
1190   if (high_surrogate && !items_read)
1191     {
1192       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1193                            _("Partial character sequence at end of input"));
1194       goto err_out;
1195     }
1196
1197   /* At this point, everything is valid, and we just need to convert
1198    */
1199   /********** DIFFERENT for UTF8/UCS4 **********/
1200   result = try_malloc_n (n_bytes + 4, 1, error);
1201   if (result == NULL)
1202       goto err_out;
1203
1204   high_surrogate = 0;
1205   out = result;
1206   in = str;
1207   while (out < result + n_bytes)
1208     {
1209       gunichar2 c = *in;
1210       gunichar wc;
1211
1212       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1213         {
1214           wc = SURROGATE_VALUE (high_surrogate, c);
1215           high_surrogate = 0;
1216         }
1217       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1218         {
1219           high_surrogate = c;
1220           goto next2;
1221         }
1222       else
1223         wc = c;
1224
1225       /********** DIFFERENT for UTF8/UCS4 **********/
1226       *(gunichar *)out = wc;
1227       out += sizeof (gunichar);
1228
1229     next2:
1230       in++;
1231     }
1232
1233   /********** DIFFERENT for UTF8/UCS4 **********/
1234   *(gunichar *)out = 0;
1235
1236   if (items_written)
1237     /********** DIFFERENT for UTF8/UCS4 **********/
1238     *items_written = (out - result) / sizeof (gunichar);
1239
1240  err_out:
1241   if (items_read)
1242     *items_read = in - str;
1243
1244   return (gunichar *)result;
1245 }
1246
1247 /**
1248  * g_utf8_to_utf16:
1249  * @str: a UTF-8 encoded string
1250  * @len: the maximum length (number of bytes) of @str to use.
1251  *       If @len < 0, then the string is nul-terminated.
1252  * @items_read: (allow-none): location to store number of bytes read, or %NULL.
1253  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1254  *              returned in case @str contains a trailing partial
1255  *              character. If an error occurs then the index of the
1256  *              invalid input is stored here.
1257  * @items_written: (allow-none): location to store number of <type>gunichar2</type> written,
1258  *                 or %NULL.
1259  *                 The value stored here does not include the trailing 0.
1260  * @error: location to store the error occurring, or %NULL to ignore
1261  *         errors. Any of the errors in #GConvertError other than
1262  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1263  *
1264  * Convert a string from UTF-8 to UTF-16. A 0 character will be
1265  * added to the result after the converted text.
1266  *
1267  * Return value: a pointer to a newly allocated UTF-16 string.
1268  *               This value must be freed with g_free(). If an
1269  *               error occurs, %NULL will be returned and
1270  *               @error set.
1271  **/
1272 gunichar2 *
1273 g_utf8_to_utf16 (const gchar *str,
1274                  glong        len,
1275                  glong       *items_read,
1276                  glong       *items_written,
1277                  GError     **error)
1278 {
1279   gunichar2 *result = NULL;
1280   gint n16;
1281   const gchar *in;
1282   gint i;
1283
1284   g_return_val_if_fail (str != NULL, NULL);
1285
1286   in = str;
1287   n16 = 0;
1288   while ((len < 0 || str + len - in > 0) && *in)
1289     {
1290       gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1291       if (wc & 0x80000000)
1292         {
1293           if (wc == (gunichar)-2)
1294             {
1295               if (items_read)
1296                 break;
1297               else
1298                 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1299                                      _("Partial character sequence at end of input"));
1300             }
1301           else
1302             g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1303                                  _("Invalid byte sequence in conversion input"));
1304
1305           goto err_out;
1306         }
1307
1308       if (wc < 0xd800)
1309         n16 += 1;
1310       else if (wc < 0xe000)
1311         {
1312           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1313                                _("Invalid sequence in conversion input"));
1314
1315           goto err_out;
1316         }
1317       else if (wc < 0x10000)
1318         n16 += 1;
1319       else if (wc < 0x110000)
1320         n16 += 2;
1321       else
1322         {
1323           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1324                                _("Character out of range for UTF-16"));
1325
1326           goto err_out;
1327         }
1328
1329       in = g_utf8_next_char (in);
1330     }
1331
1332   result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);
1333   if (result == NULL)
1334       goto err_out;
1335
1336   in = str;
1337   for (i = 0; i < n16;)
1338     {
1339       gunichar wc = g_utf8_get_char (in);
1340
1341       if (wc < 0x10000)
1342         {
1343           result[i++] = wc;
1344         }
1345       else
1346         {
1347           result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1348           result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1349         }
1350
1351       in = g_utf8_next_char (in);
1352     }
1353
1354   result[i] = 0;
1355
1356   if (items_written)
1357     *items_written = n16;
1358
1359  err_out:
1360   if (items_read)
1361     *items_read = in - str;
1362
1363   return result;
1364 }
1365
1366 /**
1367  * g_ucs4_to_utf16:
1368  * @str: a UCS-4 encoded string
1369  * @len: the maximum length (number of characters) of @str to use.
1370  *       If @len < 0, then the string is nul-terminated.
1371  * @items_read: (allow-none): location to store number of bytes read, or %NULL.
1372  *              If an error occurs then the index of the invalid input
1373  *              is stored here.
1374  * @items_written: (allow-none): location to store number of <type>gunichar2</type>
1375  *                 written, or %NULL. The value stored here does not
1376  *                 include the trailing 0.
1377  * @error: location to store the error occurring, or %NULL to ignore
1378  *         errors. Any of the errors in #GConvertError other than
1379  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1380  *
1381  * Convert a string from UCS-4 to UTF-16. A 0 character will be
1382  * added to the result after the converted text.
1383  *
1384  * Return value: a pointer to a newly allocated UTF-16 string.
1385  *               This value must be freed with g_free(). If an
1386  *               error occurs, %NULL will be returned and
1387  *               @error set.
1388  **/
1389 gunichar2 *
1390 g_ucs4_to_utf16 (const gunichar  *str,
1391                  glong            len,
1392                  glong           *items_read,
1393                  glong           *items_written,
1394                  GError         **error)
1395 {
1396   gunichar2 *result = NULL;
1397   gint n16;
1398   gint i, j;
1399
1400   n16 = 0;
1401   i = 0;
1402   while ((len < 0 || i < len) && str[i])
1403     {
1404       gunichar wc = str[i];
1405
1406       if (wc < 0xd800)
1407         n16 += 1;
1408       else if (wc < 0xe000)
1409         {
1410           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1411                                _("Invalid sequence in conversion input"));
1412
1413           goto err_out;
1414         }
1415       else if (wc < 0x10000)
1416         n16 += 1;
1417       else if (wc < 0x110000)
1418         n16 += 2;
1419       else
1420         {
1421           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1422                                _("Character out of range for UTF-16"));
1423
1424           goto err_out;
1425         }
1426
1427       i++;
1428     }
1429
1430   result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);
1431   if (result == NULL)
1432       goto err_out;
1433
1434   for (i = 0, j = 0; j < n16; i++)
1435     {
1436       gunichar wc = str[i];
1437
1438       if (wc < 0x10000)
1439         {
1440           result[j++] = wc;
1441         }
1442       else
1443         {
1444           result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1445           result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1446         }
1447     }
1448   result[j] = 0;
1449
1450   if (items_written)
1451     *items_written = n16;
1452
1453  err_out:
1454   if (items_read)
1455     *items_read = i;
1456
1457   return result;
1458 }
1459
/* One step of multi-byte UTF-8 validation. The macro is not
 * self-contained: the code using it must provide a byte pointer named
 * `p`, an accumulator named `val` and a label named `error`.
 * If *p is not a continuation byte (10xxxxxx) control jumps to `error`;
 * otherwise the byte's low six bits are shifted into `val`.
 */
#define CONTINUATION_CHAR                           \
 G_STMT_START {                                     \
  if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
    goto error;                                     \
  val <<= 6;                                        \
  val |= (*(guchar *)p) & 0x3f;                     \
 } G_STMT_END
1467
1468 static const gchar *
1469 fast_validate (const char *str)
1470
1471 {
1472   gunichar val = 0;
1473   gunichar min = 0;
1474   const gchar *p;
1475
1476   for (p = str; *p; p++)
1477     {
1478       if (*(guchar *)p < 128)
1479         /* done */;
1480       else
1481         {
1482           const gchar *last;
1483
1484           last = p;
1485           if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1486             {
1487               if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1488                 goto error;
1489               p++;
1490               if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1491                 goto error;
1492             }
1493           else
1494             {
1495               if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1496                 {
1497                   min = (1 << 11);
1498                   val = *(guchar *)p & 0x0f;
1499                   goto TWO_REMAINING;
1500                 }
1501               else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1502                 {
1503                   min = (1 << 16);
1504                   val = *(guchar *)p & 0x07;
1505                 }
1506               else
1507                 goto error;
1508
1509               p++;
1510               CONTINUATION_CHAR;
1511             TWO_REMAINING:
1512               p++;
1513               CONTINUATION_CHAR;
1514               p++;
1515               CONTINUATION_CHAR;
1516
1517               if (G_UNLIKELY (val < min))
1518                 goto error;
1519
1520               if (G_UNLIKELY (!UNICODE_VALID(val)))
1521                 goto error;
1522             }
1523
1524           continue;
1525
1526         error:
1527           return last;
1528         }
1529     }
1530
1531   return p;
1532 }
1533
1534 static const gchar *
1535 fast_validate_len (const char *str,
1536                    gssize      max_len)
1537
1538 {
1539   gunichar val = 0;
1540   gunichar min = 0;
1541   const gchar *p;
1542
1543   g_assert (max_len >= 0);
1544
1545   for (p = str; ((p - str) < max_len) && *p; p++)
1546     {
1547       if (*(guchar *)p < 128)
1548         /* done */;
1549       else
1550         {
1551           const gchar *last;
1552
1553           last = p;
1554           if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1555             {
1556               if (G_UNLIKELY (max_len - (p - str) < 2))
1557                 goto error;
1558
1559               if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1560                 goto error;
1561               p++;
1562               if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1563                 goto error;
1564             }
1565           else
1566             {
1567               if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1568                 {
1569                   if (G_UNLIKELY (max_len - (p - str) < 3))
1570                     goto error;
1571
1572                   min = (1 << 11);
1573                   val = *(guchar *)p & 0x0f;
1574                   goto TWO_REMAINING;
1575                 }
1576               else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1577                 {
1578                   if (G_UNLIKELY (max_len - (p - str) < 4))
1579                     goto error;
1580
1581                   min = (1 << 16);
1582                   val = *(guchar *)p & 0x07;
1583                 }
1584               else
1585                 goto error;
1586
1587               p++;
1588               CONTINUATION_CHAR;
1589             TWO_REMAINING:
1590               p++;
1591               CONTINUATION_CHAR;
1592               p++;
1593               CONTINUATION_CHAR;
1594
1595               if (G_UNLIKELY (val < min))
1596                 goto error;
1597               if (G_UNLIKELY (!UNICODE_VALID(val)))
1598                 goto error;
1599             }
1600
1601           continue;
1602
1603         error:
1604           return last;
1605         }
1606     }
1607
1608   return p;
1609 }
1610
1611 /**
1612  * g_utf8_validate:
1613  * @str: (array length=max_len) (element-type guint8): a pointer to character data
1614  * @max_len: max bytes to validate, or -1 to go until NUL
1615  * @end: (allow-none) (out) (transfer none): return location for end of valid data
1616  *
1617  * Validates UTF-8 encoded text. @str is the text to validate;
1618  * if @str is nul-terminated, then @max_len can be -1, otherwise
1619  * @max_len should be the number of bytes to validate.
1620  * If @end is non-%NULL, then the end of the valid range
1621  * will be stored there (i.e. the start of the first invalid
1622  * character if some bytes were invalid, or the end of the text
1623  * being validated otherwise).
1624  *
1625  * Note that g_utf8_validate() returns %FALSE if @max_len is
1626  * positive and any of the @max_len bytes are NUL.
1627  *
1628  * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1629  * routines <emphasis>require</emphasis> valid UTF-8 as input;
1630  * so data read from a file or the network should be checked
1631  * with g_utf8_validate() before doing anything else with it.
1632  *
1633  * Return value: %TRUE if the text was valid UTF-8
1634  **/
1635 gboolean
1636 g_utf8_validate (const char   *str,
1637                  gssize        max_len,
1638                  const gchar **end)
1639
1640 {
1641   const gchar *p;
1642
1643   if (max_len < 0)
1644     p = fast_validate (str);
1645   else
1646     p = fast_validate_len (str, max_len);
1647
1648   if (end)
1649     *end = p;
1650
1651   if ((max_len >= 0 && p != str + max_len) ||
1652       (max_len < 0 && *p != '\0'))
1653     return FALSE;
1654   else
1655     return TRUE;
1656 }
1657
1658 /**
1659  * g_unichar_validate:
1660  * @ch: a Unicode character
1661  *
1662  * Checks whether @ch is a valid Unicode character. Some possible
1663  * integer values of @ch will not be valid. 0 is considered a valid
1664  * character, though it's normally a string terminator.
1665  *
1666  * Return value: %TRUE if @ch is a valid Unicode character
1667  **/
1668 gboolean
1669 g_unichar_validate (gunichar ch)
1670 {
1671   return UNICODE_VALID (ch);
1672 }
1673
1674 /**
1675  * g_utf8_strreverse:
1676  * @str: a UTF-8 encoded string
1677  * @len: the maximum length of @str to use, in bytes. If @len < 0,
1678  *       then the string is nul-terminated.
1679  *
1680  * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1681  * (Use g_utf8_validate() on all text before trying to use UTF-8
1682  * utility functions with it.)
1683  *
1684  * This function is intended for programmatic uses of reversed strings.
1685  * It pays no attention to decomposed characters, combining marks, byte
1686  * order marks, directional indicators (LRM, LRO, etc) and similar
1687  * characters which might need special handling when reversing a string
1688  * for display purposes.
1689  *
1690  * Note that unlike g_strreverse(), this function returns
1691  * newly-allocated memory, which should be freed with g_free() when
1692  * no longer needed.
1693  *
1694  * Returns: a newly-allocated string which is the reverse of @str.
1695  *
1696  * Since: 2.2
1697  */
1698 gchar *
1699 g_utf8_strreverse (const gchar *str,
1700                    gssize       len)
1701 {
1702   gchar *r, *result;
1703   const gchar *p;
1704
1705   if (len < 0)
1706     len = strlen (str);
1707
1708   result = g_new (gchar, len + 1);
1709   r = result + len;
1710   p = str;
1711   while (r > result)
1712     {
1713       gchar *m, skip = g_utf8_skip[*(guchar*) p];
1714       r -= skip;
1715       for (m = r; skip; skip--)
1716         *m++ = *p++;
1717     }
1718   result[len] = 0;
1719
1720   return result;
1721 }
1722
1723
1724 gchar *
1725 _g_utf8_make_valid (const gchar *name)
1726 {
1727   GString *string;
1728   const gchar *remainder, *invalid;
1729   gint remaining_bytes, valid_bytes;
1730
1731   g_return_val_if_fail (name != NULL, NULL);
1732
1733   string = NULL;
1734   remainder = name;
1735   remaining_bytes = strlen (name);
1736
1737   while (remaining_bytes != 0)
1738     {
1739       if (g_utf8_validate (remainder, remaining_bytes, &invalid))
1740         break;
1741       valid_bytes = invalid - remainder;
1742
1743       if (string == NULL)
1744         string = g_string_sized_new (remaining_bytes);
1745
1746       g_string_append_len (string, remainder, valid_bytes);
1747       /* append U+FFFD REPLACEMENT CHARACTER */
1748       g_string_append (string, "\357\277\275");
1749
1750       remaining_bytes -= valid_bytes + 1;
1751       remainder = invalid + 1;
1752     }
1753
1754   if (string == NULL)
1755     return g_strdup (name);
1756
1757   g_string_append (string, remainder);
1758
1759   g_assert (g_utf8_validate (string->str, -1, NULL));
1760
1761   return g_string_free (string, FALSE);
1762 }