glib/gutf8.c

   1 /* gutf8.c - Operations on UTF-8 strings.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stdlib.h>
  25 #ifdef HAVE_CODESET
  26 #include <langinfo.h>
  27 #endif
  28 #include <string.h>
  29
  30 #ifdef G_PLATFORM_WIN32
  31 #include <stdio.h>
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "gconvert.h"
  38 #include "ghash.h"
  39 #include "gstrfuncs.h"
  40 #include "gtestutils.h"
  41 #include "gtypes.h"
  42 #include "gthread.h"
  43 #include "glibintl.h"
  44
  45 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  46   if (Char < 128)                                                             \
  47     {                                                                         \
  48       Len = 1;                                                                \
  49       Mask = 0x7f;                                                            \
  50     }                                                                         \
  51   else if ((Char & 0xe0) == 0xc0)                                             \
  52     {                                                                         \
  53       Len = 2;                                                                \
  54       Mask = 0x1f;                                                            \
  55     }                                                                         \
  56   else if ((Char & 0xf0) == 0xe0)                                             \
  57     {                                                                         \
  58       Len = 3;                                                                \
  59       Mask = 0x0f;                                                            \
  60     }                                                                         \
  61   else if ((Char & 0xf8) == 0xf0)                                             \
  62     {                                                                         \
  63       Len = 4;                                                                \
  64       Mask = 0x07;                                                            \
  65     }                                                                         \
  66   else if ((Char & 0xfc) == 0xf8)                                             \
  67     {                                                                         \
  68       Len = 5;                                                                \
  69       Mask = 0x03;                                                            \
  70     }                                                                         \
  71   else if ((Char & 0xfe) == 0xfc)                                             \
  72     {                                                                         \
  73       Len = 6;                                                                \
  74       Mask = 0x01;                                                            \
  75     }                                                                         \
  76   else                                                                        \
  77     Len = -1;
  78
  79 #define UTF8_LENGTH(Char)              \
  80   ((Char) < 0x80 ? 1 :                 \
  81    ((Char) < 0x800 ? 2 :               \
  82     ((Char) < 0x10000 ? 3 :            \
  83      ((Char) < 0x200000 ? 4 :          \
  84       ((Char) < 0x4000000 ? 5 : 6)))))
  85
  86
  87 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  88   (Result) = (Chars)[0] & (Mask);                                             \
  89   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
  90     {                                                                         \
  91       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
  92         {                                                                     \
  93           (Result) = -1;                                                      \
  94           break;                                                              \
  95         }                                                                     \
  96       (Result) <<= 6;                                                         \
  97       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
  98     }
  99
 100 /*
 101  * Check whether a Unicode (5.2) char is in a valid range.
 102  *
 103  * The first check comes from the Unicode guarantee to never encode
 104  * a point above 0x0010ffff, since UTF-16 couldn't represent it.
 105  *
 106  * The second check covers surrogate pairs (category Cs).
 107  *
 108  * The last two checks cover "Noncharacter": defined as:
 109  *   "A code point that is permanently reserved for
 110  *    internal use, and that should never be interchanged. In
 111  *    Unicode 3.1, these consist of the values U+nFFFE and U+nFFFF
 112  *    (where n is from 0 to 10_16) and the values U+FDD0..U+FDEF."
 113  *
 114  * @param Char the character
 115  */
 116 #define UNICODE_VALID(Char)                   \
 117     ((Char) < 0x110000 &&                     \
 118      (((Char) & 0xFFFFF800) != 0xD800) &&     \
 119      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
 120      ((Char) & 0xFFFE) != 0xFFFE)
 121
 122
 123 static const gchar utf8_skip_data[256] = {
 124   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 125   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 126   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 127   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 128   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 129   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 130   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 131   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
 132 };
 133
 134 const gchar * const g_utf8_skip = utf8_skip_data;
 135
 136 /**
 137  * g_utf8_find_prev_char:
 138  * @str: pointer to the beginning of a UTF-8 encoded string
 139  * @p: pointer to some position within @str
 140  *
 141  * Given a position @p with a UTF-8 encoded string @str, find the start
 142  * of the previous UTF-8 character starting before @p. Returns %NULL if no
 143  * UTF-8 characters are present in @str before @p.
 144  *
 145  * @p does not have to be at the beginning of a UTF-8 character. No check
 146  * is made to see if the character found is actually valid other than
 147  * it starts with an appropriate byte.
 148  *
 149  * Return value: a pointer to the found character or %NULL.
 150  **/
 151 gchar *
 152 g_utf8_find_prev_char (const char *str,
 153                        const char *p)
 154 {
 155   for (--p; p >= str; --p)
 156     {
 157       if ((*p & 0xc0) != 0x80)
 158         return (gchar *)p;
 159     }
 160   return NULL;
 161 }
 162
 163 /**
 164  * g_utf8_find_next_char:
 165  * @p: a pointer to a position within a UTF-8 encoded string
 166  * @end: a pointer to the byte following the end of the string,
 167  * or %NULL to indicate that the string is nul-terminated.
 168  *
 169  * Finds the start of the next UTF-8 character in the string after @p.
 170  *
 171  * @p does not have to be at the beginning of a UTF-8 character. No check
 172  * is made to see if the character found is actually valid other than
 173  * it starts with an appropriate byte.
 174  *
 175  * Return value: a pointer to the found character or %NULL
 176  **/
 177 gchar *
 178 g_utf8_find_next_char (const gchar *p,
 179                        const gchar *end)
 180 {
 181   if (*p)
 182     {
 183       if (end)
 184         for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
 185           ;
 186       else
 187         for (++p; (*p & 0xc0) == 0x80; ++p)
 188           ;
 189     }
 190   return (p == end) ? NULL : (gchar *)p;
 191 }
 192
 193 /**
 194  * g_utf8_prev_char:
 195  * @p: a pointer to a position within a UTF-8 encoded string
 196  *
 197  * Finds the previous UTF-8 character in the string before @p.
 198  *
 199  * @p does not have to be at the beginning of a UTF-8 character. No check
 200  * is made to see if the character found is actually valid other than
 201  * it starts with an appropriate byte. If @p might be the first
 202  * character of the string, you must use g_utf8_find_prev_char() instead.
 203  *
 204  * Return value: a pointer to the found character.
 205  **/
 206 gchar *
 207 g_utf8_prev_char (const gchar *p)
 208 {
 209   while (TRUE)
 210     {
 211       p--;
 212       if ((*p & 0xc0) != 0x80)
 213         return (gchar *)p;
 214     }
 215 }
 216
 217 /**
 218  * g_utf8_strlen:
 219  * @p: pointer to the start of a UTF-8 encoded string
 220  * @max: the maximum number of bytes to examine. If @max
 221  *       is less than 0, then the string is assumed to be
 222  *       nul-terminated. If @max is 0, @p will not be examined and
 223  *       may be %NULL.
 224  *
 225  * Computes the length of the string in characters, not including
 226  * the terminating nul character.
 227  *
 228  * Return value: the length of the string in characters
 229  **/
 230 glong
 231 g_utf8_strlen (const gchar *p,
 232                gssize       max)
 233 {
 234   glong len = 0;
 235   const gchar *start = p;
 236   g_return_val_if_fail (p != NULL || max == 0, 0);
 237
 238   if (max < 0)
 239     {
 240       while (*p)
 241         {
 242           p = g_utf8_next_char (p);
 243           ++len;
 244         }
 245     }
 246   else
 247     {
 248       if (max == 0 || !*p)
 249         return 0;
 250
 251       p = g_utf8_next_char (p);
 252
 253       while (p - start < max && *p)
 254         {
 255           ++len;
 256           p = g_utf8_next_char (p);
 257         }
 258
 259       /* only do the last len increment if we got a complete
 260        * char (don't count partial chars)
 261        */
 262       if (p - start <= max)
 263         ++len;
 264     }
 265
 266   return len;
 267 }
 268
 269 /**
 270  * g_utf8_substring:
 271  * @str: a UTF-8 encoded string
 272  * @start_pos: a character offset within @str
 273  * @end_pos: another character offset within @str
 274  *
 275  * Copies a substring out of a UTF-8 encoded string.
 276  * The substring will contain @end_pos - @start_pos
 277  * characters.
 278  *
 279  * Returns: a newly allocated copy of the requested
 280  *     substring. Free with g_free() when no longer needed.
 281  *
 282  * Since: 2.30
 283  */
 284 gchar *
 285 g_utf8_substring (const gchar *str,
 286                   glong        start_pos,
 287                   glong        end_pos)
 288 {
 289   gchar *start, *end, *out;
 290
 291   start = g_utf8_offset_to_pointer (str, start_pos);
 292   end = g_utf8_offset_to_pointer (start, end_pos - start_pos);
 293
 294   out = g_malloc (end - start + 1);
 295   memcpy (out, start, end - start);
 296   out[end - start] = 0;
 297
 298   return out;
 299 }
 300
 301 /**
 302  * g_utf8_get_char:
 303  * @p: a pointer to Unicode character encoded as UTF-8
 304  *
 305  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 306  * If @p does not point to a valid UTF-8 encoded character, results are
 307  * undefined. If you are not sure that the bytes are complete
 308  * valid Unicode characters, you should use g_utf8_get_char_validated()
 309  * instead.
 310  *
 311  * Return value: the resulting character
 312  **/
 313 gunichar
 314 g_utf8_get_char (const gchar *p)
 315 {
 316   int i, mask = 0, len;
 317   gunichar result;
 318   unsigned char c = (unsigned char) *p;
 319
 320   UTF8_COMPUTE (c, mask, len);
 321   if (len == -1)
 322     return (gunichar)-1;
 323   UTF8_GET (result, p, i, mask, len);
 324
 325   return result;
 326 }
 327
 328 /**
 329  * g_utf8_offset_to_pointer:
 330  * @str: a UTF-8 encoded string
 331  * @offset: a character offset within @str
 332  *
 333  * Converts from an integer character offset to a pointer to a position
 334  * within the string.
 335  *
 336  * Since 2.10, this function allows to pass a negative @offset to
 337  * step backwards. It is usually worth stepping backwards from the end
 338  * instead of forwards if @offset is in the last fourth of the string,
 339  * since moving forward is about 3 times faster than moving backward.
 340  *
 341  * <note><para>
 342  * This function doesn't abort when reaching the end of @str. Therefore
 343  * you should be sure that @offset is within string boundaries before
 344  * calling that function. Call g_utf8_strlen() when unsure.
 345  *
 346  * This limitation exists as this function is called frequently during
 347  * text rendering and therefore has to be as fast as possible.
 348  * </para></note>
 349  *
 350  * Return value: the resulting pointer
 351  **/
 352 gchar *
 353 g_utf8_offset_to_pointer  (const gchar *str,
 354                            glong        offset)
 355 {
 356   const gchar *s = str;
 357
 358   if (offset > 0)
 359     while (offset--)
 360       s = g_utf8_next_char (s);
 361   else
 362     {
 363       const char *s1;
 364
 365       /* This nice technique for fast backwards stepping
 366        * through a UTF-8 string was dubbed "stutter stepping"
 367        * by its inventor, Larry Ewing.
 368        */
 369       while (offset)
 370         {
 371           s1 = s;
 372           s += offset;
 373           while ((*s & 0xc0) == 0x80)
 374             s--;
 375
 376           offset += g_utf8_pointer_to_offset (s, s1);
 377         }
 378     }
 379
 380   return (gchar *)s;
 381 }
 382
 383 /**
 384  * g_utf8_pointer_to_offset:
 385  * @str: a UTF-8 encoded string
 386  * @pos: a pointer to a position within @str
 387  *
 388  * Converts from a pointer to position within a string to a integer
 389  * character offset.
 390  *
 391  * Since 2.10, this function allows @pos to be before @str, and returns
 392  * a negative offset in this case.
 393  *
 394  * Return value: the resulting character offset
 395  **/
 396 glong
 397 g_utf8_pointer_to_offset (const gchar *str,
 398                           const gchar *pos)
 399 {
 400   const gchar *s = str;
 401   glong offset = 0;
 402
 403   if (pos < str)
 404     offset = - g_utf8_pointer_to_offset (pos, str);
 405   else
 406     while (s < pos)
 407       {
 408         s = g_utf8_next_char (s);
 409         offset++;
 410       }
 411
 412   return offset;
 413 }
 414
 415
 416 /**
 417  * g_utf8_strncpy:
 418  * @dest: buffer to fill with characters from @src
 419  * @src: UTF-8 encoded string
 420  * @n: character count
 421  *
 422  * Like the standard C strncpy() function, but
 423  * copies a given number of characters instead of a given number of
 424  * bytes. The @src string must be valid UTF-8 encoded text.
 425  * (Use g_utf8_validate() on all text before trying to use UTF-8
 426  * utility functions with it.)
 427  *
 428  * Return value: @dest
 429  **/
 430 gchar *
 431 g_utf8_strncpy (gchar       *dest,
 432                 const gchar *src,
 433                 gsize        n)
 434 {
 435   const gchar *s = src;
 436   while (n && *s)
 437     {
 438       s = g_utf8_next_char(s);
 439       n--;
 440     }
 441   strncpy(dest, src, s - src);
 442   dest[s - src] = 0;
 443   return dest;
 444 }
 445
 446 /* unicode_strchr */
 447
 448 /**
 449  * g_unichar_to_utf8:
 450  * @c: a Unicode character code
 451  * @outbuf: output buffer, must have at least 6 bytes of space.
 452  *       If %NULL, the length will be computed and returned
 453  *       and nothing will be written to @outbuf.
 454  *
 455  * Converts a single character to UTF-8.
 456  *
 457  * Return value: number of bytes written
 458  **/
 459 int
 460 g_unichar_to_utf8 (gunichar c,
 461                    gchar   *outbuf)
 462 {
 463   /* If this gets modified, also update the copy in g_string_insert_unichar() */
 464   guint len = 0;
 465   int first;
 466   int i;
 467
 468   if (c < 0x80)
 469     {
 470       first = 0;
 471       len = 1;
 472     }
 473   else if (c < 0x800)
 474     {
 475       first = 0xc0;
 476       len = 2;
 477     }
 478   else if (c < 0x10000)
 479     {
 480       first = 0xe0;
 481       len = 3;
 482     }
 483    else if (c < 0x200000)
 484     {
 485       first = 0xf0;
 486       len = 4;
 487     }
 488   else if (c < 0x4000000)
 489     {
 490       first = 0xf8;
 491       len = 5;
 492     }
 493   else
 494     {
 495       first = 0xfc;
 496       len = 6;
 497     }
 498
 499   if (outbuf)
 500     {
 501       for (i = len - 1; i > 0; --i)
 502         {
 503           outbuf[i] = (c & 0x3f) | 0x80;
 504           c >>= 6;
 505         }
 506       outbuf[0] = c | first;
 507     }
 508
 509   return len;
 510 }
 511
 512 /**
 513  * g_utf8_strchr:
 514  * @p: a nul-terminated UTF-8 encoded string
 515  * @len: the maximum length of @p
 516  * @c: a Unicode character
 517  *
 518  * Finds the leftmost occurrence of the given Unicode character
 519  * in a UTF-8 encoded string, while limiting the search to @len bytes.
 520  * If @len is -1, allow unbounded search.
 521  *
 522  * Return value: %NULL if the string does not contain the character,
 523  *   otherwise, a pointer to the start of the leftmost occurrence of
 524  *   the character in the string.
 525  **/
 526 gchar *
 527 g_utf8_strchr (const char *p,
 528                gssize      len,
 529                gunichar    c)
 530 {
 531   gchar ch[10];
 532
 533   gint charlen = g_unichar_to_utf8 (c, ch);
 534   ch[charlen] = '\0';
 535
 536   return g_strstr_len (p, len, ch);
 537 }
 538
 539
 540 /**
 541  * g_utf8_strrchr:
 542  * @p: a nul-terminated UTF-8 encoded string
 543  * @len: the maximum length of @p
 544  * @c: a Unicode character
 545  *
 546  * Find the rightmost occurrence of the given Unicode character
 547  * in a UTF-8 encoded string, while limiting the search to @len bytes.
 548  * If @len is -1, allow unbounded search.
 549  *
 550  * Return value: %NULL if the string does not contain the character,
 551  *   otherwise, a pointer to the start of the rightmost occurrence of the
 552  *   character in the string.
 553  **/
 554 gchar *
 555 g_utf8_strrchr (const char *p,
 556                 gssize      len,
 557                 gunichar    c)
 558 {
 559   gchar ch[10];
 560
 561   gint charlen = g_unichar_to_utf8 (c, ch);
 562   ch[charlen] = '\0';
 563
 564   return g_strrstr_len (p, len, ch);
 565 }
 566
 567
 568 /* Like g_utf8_get_char, but take a maximum length
 569  * and return (gunichar)-2 on incomplete trailing character;
 570  * also check for malformed or overlong sequences
 571  * and return (gunichar)-1 in this case.
 572  */
 573 static inline gunichar
 574 g_utf8_get_char_extended (const  gchar *p,
 575                           gssize max_len)
 576 {
 577   guint i, len;
 578   gunichar min_code;
 579   gunichar wc = (guchar) *p;
 580
 581   if (wc < 0x80)
 582     {
 583       return wc;
 584     }
 585   else if (G_UNLIKELY (wc < 0xc0))
 586     {
 587       return (gunichar)-1;
 588     }
 589   else if (wc < 0xe0)
 590     {
 591       len = 2;
 592       wc &= 0x1f;
 593       min_code = 1 << 7;
 594     }
 595   else if (wc < 0xf0)
 596     {
 597       len = 3;
 598       wc &= 0x0f;
 599       min_code = 1 << 11;
 600     }
 601   else if (wc < 0xf8)
 602     {
 603       len = 4;
 604       wc &= 0x07;
 605       min_code = 1 << 16;
 606     }
 607   else if (wc < 0xfc)
 608     {
 609       len = 5;
 610       wc &= 0x03;
 611       min_code = 1 << 21;
 612     }
 613   else if (wc < 0xfe)
 614     {
 615       len = 6;
 616       wc &= 0x01;
 617       min_code = 1 << 26;
 618     }
 619   else
 620     {
 621       return (gunichar)-1;
 622     }
 623
 624   if (G_UNLIKELY (max_len >= 0 && len > max_len))
 625     {
 626       for (i = 1; i < max_len; i++)
 627         {
 628           if ((((guchar *)p)[i] & 0xc0) != 0x80)
 629             return (gunichar)-1;
 630         }
 631       return (gunichar)-2;
 632     }
 633
 634   for (i = 1; i < len; ++i)
 635     {
 636       gunichar ch = ((guchar *)p)[i];
 637
 638       if (G_UNLIKELY ((ch & 0xc0) != 0x80))
 639         {
 640           if (ch)
 641             return (gunichar)-1;
 642           else
 643             return (gunichar)-2;
 644         }
 645
 646       wc <<= 6;
 647       wc |= (ch & 0x3f);
 648     }
 649
 650   if (G_UNLIKELY (wc < min_code))
 651     return (gunichar)-1;
 652
 653   return wc;
 654 }
 655
 656 /**
 657  * g_utf8_get_char_validated:
 658  * @p: a pointer to Unicode character encoded as UTF-8
 659  * @max_len: the maximum number of bytes to read, or -1, for no maximum or
 660  *           if @p is nul-terminated
 661  *
 662  * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
 663  * This function checks for incomplete characters, for invalid characters
 664  * such as characters that are out of the range of Unicode, and for
 665  * overlong encodings of valid characters.
 666  *
 667  * Return value: the resulting character. If @p points to a partial
 668  *    sequence at the end of a string that could begin a valid
 669  *    character (or if @max_len is zero), returns (gunichar)-2;
 670  *    otherwise, if @p does not point to a valid UTF-8 encoded
 671  *    Unicode character, returns (gunichar)-1.
 672  **/
 673 gunichar
 674 g_utf8_get_char_validated (const  gchar *p,
 675                            gssize max_len)
 676 {
 677   gunichar result;
 678
 679   if (max_len == 0)
 680     return (gunichar)-2;
 681
 682   result = g_utf8_get_char_extended (p, max_len);
 683
 684   if (result & 0x80000000)
 685     return result;
 686   else if (!UNICODE_VALID (result))
 687     return (gunichar)-1;
 688   else
 689     return result;
 690 }
 691
 692 /**
 693  * g_utf8_to_ucs4_fast:
 694  * @str: a UTF-8 encoded string
 695  * @len: the maximum length of @str to use, in bytes. If @len < 0,
 696  *       then the string is nul-terminated.
 697  * @items_written: location to store the number of characters in the
 698  *                 result, or %NULL.
 699  *
 700  * Convert a string from UTF-8 to a 32-bit fixed width
 701  * representation as UCS-4, assuming valid UTF-8 input.
 702  * This function is roughly twice as fast as g_utf8_to_ucs4()
 703  * but does no error checking on the input. A trailing 0 character
 704  * will be added to the string after the converted text.
 705  *
 706  * Return value: a pointer to a newly allocated UCS-4 string.
 707  *               This value must be freed with g_free().
 708  **/
 709 gunichar *
 710 g_utf8_to_ucs4_fast (const gchar *str,
 711                      glong        len,
 712                      glong       *items_written)
 713 {
 714   gunichar *result;
 715   gint n_chars, i;
 716   const gchar *p;
 717
 718   g_return_val_if_fail (str != NULL, NULL);
 719
 720   p = str;
 721   n_chars = 0;
 722   if (len < 0)
 723     {
 724       while (*p)
 725         {
 726           p = g_utf8_next_char (p);
 727           ++n_chars;
 728         }
 729     }
 730   else
 731     {
 732       while (p < str + len && *p)
 733         {
 734           p = g_utf8_next_char (p);
 735           ++n_chars;
 736         }
 737     }
 738
 739   result = g_new (gunichar, n_chars + 1);
 740
 741   p = str;
 742   for (i=0; i < n_chars; i++)
 743     {
 744       gunichar wc = (guchar)*p++;
 745
 746       if (wc < 0x80)
 747         {
 748           result[i] = wc;
 749         }
 750       else
 751         {
 752           gunichar mask = 0x40;
 753
 754           if (G_UNLIKELY ((wc & mask) == 0))
 755             {
 756               /* It's an out-of-sequence 10xxxxxxx byte.
 757                * Rather than making an ugly hash of this and the next byte
 758                * and overrunning the buffer, it's more useful to treat it
 759                * with a replacement character */
 760               result[i] = 0xfffd;
 761               continue;
 762             }
 763
 764           do
 765             {
 766               wc <<= 6;
 767               wc |= (guchar)(*p++) & 0x3f;
 768               mask <<= 5;
 769             }
 770           while((wc & mask) != 0);
 771
 772           wc &= mask - 1;
 773
 774           result[i] = wc;
 775         }
 776     }
 777   result[i] = 0;
 778
 779   if (items_written)
 780     *items_written = i;
 781
 782   return result;
 783 }
 784
 785 /**
 786  * g_utf8_to_ucs4:
 787  * @str: a UTF-8 encoded string
 788  * @len: the maximum length of @str to use, in bytes. If @len < 0,
 789  *       then the string is nul-terminated.
 790  * @items_read: location to store number of bytes read, or %NULL.
 791  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 792  *              returned in case @str contains a trailing partial
 793  *              character. If an error occurs then the index of the
 794  *              invalid input is stored here.
 795  * @items_written: location to store number of characters written or %NULL.
 796  *                 The value here stored does not include the trailing 0
 797  *                 character.
 798  * @error: location to store the error occurring, or %NULL to ignore
 799  *         errors. Any of the errors in #GConvertError other than
 800  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 801  *
 802  * Convert a string from UTF-8 to a 32-bit fixed width
 803  * representation as UCS-4. A trailing 0 character will be added to the
 804  * string after the converted text.
 805  *
 806  * Return value: a pointer to a newly allocated UCS-4 string.
 807  *               This value must be freed with g_free(). If an
 808  *               error occurs, %NULL will be returned and
 809  *               @error set.
 810  **/
 811 gunichar *
 812 g_utf8_to_ucs4 (const gchar *str,
 813                 glong        len,
 814                 glong       *items_read,
 815                 glong       *items_written,
 816                 GError     **error)
 817 {
 818   gunichar *result = NULL;
 819   gint n_chars, i;
 820   const gchar *in;
 821
 822   in = str;
 823   n_chars = 0;
 824   while ((len < 0 || str + len - in > 0) && *in)
 825     {
 826       gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
 827       if (wc & 0x80000000)
 828         {
 829           if (wc == (gunichar)-2)
 830             {
 831               if (items_read)
 832                 break;
 833               else
 834                 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 835                                      _("Partial character sequence at end of input"));
 836             }
 837           else
 838             g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 839                                  _("Invalid byte sequence in conversion input"));
 840
 841           goto err_out;
 842         }
 843
 844       n_chars++;
 845
 846       in = g_utf8_next_char (in);
 847     }
 848
 849   result = g_new (gunichar, n_chars + 1);
 850
 851   in = str;
 852   for (i=0; i < n_chars; i++)
 853     {
 854       result[i] = g_utf8_get_char (in);
 855       in = g_utf8_next_char (in);
 856     }
 857   result[i] = 0;
 858
 859   if (items_written)
 860     *items_written = n_chars;
 861
 862  err_out:
 863   if (items_read)
 864     *items_read = in - str;
 865
 866   return result;
 867 }
 868
 869 /**
 870  * g_ucs4_to_utf8:
 871  * @str: a UCS-4 encoded string
 872  * @len: the maximum length (number of characters) of @str to use.
 873  *       If @len < 0, then the string is nul-terminated.
 874  * @items_read: location to store number of characters read, or %NULL.
 875  * @items_written: location to store number of bytes written or %NULL.
 876  *                 The value here stored does not include the trailing 0
 877  *                 byte.
 878  * @error: location to store the error occurring, or %NULL to ignore
 879  *         errors. Any of the errors in #GConvertError other than
 880  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 881  *
 882  * Convert a string from a 32-bit fixed width representation as UCS-4.
 883  * to UTF-8. The result will be terminated with a 0 byte.
 884  *
 885  * Return value: a pointer to a newly allocated UTF-8 string.
 886  *               This value must be freed with g_free(). If an
 887  *               error occurs, %NULL will be returned and
 888  *               @error set. In that case, @items_read will be
 889  *               set to the position of the first invalid input
 890  *               character.
 891  **/
 892 gchar *
 893 g_ucs4_to_utf8 (const gunichar *str,
 894                 glong           len,
 895                 glong          *items_read,
 896                 glong          *items_written,
 897                 GError        **error)
 898 {
 899   gint result_length;
 900   gchar *result = NULL;
 901   gchar *p;
 902   gint i;
 903
 904   result_length = 0;
 905   for (i = 0; len < 0 || i < len ; i++)
 906     {
 907       if (!str[i])
 908         break;
 909
 910       if (str[i] >= 0x80000000)
 911         {
 912           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 913                                _("Character out of range for UTF-8"));
 914           goto err_out;
 915         }
 916
 917       result_length += UTF8_LENGTH (str[i]);
 918     }
 919
 920   result = g_malloc (result_length + 1);
 921   p = result;
 922
 923   i = 0;
 924   while (p < result + result_length)
 925     p += g_unichar_to_utf8 (str[i++], p);
 926
 927   *p = '\0';
 928
 929   if (items_written)
 930     *items_written = p - result;
 931
 932  err_out:
 933   if (items_read)
 934     *items_read = i;
 935
 936   return result;
 937 }
 938
 939 #define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
 940
 941 /**
 942  * g_utf16_to_utf8:
 943  * @str: a UTF-16 encoded string
 944  * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
 945  *       If @len < 0, then the string is nul-terminated.
 946  * @items_read: location to store number of words read, or %NULL.
 947  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
 948  *              returned in case @str contains a trailing partial
 949  *              character. If an error occurs then the index of the
 950  *              invalid input is stored here.
 951  * @items_written: location to store number of bytes written, or %NULL.
 952  *                 The value stored here does not include the trailing
 953  *                 0 byte.
 954  * @error: location to store the error occurring, or %NULL to ignore
 955  *         errors. Any of the errors in #GConvertError other than
 956  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 957  *
 958  * Convert a string from UTF-16 to UTF-8. The result will be
 959  * terminated with a 0 byte.
 960  *
 961  * Note that the input is expected to be already in native endianness,
 962  * an initial byte-order-mark character is not handled specially.
 963  * g_convert() can be used to convert a byte buffer of UTF-16 data of
 964  * ambiguous endianess.
 965  *
 966  * Further note that this function does not validate the result
 967  * string; it may e.g. include embedded NUL characters. The only
 968  * validation done by this function is to ensure that the input can
 969  * be correctly interpreted as UTF-16, i.e. it doesn't contain
 970  * things unpaired surrogates.
 971  *
 972  * Return value: a pointer to a newly allocated UTF-8 string.
 973  *               This value must be freed with g_free(). If an
 974  *               error occurs, %NULL will be returned and
 975  *               @error set.
 976  **/
 977 gchar *
 978 g_utf16_to_utf8 (const gunichar2  *str,
 979                  glong             len,
 980                  glong            *items_read,
 981                  glong            *items_written,
 982                  GError          **error)
 983 {
 984   /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
 985    * are marked.
 986    */
 987   const gunichar2 *in;
 988   gchar *out;
 989   gchar *result = NULL;
 990   gint n_bytes;
 991   gunichar high_surrogate;
 992
 993   g_return_val_if_fail (str != NULL, NULL);
 994
 995   n_bytes = 0;
 996   in = str;
 997   high_surrogate = 0;
 998   while ((len < 0 || in - str < len) && *in)
 999     {
1000       gunichar2 c = *in;
1001       gunichar wc;
1002
1003       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1004         {
1005           if (high_surrogate)
1006             {
1007               wc = SURROGATE_VALUE (high_surrogate, c);
1008               high_surrogate = 0;
1009             }
1010           else
1011             {
1012               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1013                                    _("Invalid sequence in conversion input"));
1014               goto err_out;
1015             }
1016         }
1017       else
1018         {
1019           if (high_surrogate)
1020             {
1021               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1022                                    _("Invalid sequence in conversion input"));
1023               goto err_out;
1024             }
1025
1026           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1027             {
1028               high_surrogate = c;
1029               goto next1;
1030             }
1031           else
1032             wc = c;
1033         }
1034
1035       /********** DIFFERENT for UTF8/UCS4 **********/
1036       n_bytes += UTF8_LENGTH (wc);
1037
1038     next1:
1039       in++;
1040     }
1041
1042   if (high_surrogate && !items_read)
1043     {
1044       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1045                            _("Partial character sequence at end of input"));
1046       goto err_out;
1047     }
1048
1049   /* At this point, everything is valid, and we just need to convert
1050    */
1051   /********** DIFFERENT for UTF8/UCS4 **********/
1052   result = g_malloc (n_bytes + 1);
1053
1054   high_surrogate = 0;
1055   out = result;
1056   in = str;
1057   while (out < result + n_bytes)
1058     {
1059       gunichar2 c = *in;
1060       gunichar wc;
1061
1062       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1063         {
1064           wc = SURROGATE_VALUE (high_surrogate, c);
1065           high_surrogate = 0;
1066         }
1067       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1068         {
1069           high_surrogate = c;
1070           goto next2;
1071         }
1072       else
1073         wc = c;
1074
1075       /********** DIFFERENT for UTF8/UCS4 **********/
1076       out += g_unichar_to_utf8 (wc, out);
1077
1078     next2:
1079       in++;
1080     }
1081
1082   /********** DIFFERENT for UTF8/UCS4 **********/
1083   *out = '\0';
1084
1085   if (items_written)
1086     /********** DIFFERENT for UTF8/UCS4 **********/
1087     *items_written = out - result;
1088
1089  err_out:
1090   if (items_read)
1091     *items_read = in - str;
1092
1093   return result;
1094 }
1095
1096 /**
1097  * g_utf16_to_ucs4:
1098  * @str: a UTF-16 encoded string
1099  * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1100  *       If @len < 0, then the string is nul-terminated.
1101  * @items_read: location to store number of words read, or %NULL.
1102  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1103  *              returned in case @str contains a trailing partial
1104  *              character. If an error occurs then the index of the
1105  *              invalid input is stored here.
1106  * @items_written: location to store number of characters written, or %NULL.
1107  *                 The value stored here does not include the trailing
1108  *                 0 character.
1109  * @error: location to store the error occurring, or %NULL to ignore
1110  *         errors. Any of the errors in #GConvertError other than
1111  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1112  *
1113  * Convert a string from UTF-16 to UCS-4. The result will be
1114  * nul-terminated.
1115  *
1116  * Return value: a pointer to a newly allocated UCS-4 string.
1117  *               This value must be freed with g_free(). If an
1118  *               error occurs, %NULL will be returned and
1119  *               @error set.
1120  **/
1121 gunichar *
1122 g_utf16_to_ucs4 (const gunichar2  *str,
1123                  glong             len,
1124                  glong            *items_read,
1125                  glong            *items_written,
1126                  GError          **error)
1127 {
1128   const gunichar2 *in;
1129   gchar *out;
1130   gchar *result = NULL;
1131   gint n_bytes;
1132   gunichar high_surrogate;
1133
1134   g_return_val_if_fail (str != NULL, NULL);
1135
1136   n_bytes = 0;
1137   in = str;
1138   high_surrogate = 0;
1139   while ((len < 0 || in - str < len) && *in)
1140     {
1141       gunichar2 c = *in;
1142
1143       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1144         {
1145           if (high_surrogate)
1146             {
1147               high_surrogate = 0;
1148             }
1149           else
1150             {
1151               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1152                                    _("Invalid sequence in conversion input"));
1153               goto err_out;
1154             }
1155         }
1156       else
1157         {
1158           if (high_surrogate)
1159             {
1160               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1161                                    _("Invalid sequence in conversion input"));
1162               goto err_out;
1163             }
1164
1165           if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1166             {
1167               high_surrogate = c;
1168               goto next1;
1169             }
1170         }
1171
1172       /********** DIFFERENT for UTF8/UCS4 **********/
1173       n_bytes += sizeof (gunichar);
1174
1175     next1:
1176       in++;
1177     }
1178
1179   if (high_surrogate && !items_read)
1180     {
1181       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1182                            _("Partial character sequence at end of input"));
1183       goto err_out;
1184     }
1185
1186   /* At this point, everything is valid, and we just need to convert
1187    */
1188   /********** DIFFERENT for UTF8/UCS4 **********/
1189   result = g_malloc (n_bytes + 4);
1190
1191   high_surrogate = 0;
1192   out = result;
1193   in = str;
1194   while (out < result + n_bytes)
1195     {
1196       gunichar2 c = *in;
1197       gunichar wc;
1198
1199       if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1200         {
1201           wc = SURROGATE_VALUE (high_surrogate, c);
1202           high_surrogate = 0;
1203         }
1204       else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1205         {
1206           high_surrogate = c;
1207           goto next2;
1208         }
1209       else
1210         wc = c;
1211
1212       /********** DIFFERENT for UTF8/UCS4 **********/
1213       *(gunichar *)out = wc;
1214       out += sizeof (gunichar);
1215
1216     next2:
1217       in++;
1218     }
1219
1220   /********** DIFFERENT for UTF8/UCS4 **********/
1221   *(gunichar *)out = 0;
1222
1223   if (items_written)
1224     /********** DIFFERENT for UTF8/UCS4 **********/
1225     *items_written = (out - result) / sizeof (gunichar);
1226
1227  err_out:
1228   if (items_read)
1229     *items_read = in - str;
1230
1231   return (gunichar *)result;
1232 }
1233
1234 /**
1235  * g_utf8_to_utf16:
1236  * @str: a UTF-8 encoded string
1237  * @len: the maximum length (number of bytes) of @str to use.
1238  *       If @len < 0, then the string is nul-terminated.
1239  * @items_read: location to store number of bytes read, or %NULL.
1240  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1241  *              returned in case @str contains a trailing partial
1242  *              character. If an error occurs then the index of the
1243  *              invalid input is stored here.
1244  * @items_written: location to store number of <type>gunichar2</type> written,
1245  *                 or %NULL.
1246  *                 The value stored here does not include the trailing 0.
1247  * @error: location to store the error occurring, or %NULL to ignore
1248  *         errors. Any of the errors in #GConvertError other than
1249  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1250  *
1251  * Convert a string from UTF-8 to UTF-16. A 0 character will be
1252  * added to the result after the converted text.
1253  *
1254  * Return value: a pointer to a newly allocated UTF-16 string.
1255  *               This value must be freed with g_free(). If an
1256  *               error occurs, %NULL will be returned and
1257  *               @error set.
1258  **/
1259 gunichar2 *
1260 g_utf8_to_utf16 (const gchar *str,
1261                  glong        len,
1262                  glong       *items_read,
1263                  glong       *items_written,
1264                  GError     **error)
1265 {
1266   gunichar2 *result = NULL;
1267   gint n16;
1268   const gchar *in;
1269   gint i;
1270
1271   g_return_val_if_fail (str != NULL, NULL);
1272
1273   in = str;
1274   n16 = 0;
1275   while ((len < 0 || str + len - in > 0) && *in)
1276     {
1277       gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1278       if (wc & 0x80000000)
1279         {
1280           if (wc == (gunichar)-2)
1281             {
1282               if (items_read)
1283                 break;
1284               else
1285                 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1286                                      _("Partial character sequence at end of input"));
1287             }
1288           else
1289             g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1290                                  _("Invalid byte sequence in conversion input"));
1291
1292           goto err_out;
1293         }
1294
1295       if (wc < 0xd800)
1296         n16 += 1;
1297       else if (wc < 0xe000)
1298         {
1299           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1300                                _("Invalid sequence in conversion input"));
1301
1302           goto err_out;
1303         }
1304       else if (wc < 0x10000)
1305         n16 += 1;
1306       else if (wc < 0x110000)
1307         n16 += 2;
1308       else
1309         {
1310           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1311                                _("Character out of range for UTF-16"));
1312
1313           goto err_out;
1314         }
1315
1316       in = g_utf8_next_char (in);
1317     }
1318
1319   result = g_new (gunichar2, n16 + 1);
1320
1321   in = str;
1322   for (i = 0; i < n16;)
1323     {
1324       gunichar wc = g_utf8_get_char (in);
1325
1326       if (wc < 0x10000)
1327         {
1328           result[i++] = wc;
1329         }
1330       else
1331         {
1332           result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1333           result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1334         }
1335
1336       in = g_utf8_next_char (in);
1337     }
1338
1339   result[i] = 0;
1340
1341   if (items_written)
1342     *items_written = n16;
1343
1344  err_out:
1345   if (items_read)
1346     *items_read = in - str;
1347
1348   return result;
1349 }
1350
1351 /**
1352  * g_ucs4_to_utf16:
1353  * @str: a UCS-4 encoded string
1354  * @len: the maximum length (number of characters) of @str to use.
1355  *       If @len < 0, then the string is nul-terminated.
1356  * @items_read: location to store number of bytes read, or %NULL.
1357  *              If an error occurs then the index of the invalid input
1358  *              is stored here.
1359  * @items_written: location to store number of <type>gunichar2</type>
1360  *                 written, or %NULL. The value stored here does not
1361  *                 include the trailing 0.
1362  * @error: location to store the error occurring, or %NULL to ignore
1363  *         errors. Any of the errors in #GConvertError other than
1364  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1365  *
1366  * Convert a string from UCS-4 to UTF-16. A 0 character will be
1367  * added to the result after the converted text.
1368  *
1369  * Return value: a pointer to a newly allocated UTF-16 string.
1370  *               This value must be freed with g_free(). If an
1371  *               error occurs, %NULL will be returned and
1372  *               @error set.
1373  **/
1374 gunichar2 *
1375 g_ucs4_to_utf16 (const gunichar  *str,
1376                  glong            len,
1377                  glong           *items_read,
1378                  glong           *items_written,
1379                  GError         **error)
1380 {
1381   gunichar2 *result = NULL;
1382   gint n16;
1383   gint i, j;
1384
1385   n16 = 0;
1386   i = 0;
1387   while ((len < 0 || i < len) && str[i])
1388     {
1389       gunichar wc = str[i];
1390
1391       if (wc < 0xd800)
1392         n16 += 1;
1393       else if (wc < 0xe000)
1394         {
1395           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1396                                _("Invalid sequence in conversion input"));
1397
1398           goto err_out;
1399         }
1400       else if (wc < 0x10000)
1401         n16 += 1;
1402       else if (wc < 0x110000)
1403         n16 += 2;
1404       else
1405         {
1406           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1407                                _("Character out of range for UTF-16"));
1408
1409           goto err_out;
1410         }
1411
1412       i++;
1413     }
1414
1415   result = g_new (gunichar2, n16 + 1);
1416
1417   for (i = 0, j = 0; j < n16; i++)
1418     {
1419       gunichar wc = str[i];
1420
1421       if (wc < 0x10000)
1422         {
1423           result[j++] = wc;
1424         }
1425       else
1426         {
1427           result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1428           result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1429         }
1430     }
1431   result[j] = 0;
1432
1433   if (items_written)
1434     *items_written = n16;
1435
1436  err_out:
1437   if (items_read)
1438     *items_read = i;
1439
1440   return result;
1441 }
1442
/* Consume the UTF-8 continuation byte that `p` currently points at.
 * If the byte does not have the form 10xxxxxx, control jumps to the
 * `error` label; otherwise its low six bits are shifted into `val`.
 * The expanding scope must provide `p`, `val` and an `error` label;
 * the macro does not advance `p`.
 */
#define CONTINUATION_CHAR                           \
 G_STMT_START {                                     \
  if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
    goto error;                                     \
  val <<= 6;                                        \
  val |= (*(guchar *)p) & 0x3f;                     \
 } G_STMT_END
1450
/* Validate a nul-terminated string as UTF-8.
 *
 * Returns a pointer to the terminating nul byte if the whole string is
 * valid, otherwise a pointer to the first byte of the first invalid
 * sequence. Only 1- to 4-byte sequences are accepted. Because every
 * continuation byte is checked before the pointer advances past it,
 * a nul byte in the middle of a sequence is rejected rather than
 * skipped.
 */
static const gchar *
fast_validate (const char *str)

{
  gunichar val = 0;   /* code point being assembled (3/4-byte forms only) */
  gunichar min = 0;   /* smallest value allowed for the current length */
  const gchar *p;

  for (p = str; *p; p++)
    {
      if (*(guchar *)p < 128)
        /* done */;
      else
        {
          const gchar *last;

          /* Remember where this sequence starts: it is what we return
           * if anything about the sequence turns out to be invalid.
           */
          last = p;
          if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
            {
              /* Lead bytes 0xc0 and 0xc1 could only encode values
               * below 0x80, i.e. overlong forms.
               */
              if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
                goto error;
              p++;
              if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
                goto error;
            }
          else
            {
              if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
                {
                  min = (1 << 11);
                  val = *(guchar *)p & 0x0f;
                  /* Skip the first of the three continuation checks. */
                  goto TWO_REMAINING;
                }
              else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
                {
                  min = (1 << 16);
                  val = *(guchar *)p & 0x07;
                }
              else
                /* Stray continuation byte or an unsupported lead byte. */
                goto error;

              p++;
              CONTINUATION_CHAR;
            TWO_REMAINING:
              p++;
              CONTINUATION_CHAR;
              p++;
              CONTINUATION_CHAR;

              /* Reject overlong encodings. */
              if (G_UNLIKELY (val < min))
                goto error;

              /* UNICODE_VALID is defined outside this part of the file;
               * presumably it rejects surrogates and out-of-range
               * values -- confirm against its definition.
               */
              if (G_UNLIKELY (!UNICODE_VALID(val)))
                goto error;
            }

          /* Sequence accepted; the loop's p++ steps past its last byte. */
          continue;

        error:
          return last;
        }
    }

  return p;
}
1516
/* Validate at most max_len bytes of str as UTF-8.
 *
 * max_len must be non-negative (asserted). Scanning also stops early at
 * a nul byte. Returns a pointer just past the last valid character
 * examined: str + max_len if everything was valid, the position of an
 * early nul byte, or the first byte of the first invalid sequence.
 * A multi-byte sequence that would extend beyond max_len is treated as
 * invalid, and its continuation bytes are not read.
 */
static const gchar *
fast_validate_len (const char *str,
                   gssize      max_len)

{
  gunichar val = 0;   /* code point being assembled (3/4-byte forms only) */
  gunichar min = 0;   /* smallest value allowed for the current length */
  const gchar *p;

  g_assert (max_len >= 0);

  for (p = str; ((p - str) < max_len) && *p; p++)
    {
      if (*(guchar *)p < 128)
        /* done */;
      else
        {
          const gchar *last;

          /* Remember where this sequence starts: it is what we return
           * if anything about the sequence turns out to be invalid.
           */
          last = p;
          if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
            {
              /* Need the lead byte plus one continuation byte. */
              if (G_UNLIKELY (max_len - (p - str) < 2))
                goto error;

              /* Lead bytes 0xc0 and 0xc1 could only encode values
               * below 0x80, i.e. overlong forms.
               */
              if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
                goto error;
              p++;
              if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
                goto error;
            }
          else
            {
              if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
                {
                  /* Need the lead byte plus two continuation bytes. */
                  if (G_UNLIKELY (max_len - (p - str) < 3))
                    goto error;

                  min = (1 << 11);
                  val = *(guchar *)p & 0x0f;
                  /* Skip the first of the three continuation checks. */
                  goto TWO_REMAINING;
                }
              else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
                {
                  /* Need the lead byte plus three continuation bytes. */
                  if (G_UNLIKELY (max_len - (p - str) < 4))
                    goto error;

                  min = (1 << 16);
                  val = *(guchar *)p & 0x07;
                }
              else
                /* Stray continuation byte or an unsupported lead byte. */
                goto error;

              p++;
              CONTINUATION_CHAR;
            TWO_REMAINING:
              p++;
              CONTINUATION_CHAR;
              p++;
              CONTINUATION_CHAR;

              /* Reject overlong encodings. */
              if (G_UNLIKELY (val < min))
                goto error;
              /* UNICODE_VALID is defined outside this part of the file;
               * presumably it rejects surrogates and out-of-range
               * values -- confirm against its definition.
               */
              if (G_UNLIKELY (!UNICODE_VALID(val)))
                goto error;
            }

          /* Sequence accepted; the loop's p++ steps past its last byte. */
          continue;

        error:
          return last;
        }
    }

  return p;
}
1593
1594 /**
1595  * g_utf8_validate:
1596  * @str: a pointer to character data
1597  * @max_len: max bytes to validate, or -1 to go until NUL
1598  * @end: (allow-none) (out): return location for end of valid data
1599  *
1600  * Validates UTF-8 encoded text. @str is the text to validate;
1601  * if @str is nul-terminated, then @max_len can be -1, otherwise
1602  * @max_len should be the number of bytes to validate.
1603  * If @end is non-%NULL, then the end of the valid range
1604  * will be stored there (i.e. the start of the first invalid
1605  * character if some bytes were invalid, or the end of the text
1606  * being validated otherwise).
1607  *
1608  * Note that g_utf8_validate() returns %FALSE if @max_len is
1609  * positive and NUL is met before @max_len bytes have been read.
1610  *
1611  * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1612  * routines <emphasis>require</emphasis> valid UTF-8 as input;
1613  * so data read from a file or the network should be checked
1614  * with g_utf8_validate() before doing anything else with it.
1615  *
1616  * Return value: %TRUE if the text was valid UTF-8
1617  **/
1618 gboolean
1619 g_utf8_validate (const char   *str,
1620                  gssize        max_len,
1621                  const gchar **end)
1622
1623 {
1624   const gchar *p;
1625
1626   if (max_len < 0)
1627     p = fast_validate (str);
1628   else
1629     p = fast_validate_len (str, max_len);
1630
1631   if (end)
1632     *end = p;
1633
1634   if ((max_len >= 0 && p != str + max_len) ||
1635       (max_len < 0 && *p != '\0'))
1636     return FALSE;
1637   else
1638     return TRUE;
1639 }
1640
1641 /**
1642  * g_unichar_validate:
1643  * @ch: a Unicode character
1644  *
1645  * Checks whether @ch is a valid Unicode character. Some possible
1646  * integer values of @ch will not be valid. 0 is considered a valid
1647  * character, though it's normally a string terminator.
1648  *
1649  * Return value: %TRUE if @ch is a valid Unicode character
1650  **/
1651 gboolean
1652 g_unichar_validate (gunichar ch)
1653 {
1654   return UNICODE_VALID (ch);
1655 }
1656
1657 /**
1658  * g_utf8_strreverse:
1659  * @str: a UTF-8 encoded string
1660  * @len: the maximum length of @str to use, in bytes. If @len < 0,
1661  *       then the string is nul-terminated.
1662  *
1663  * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1664  * (Use g_utf8_validate() on all text before trying to use UTF-8
1665  * utility functions with it.)
1666  *
1667  * This function is intended for programmatic uses of reversed strings.
1668  * It pays no attention to decomposed characters, combining marks, byte
1669  * order marks, directional indicators (LRM, LRO, etc) and similar
1670  * characters which might need special handling when reversing a string
1671  * for display purposes.
1672  *
1673  * Note that unlike g_strreverse(), this function returns
1674  * newly-allocated memory, which should be freed with g_free() when
1675  * no longer needed.
1676  *
1677  * Returns: a newly-allocated string which is the reverse of @str.
1678  *
1679  * Since: 2.2
1680  */
1681 gchar *
1682 g_utf8_strreverse (const gchar *str,
1683                    gssize       len)
1684 {
1685   gchar *r, *result;
1686   const gchar *p;
1687
1688   if (len < 0)
1689     len = strlen (str);
1690
1691   result = g_new (gchar, len + 1);
1692   r = result + len;
1693   p = str;
1694   while (r > result)
1695     {
1696       gchar *m, skip = g_utf8_skip[*(guchar*) p];
1697       r -= skip;
1698       for (m = r; skip; skip--)
1699         *m++ = *p++;
1700     }
1701   result[len] = 0;
1702
1703   return result;
1704 }
1705
1706
1707 gchar *
1708 _g_utf8_make_valid (const gchar *name)
1709 {
1710   GString *string;
1711   const gchar *remainder, *invalid;
1712   gint remaining_bytes, valid_bytes;
1713
1714   g_return_val_if_fail (name != NULL, NULL);
1715
1716   string = NULL;
1717   remainder = name;
1718   remaining_bytes = strlen (name);
1719
1720   while (remaining_bytes != 0)
1721     {
1722       if (g_utf8_validate (remainder, remaining_bytes, &invalid))
1723         break;
1724       valid_bytes = invalid - remainder;
1725
1726       if (string == NULL)
1727         string = g_string_sized_new (remaining_bytes);
1728
1729       g_string_append_len (string, remainder, valid_bytes);
1730       /* append U+FFFD REPLACEMENT CHARACTER */
1731       g_string_append (string, "\357\277\275");
1732
1733       remaining_bytes -= valid_bytes + 1;
1734       remainder = invalid + 1;
1735     }
1736
1737   if (string == NULL)
1738     return g_strdup (name);
1739
1740   g_string_append (string, remainder);
1741
1742   g_assert (g_utf8_validate (string->str, -1, NULL));
1743
1744   return g_string_free (string, FALSE);
1745 }