glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
   5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <iconv.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <stdlib.h>
  27
  28 #include "glib.h"
  29 #include "config.h"
  30
  31 #ifdef G_PLATFORM_WIN32
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "glibintl.h"
  38
  39 GQuark
  40 g_convert_error_quark()
  41 {
  42   static GQuark quark;
  43   if (!quark)
  44     quark = g_quark_from_static_string ("g_convert_error");
  45
  46   return quark;
  47 }
  48
  49 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
  50 #error libiconv in use but included iconv.h not from libiconv
  51 #endif
  52 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
  53 #error libiconv not in use but included iconv.h is from libiconv
  54 #endif
  55
  56 /**
  57  * g_iconv_open:
  58  * @to_codeset: destination codeset
  59  * @from_codeset: source codeset
  60  *
  61  * Same as the standard UNIX routine iconv_open(), but
  62  * may be implemented via libiconv on UNIX flavors that lack
  63  * a native implementation.
  64  *
  65  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  66  * more convenient than the raw iconv wrappers.
  67  *
  68  * Return value: a "conversion descriptor"
  69  **/
  70 GIConv
  71 g_iconv_open (const gchar  *to_codeset,
  72               const gchar  *from_codeset)
  73 {
  74   iconv_t cd = iconv_open (to_codeset, from_codeset);
  75
  76   return (GIConv)cd;
  77 }
  78
  79 /**
  80  * g_iconv:
  81  * @converter: conversion descriptor from g_iconv_open()
  82  * @inbuf: bytes to convert
  83  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
  84  * @outbuf: converted output bytes
  85  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
  86  *
  87  * Same as the standard UNIX routine iconv(), but
  88  * may be implemented via libiconv on UNIX flavors that lack
  89  * a native implementation.
  90  *
  91  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  92  * more convenient than the raw iconv wrappers.
  93  *
  94  * Return value: count of non-reversible conversions, or -1 on error
  95  **/
  96 size_t
  97 g_iconv (GIConv   converter,
  98          gchar  **inbuf,
  99          gsize   *inbytes_left,
 100          gchar  **outbuf,
 101          gsize   *outbytes_left)
 102 {
 103   iconv_t cd = (iconv_t)converter;
 104
 105   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 106 }
 107
 108 /**
 109  * g_iconv_close:
 110  * @converter: a conversion descriptor from g_iconv_open()
 111  *
 112  * Same as the standard UNIX routine iconv_close(), but
 113  * may be implemented via libiconv on UNIX flavors that lack
 114  * a native implementation. Should be called to clean up
 115  * the conversion descriptor from iconv_open() when
 116  * you are done converting things.
 117  *
 118  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 119  * more convenient than the raw iconv wrappers.
 120  *
 121  * Return value: -1 on error, 0 on success
 122  **/
 123 gint
 124 g_iconv_close (GIConv converter)
 125 {
 126   iconv_t cd = (iconv_t)converter;
 127
 128   return iconv_close (cd);
 129 }
 130
 131 static GIConv
 132 open_converter (const gchar *to_codeset,
 133                 const gchar *from_codeset,
 134                 GError     **error)
 135 {
 136   GIConv cd = g_iconv_open (to_codeset, from_codeset);
 137
 138   if (cd == (iconv_t) -1)
 139     {
 140       /* Something went wrong.  */
 141       if (errno == EINVAL)
 142         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 143                      _("Conversion from character set `%s' to `%s' is not supported"),
 144                      from_codeset, to_codeset);
 145       else
 146         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 147                      _("Could not open converter from `%s' to `%s': %s"),
 148                      from_codeset, to_codeset, strerror (errno));
 149     }
 150
 151   return cd;
 152
 153 }
 154
 155 /**
 156  * g_convert:
 157  * @str:           the string to convert
 158  * @len:           the length of the string
 159  * @to_codeset:    name of character set into which to convert @str
 160  * @from_codeset:  character set of @str.
 161  * @bytes_read:    location to store the number of bytes in the
 162  *                 input string that were successfully converted, or %NULL.
 163  *                 Even if the conversion was succesful, this may be
 164  *                 less than len if there were partial characters
 165  *                 at the end of the input. If the error
 166  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 167  *                 stored will the byte fofset after the last valid
 168  *                 input sequence.
 169  * @bytes_written: the stored in the output buffer (not including the
 170  *                 terminating nul.
 171  * @error:         location to store the error occuring, or %NULL to ignore
 172  *                 errors. Any of the errors in #GConvertError may occur.
 173  *
 174  * Convert a string from one character set to another.
 175  *
 176  * Return value: If the conversion was successful, a newly allocated
 177  *               NUL-terminated string, which must be freed with
 178  *               g_free. Otherwise %NULL and @error will be set.
 179  **/
 180 gchar*
 181 g_convert (const gchar *str,
 182            gssize       len,
 183            const gchar *to_codeset,
 184            const gchar *from_codeset,
 185            gsize       *bytes_read,
 186            gsize       *bytes_written,
 187            GError     **error)
 188 {
 189   gchar *res;
 190   GIConv cd;
 191
 192   g_return_val_if_fail (str != NULL, NULL);
 193   g_return_val_if_fail (to_codeset != NULL, NULL);
 194   g_return_val_if_fail (from_codeset != NULL, NULL);
 195
 196   cd = open_converter (to_codeset, from_codeset, error);
 197
 198   if (cd == (GIConv) -1)
 199     {
 200       if (bytes_read)
 201         *bytes_read = 0;
 202
 203       if (bytes_written)
 204         *bytes_written = 0;
 205
 206       return NULL;
 207     }
 208
 209   res = g_convert_with_iconv (str, len, cd,
 210                               bytes_read, bytes_written,
 211                               error);
 212
 213   g_iconv_close (cd);
 214
 215   return res;
 216 }
 217
 218 /**
 219  * g_convert_with_iconv:
 220  * @str:           the string to convert
 221  * @len:           the length of the string
 222  * @converter:     conversion descriptor from g_iconv_open()
 223  * @bytes_read:    location to store the number of bytes in the
 224  *                 input string that were successfully converted, or %NULL.
 225  *                 Even if the conversion was succesful, this may be
 226  *                 less than len if there were partial characters
 227  *                 at the end of the input. If the error
 228  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 229  *                 stored will the byte fofset after the last valid
 230  *                 input sequence.
 231  * @bytes_written: the stored in the output buffer (not including the
 232  *                 terminating nul.
 233  * @error:         location to store the error occuring, or %NULL to ignore
 234  *                 errors. Any of the errors in #GConvertError may occur.
 235  *
 236  * Convert a string from one character set to another.
 237  *
 238  * Return value: If the conversion was successful, a newly allocated
 239  *               NUL-terminated string, which must be freed with
 240  *               g_free. Otherwise %NULL and @error will be set.
 241  **/
 242 gchar*
 243 g_convert_with_iconv (const gchar *str,
 244                       gssize       len,
 245                       GIConv       converter,
 246                       gsize       *bytes_read,
 247                       gsize       *bytes_written,
 248                       GError     **error)
 249 {
 250   gchar *dest;
 251   gchar *outp;
 252   const gchar *p;
 253   gsize inbytes_remaining;
 254   gsize outbytes_remaining;
 255   gsize err;
 256   gsize outbuf_size;
 257   gboolean have_error = FALSE;
 258
 259   g_return_val_if_fail (str != NULL, NULL);
 260   g_return_val_if_fail (converter != (GIConv) -1, NULL);
 261
 262   if (len < 0)
 263     len = strlen (str);
 264
 265   p = str;
 266   inbytes_remaining = len;
 267   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 268
 269   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 270   outp = dest = g_malloc (outbuf_size);
 271
 272  again:
 273
 274   err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 275
 276   if (err == (size_t) -1)
 277     {
 278       switch (errno)
 279         {
 280         case EINVAL:
 281           /* Incomplete text, do not report an error */
 282           break;
 283         case E2BIG:
 284           {
 285             size_t used = outp - dest;
 286
 287             outbuf_size *= 2;
 288             dest = g_realloc (dest, outbuf_size);
 289
 290             outp = dest + used;
 291             outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 292
 293             goto again;
 294           }
 295         case EILSEQ:
 296           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 297                        _("Invalid byte sequence in conversion input"));
 298           have_error = TRUE;
 299           break;
 300         default:
 301           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 302                        _("Error during conversion: %s"),
 303                        strerror (errno));
 304           have_error = TRUE;
 305           break;
 306         }
 307     }
 308
 309   *outp = '\0';
 310
 311   if (bytes_read)
 312     *bytes_read = p - str;
 313   else
 314     {
 315       if ((p - str) != len)
 316         {
 317           if (!have_error)
 318             {
 319               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 320                            _("Partial character sequence at end of input"));
 321               have_error = TRUE;
 322             }
 323         }
 324     }
 325
 326   if (bytes_written)
 327     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 328
 329   if (have_error)
 330     {
 331       g_free (dest);
 332       return NULL;
 333     }
 334   else
 335     return dest;
 336 }
 337
 338 /**
 339  * g_convert_with_fallback:
 340  * @str:          the string to convert
 341  * @len:          the length of the string
 342  * @to_codeset:   name of character set into which to convert @str
 343  * @from_codeset: character set of @str.
 344  * @fallback:     UTF-8 string to use in place of character not
 345  *                present in the target encoding. (This must be
 346  *                in the target encoding), if %NULL, characters
 347  *                not in the target encoding will be represented
 348  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 349  * @bytes_read:   location to store the number of bytes in the
 350  *                input string that were successfully converted, or %NULL.
 351  *                Even if the conversion was succesful, this may be
 352  *                less than len if there were partial characters
 353  *                at the end of the input.
 354  * @bytes_written: the stored in the output buffer (not including the
 355  *                 terminating nul.
 356  * @error:        location to store the error occuring, or %NULL to ignore
 357  *                errors. Any of the errors in #GConvertError may occur.
 358  *
 359  * Convert a string from one character set to another, possibly
 360  * including fallback sequences for characters not representable
 361  * in the output. Note that it is not guaranteed that the specification
 362  * for the fallback sequences in @fallback will be honored. Some
 363  * systems may do a approximate conversion from @from_codeset
 364  * to @to_codeset in their iconv() functions, in which case GLib
 365  * will simply return that approximate conversion.
 366  *
 367  * Return value: If the conversion was successful, a newly allocated
 368  *               NUL-terminated string, which must be freed with
 369  *               g_free. Otherwise %NULL and @error will be set.
 370  **/
 371 gchar*
 372 g_convert_with_fallback (const gchar *str,
 373                          gssize       len,
 374                          const gchar *to_codeset,
 375                          const gchar *from_codeset,
 376                          gchar       *fallback,
 377                          gsize       *bytes_read,
 378                          gsize       *bytes_written,
 379                          GError     **error)
 380 {
 381   gchar *utf8;
 382   gchar *dest;
 383   gchar *outp;
 384   const gchar *insert_str = NULL;
 385   const gchar *p;
 386   gsize inbytes_remaining;
 387   const gchar *save_p = NULL;
 388   gsize save_inbytes = 0;
 389   gsize outbytes_remaining;
 390   gsize err;
 391   GIConv cd;
 392   gsize outbuf_size;
 393   gboolean have_error = FALSE;
 394   gboolean done = FALSE;
 395
 396   GError *local_error = NULL;
 397
 398   g_return_val_if_fail (str != NULL, NULL);
 399   g_return_val_if_fail (to_codeset != NULL, NULL);
 400   g_return_val_if_fail (from_codeset != NULL, NULL);
 401
 402   if (len < 0)
 403     len = strlen (str);
 404
 405   /* Try an exact conversion; we only proceed if this fails
 406    * due to an illegal sequence in the input string.
 407    */
 408   dest = g_convert (str, len, to_codeset, from_codeset,
 409                     bytes_read, bytes_written, &local_error);
 410   if (!local_error)
 411     return dest;
 412
 413   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 414     {
 415       g_propagate_error (error, local_error);
 416       return NULL;
 417     }
 418   else
 419     g_error_free (local_error);
 420
 421   local_error = NULL;
 422
 423   /* No go; to proceed, we need a converter from "UTF-8" to
 424    * to_codeset, and the string as UTF-8.
 425    */
 426   cd = open_converter (to_codeset, "UTF-8", error);
 427   if (cd == (GIConv) -1)
 428     {
 429       if (bytes_read)
 430         *bytes_read = 0;
 431
 432       if (bytes_written)
 433         *bytes_written = 0;
 434
 435       return NULL;
 436     }
 437
 438   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 439                     bytes_read, &inbytes_remaining, error);
 440   if (!utf8)
 441     return NULL;
 442
 443   /* Now the heart of the code. We loop through the UTF-8 string, and
 444    * whenever we hit an offending character, we form fallback, convert
 445    * the fallback to the target codeset, and then go back to
 446    * converting the original string after finishing with the fallback.
 447    *
 448    * The variables save_p and save_inbytes store the input state
 449    * for the original string while we are converting the fallback
 450    */
 451   p = utf8;
 452
 453   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 454   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 455   outp = dest = g_malloc (outbuf_size);
 456
 457   while (!done && !have_error)
 458     {
 459       size_t inbytes_tmp = inbytes_remaining;
 460       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 461       inbytes_remaining = inbytes_tmp;
 462
 463       if (err == (size_t) -1)
 464         {
 465           switch (errno)
 466             {
 467             case EINVAL:
 468               g_assert_not_reached();
 469               break;
 470             case E2BIG:
 471               {
 472                 size_t used = outp - dest;
 473
 474                 outbuf_size *= 2;
 475                 dest = g_realloc (dest, outbuf_size);
 476
 477                 outp = dest + used;
 478                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 479
 480                 break;
 481               }
 482             case EILSEQ:
 483               if (save_p)
 484                 {
 485                   /* Error converting fallback string - fatal
 486                    */
 487                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 488                                _("Cannot convert fallback '%s' to codeset '%s'"),
 489                                insert_str, to_codeset);
 490                   have_error = TRUE;
 491                   break;
 492                 }
 493               else
 494                 {
 495                   if (!fallback)
 496                     {
 497                       gunichar ch = g_utf8_get_char (p);
 498                       insert_str = g_strdup_printf ("\\x{%0*X}",
 499                                                     (ch < 0x10000) ? 4 : 6,
 500                                                     ch);
 501                     }
 502                   else
 503                     insert_str = fallback;
 504
 505                   save_p = g_utf8_next_char (p);
 506                   save_inbytes = inbytes_remaining - (save_p - p);
 507                   p = insert_str;
 508                   inbytes_remaining = strlen (p);
 509                 }
 510               break;
 511             default:
 512               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 513                            _("Error during conversion: %s"),
 514                            strerror (errno));
 515               have_error = TRUE;
 516               break;
 517             }
 518         }
 519       else
 520         {
 521           if (save_p)
 522             {
 523               if (!fallback)
 524                 g_free ((gchar *)insert_str);
 525               p = save_p;
 526               inbytes_remaining = save_inbytes;
 527               save_p = NULL;
 528             }
 529           else
 530             done = TRUE;
 531         }
 532     }
 533
 534   /* Cleanup
 535    */
 536   *outp = '\0';
 537
 538   g_iconv_close (cd);
 539
 540   if (bytes_written)
 541     *bytes_written = outp - str;        /* Doesn't include '\0' */
 542
 543   g_free (utf8);
 544
 545   if (have_error)
 546     {
 547       if (save_p && !fallback)
 548         g_free ((gchar *)insert_str);
 549       g_free (dest);
 550       return NULL;
 551     }
 552   else
 553     return dest;
 554 }
 555
 556 /*
 557  * g_locale_to_utf8
 558  *
 559  *
 560  */
 561
 562 static gchar *
 563 strdup_len (const gchar *string,
 564             gssize       len,
 565             gsize       *bytes_written,
 566             gsize       *bytes_read)
 567
 568 {
 569   gsize real_len;
 570
 571   if (len < 0)
 572     real_len = strlen (string);
 573   else
 574     {
 575       real_len = 0;
 576
 577       while (real_len < len && string[real_len])
 578         real_len++;
 579     }
 580
 581   if (bytes_read)
 582     *bytes_read = real_len;
 583   if (bytes_written)
 584     *bytes_written = real_len;
 585
 586   return g_strndup (string, real_len);
 587 }
 588
 589 /**
 590  * g_locale_to_utf8:
 591  * @opsysstring:   a string in the encoding of the current locale
 592  * @len:           the length of the string, or -1 if the string is
 593  *                 NULL-terminated.
 594  * @bytes_read:    location to store the number of bytes in the
 595  *                 input string that were successfully converted, or %NULL.
 596  *                 Even if the conversion was succesful, this may be
 597  *                 less than len if there were partial characters
 598  *                 at the end of the input. If the error
 599  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 600  *                 stored will the byte fofset after the last valid
 601  *                 input sequence.
 602  * @bytes_written: the stored in the output buffer (not including the
 603  *                 terminating nul.
 604  * @error: location to store the error occuring, or %NULL to ignore
 605  *                 errors. Any of the errors in #GConvertError may occur.
 606  *
 607  * Converts a string which is in the encoding used for strings by
 608  * the C runtime (usually the same as that used by the operating
 609  * system) in the current locale into a UTF-8 string.
 610  *
 611  * Return value: The converted string, or %NULL on an error.
 612  **/
 613 gchar *
 614 g_locale_to_utf8 (const gchar  *opsysstring,
 615                   gssize        len,
 616                   gsize        *bytes_read,
 617                   gsize        *bytes_written,
 618                   GError      **error)
 619 {
 620 #ifdef G_PLATFORM_WIN32
 621
 622   gint i, clen, total_len, wclen, first;
 623   wchar_t *wcs, wc;
 624   gchar *result, *bp;
 625   const wchar_t *wcp;
 626
 627   if (len == -1)
 628     len = strlen (opsysstring);
 629
 630   wcs = g_new (wchar_t, len);
 631   wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
 632
 633   wcp = wcs;
 634   total_len = 0;
 635   for (i = 0; i < wclen; i++)
 636     {
 637       wc = *wcp++;
 638
 639       if (wc < 0x80)
 640         total_len += 1;
 641       else if (wc < 0x800)
 642         total_len += 2;
 643       else if (wc < 0x10000)
 644         total_len += 3;
 645       else if (wc < 0x200000)
 646         total_len += 4;
 647       else if (wc < 0x4000000)
 648         total_len += 5;
 649       else
 650         total_len += 6;
 651     }
 652
 653   result = g_malloc (total_len + 1);
 654
 655   wcp = wcs;
 656   bp = result;
 657   for (i = 0; i < wclen; i++)
 658     {
 659       wc = *wcp++;
 660
 661       if (wc < 0x80)
 662         {
 663           first = 0;
 664           clen = 1;
 665         }
 666       else if (wc < 0x800)
 667         {
 668           first = 0xc0;
 669           clen = 2;
 670         }
 671       else if (wc < 0x10000)
 672         {
 673           first = 0xe0;
 674           clen = 3;
 675         }
 676       else if (wc < 0x200000)
 677         {
 678           first = 0xf0;
 679           clen = 4;
 680         }
 681       else if (wc < 0x4000000)
 682         {
 683           first = 0xf8;
 684           clen = 5;
 685         }
 686       else
 687         {
 688           first = 0xfc;
 689           clen = 6;
 690         }
 691
 692       /* Woo-hoo! */
 693       switch (clen)
 694         {
 695         case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 696         case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 697         case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 698         case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 699         case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 700         case 1: bp[0] = wc | first;
 701         }
 702
 703       bp += clen;
 704     }
 705   *bp = 0;
 706
 707   g_free (wcs);
 708
 709   if (bytes_read)
 710     *bytes_read = len;
 711   if (bytes_written)
 712     *bytes_written = total_len;
 713
 714   return result;
 715
 716 #else  /* !G_PLATFORM_WIN32 */
 717
 718   const char *charset;
 719
 720   if (g_get_charset (&charset))
 721     return strdup_len (opsysstring, len, bytes_read, bytes_written);
 722   else
 723     return g_convert (opsysstring, len,
 724                       "UTF-8", charset, bytes_read, bytes_written, error);
 725
 726 #endif /* !G_PLATFORM_WIN32 */
 727 }
 728
 729 /**
 730  * g_locale_from_utf8:
 731  * @utf8string:    a UTF-8 encoded string
 732  * @len:           the length of the string, or -1 if the string is
 733  *                 NULL-terminated.
 734  * @bytes_read:    location to store the number of bytes in the
 735  *                 input string that were successfully converted, or %NULL.
 736  *                 Even if the conversion was succesful, this may be
 737  *                 less than len if there were partial characters
 738  *                 at the end of the input. If the error
 739  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 740  *                 stored will the byte fofset after the last valid
 741  *                 input sequence.
 742  * @bytes_written: the stored in the output buffer (not including the
 743  *                 terminating nul.
 744  * @error: location to store the error occuring, or %NULL to ignore
 745  *                 errors. Any of the errors in #GConvertError may occur.
 746  *
 747  * Converts a string from UTF-8 to the encoding used for strings by
 748  * the C runtime (usually the same as that used by the operating
 749  * system) in the current locale.
 750  *
 751  * Return value: The converted string, or %NULL on an error.
 752  **/
 753 gchar *
 754 g_locale_from_utf8 (const gchar *utf8string,
 755                     gssize       len,
 756                     gsize       *bytes_read,
 757                     gsize       *bytes_written,
 758                     GError     **error)
 759 {
 760 #ifdef G_PLATFORM_WIN32
 761
 762   gint i, mask, clen, mblen;
 763   wchar_t *wcs, *wcp;
 764   gchar *result;
 765   guchar *cp, *end, c;
 766   gint n;
 767
 768   if (len == -1)
 769     len = strlen (utf8string);
 770
 771   /* First convert to wide chars */
 772   cp = (guchar *) utf8string;
 773   end = cp + len;
 774   n = 0;
 775   wcs = g_new (wchar_t, len + 1);
 776   wcp = wcs;
 777   while (cp != end)
 778     {
 779       mask = 0;
 780       c = *cp;
 781
 782       if (c < 0x80)
 783         {
 784           clen = 1;
 785           mask = 0x7f;
 786         }
 787       else if ((c & 0xe0) == 0xc0)
 788         {
 789           clen = 2;
 790           mask = 0x1f;
 791         }
 792       else if ((c & 0xf0) == 0xe0)
 793         {
 794           clen = 3;
 795           mask = 0x0f;
 796         }
 797       else if ((c & 0xf8) == 0xf0)
 798         {
 799           clen = 4;
 800           mask = 0x07;
 801         }
 802       else if ((c & 0xfc) == 0xf8)
 803         {
 804           clen = 5;
 805           mask = 0x03;
 806         }
 807       else if ((c & 0xfc) == 0xfc)
 808         {
 809           clen = 6;
 810           mask = 0x01;
 811         }
 812       else
 813         {
 814           g_free (wcs);
 815           return NULL;
 816         }
 817
 818       if (cp + clen > end)
 819         {
 820           g_free (wcs);
 821           return NULL;
 822         }
 823
 824       *wcp = (cp[0] & mask);
 825       for (i = 1; i < clen; i++)
 826         {
 827           if ((cp[i] & 0xc0) != 0x80)
 828             {
 829               g_free (wcs);
 830               return NULL;
 831             }
 832           *wcp <<= 6;
 833           *wcp |= (cp[i] & 0x3f);
 834         }
 835
 836       cp += clen;
 837       wcp++;
 838       n++;
 839     }
 840   if (cp != end)
 841     {
 842       g_free (wcs);
 843       return NULL;
 844     }
 845
 846   /* n is the number of wide chars constructed */
 847
 848   /* Convert to a string in the current ANSI codepage */
 849
 850   result = g_new (gchar, 3 * n + 1);
 851   mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
 852   result[mblen] = 0;
 853   g_free (wcs);
 854
 855   if (bytes_read)
 856     *bytes_read = len;
 857   if (bytes_written)
 858     *bytes_written = mblen;
 859
 860   return result;
 861
 862 #else  /* !G_PLATFORM_WIN32 */
 863
 864   const gchar *charset;
 865
 866   if (g_get_charset (&charset))
 867     return strdup_len (utf8string, len, bytes_read, bytes_written);
 868   else
 869     return g_convert (utf8string, len,
 870                       charset, "UTF-8", bytes_read, bytes_written, error);
 871
 872 #endif /* !G_PLATFORM_WIN32 */
 873 }
 874
 875 /**
 876  * g_filename_to_utf8:
 877  * @opsysstring:   a string in the encoding for filenames
 878  * @len:           the length of the string, or -1 if the string is
 879  *                 NULL-terminated.
 880  * @bytes_read:    location to store the number of bytes in the
 881  *                 input string that were successfully converted, or %NULL.
 882  *                 Even if the conversion was succesful, this may be
 883  *                 less than len if there were partial characters
 884  *                 at the end of the input. If the error
 885  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 886  *                 stored will the byte fofset after the last valid
 887  *                 input sequence.
 888  * @bytes_written: the stored in the output buffer (not including the
 889  *                 terminating nul.
 890  * @error: location to store the error occuring, or %NULL to ignore
 891  *                 errors. Any of the errors in #GConvertError may occur.
 892  *
 893  * Converts a string which is in the encoding used for filenames
 894  * into a UTF-8 string.
 895  *
 896  * Return value: The converted string, or %NULL on an error.
 897  **/
 898 gchar*
 899 g_filename_to_utf8 (const gchar *opsysstring,
 900                     gssize       len,
 901                     gsize       *bytes_read,
 902                     gsize       *bytes_written,
 903                     GError     **error)
 904 {
 905 #ifdef G_PLATFORM_WIN32
 906   return g_locale_to_utf8 (opsysstring, len,
 907                            bytes_read, bytes_written,
 908                            error);
 909 #else  /* !G_PLATFORM_WIN32 */
 910   if (getenv ("G_BROKEN_FILENAMES"))
 911     return g_locale_to_utf8 (opsysstring, len,
 912                              bytes_read, bytes_written,
 913                              error);
 914   else
 915     return strdup_len (opsysstring, len, bytes_read, bytes_written);
 916 #endif /* !G_PLATFORM_WIN32 */
 917 }
 918
 919 /**
 920  * g_filename_from_utf8:
 921  * @utf8string:    a UTF-8 encoded string
 922  * @len:           the length of the string, or -1 if the string is
 923  *                 NULL-terminated.
 924  * @bytes_read:    location to store the number of bytes in the
 925  *                 input string that were successfully converted, or %NULL.
 926  *                 Even if the conversion was succesful, this may be
 927  *                 less than len if there were partial characters
 928  *                 at the end of the input. If the error
 929  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 930  *                 stored will the byte fofset after the last valid
 931  *                 input sequence.
 932  * @bytes_written: the stored in the output buffer (not including the
 933  *                 terminating nul.
 934  * @error: location to store the error occuring, or %NULL to ignore
 935  *                 errors. Any of the errors in #GConvertError may occur.
 936  *
 937  * Converts a string from UTF-8 to the encoding used for filenames.
 938  *
 939  * Return value: The converted string, or %NULL on an error.
 940  **/
 941 gchar*
 942 g_filename_from_utf8 (const gchar *utf8string,
 943                       gssize       len,
 944                       gsize       *bytes_read,
 945                       gsize       *bytes_written,
 946                       GError     **error)
 947 {
 948 #ifdef G_PLATFORM_WIN32
 949   return g_locale_from_utf8 (utf8string, len,
 950                              bytes_read, bytes_written,
 951                              error);
 952 #else  /* !G_PLATFORM_WIN32 */
 953   if (getenv ("G_BROKEN_FILENAMES"))
 954     return g_locale_from_utf8 (utf8string, len,
 955                                bytes_read, bytes_written,
 956                                error);
 957   else
 958     return strdup_len (utf8string, len, bytes_read, bytes_written);
 959 #endif /* !G_PLATFORM_WIN32 */
 960 }