gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
   5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <iconv.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <stdlib.h>
  27
  28 #include "glib.h"
  29 #include "config.h"
  30
  31 #ifdef G_PLATFORM_WIN32
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "glibintl.h"
  38
  39 GQuark
  40 g_convert_error_quark()
  41 {
  42   static GQuark quark;
  43   if (!quark)
  44     quark = g_quark_from_static_string ("g_convert_error");
  45
  46   return quark;
  47 }
  48
  49 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
  50 #error libiconv in use but included iconv.h not from libiconv
  51 #endif
  52 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
  53 #error libiconv not in use but included iconv.h is from libiconv
  54 #endif
  55
  56 /**
  57  * g_iconv_open:
  58  * @to_codeset: destination codeset
  59  * @from_codeset: source codeset
  60  *
  61  * Same as the standard UNIX routine iconv_open(), but
  62  * may be implemented via libiconv on UNIX flavors that lack
  63  * a native implementation.
  64  *
  65  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  66  * more convenient than the raw iconv wrappers.
  67  *
  68  * Return value: a "conversion descriptor"
  69  **/
  70 GIConv
  71 g_iconv_open (const gchar  *to_codeset,
  72               const gchar  *from_codeset)
  73 {
  74   iconv_t cd = iconv_open (to_codeset, from_codeset);
  75
  76   return (GIConv)cd;
  77 }
  78
  79 /**
  80  * g_iconv:
  81  * @converter: conversion descriptor from g_iconv_open()
  82  * @inbuf: bytes to convert
  83  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
  84  * @outbuf: converted output bytes
  85  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
  86  *
  87  * Same as the standard UNIX routine iconv(), but
  88  * may be implemented via libiconv on UNIX flavors that lack
  89  * a native implementation.
  90  *
  91  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  92  * more convenient than the raw iconv wrappers.
  93  *
  94  * Return value: count of non-reversible conversions, or -1 on error
  95  **/
  96 size_t
  97 g_iconv (GIConv   converter,
  98          gchar  **inbuf,
  99          gsize   *inbytes_left,
 100          gchar  **outbuf,
 101          gsize   *outbytes_left)
 102 {
 103   iconv_t cd = (iconv_t)converter;
 104
 105   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 106 }
 107
 108 /**
 109  * g_iconv_close:
 110  * @converter: a conversion descriptor from g_iconv_open()
 111  *
 112  * Same as the standard UNIX routine iconv_close(), but
 113  * may be implemented via libiconv on UNIX flavors that lack
 114  * a native implementation. Should be called to clean up
 115  * the conversion descriptor from iconv_open() when
 116  * you are done converting things.
 117  *
 118  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 119  * more convenient than the raw iconv wrappers.
 120  *
 121  * Return value: -1 on error, 0 on success
 122  **/
 123 gint
 124 g_iconv_close (GIConv converter)
 125 {
 126   iconv_t cd = (iconv_t)converter;
 127
 128   return iconv_close (cd);
 129 }
 130
 131 static GIConv
 132 open_converter (const gchar *to_codeset,
 133                 const gchar *from_codeset,
 134                 GError     **error)
 135 {
 136   GIConv cd = g_iconv_open (to_codeset, from_codeset);
 137
 138   if (cd == (iconv_t) -1)
 139     {
 140       /* Something went wrong.  */
 141       if (errno == EINVAL)
 142         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 143                      _("Conversion from character set `%s' to `%s' is not supported"),
 144                      from_codeset, to_codeset);
 145       else
 146         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 147                      _("Could not open converter from `%s' to `%s': %s"),
 148                      from_codeset, to_codeset, strerror (errno));
 149     }
 150
 151   return cd;
 152
 153 }
 154
 155 /**
 156  * g_convert:
 157  * @str:           the string to convert
 158  * @len:           the length of the string
 159  * @to_codeset:    name of character set into which to convert @str
 160  * @from_codeset:  character set of @str.
 161  * @bytes_read:    location to store the number of bytes in the
 162  *                 input string that were successfully converted, or %NULL.
 163  *                 Even if the conversion was succesful, this may be
 164  *                 less than len if there were partial characters
 165  *                 at the end of the input. If the error
 166  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 167  *                 stored will the byte fofset after the last valid
 168  *                 input sequence.
 169  * @bytes_written: the stored in the output buffer (not including the
 170  *                 terminating nul.
 171  * @error:         location to store the error occuring, or %NULL to ignore
 172  *                 errors. Any of the errors in #GConvertError may occur.
 173  *
 174  * Convert a string from one character set to another.
 175  *
 176  * Return value: If the conversion was successful, a newly allocated
 177  *               NUL-terminated string, which must be freed with
 178  *               g_free. Otherwise %NULL and @error will be set.
 179  **/
 180 gchar*
 181 g_convert (const gchar *str,
 182            gssize       len,
 183            const gchar *to_codeset,
 184            const gchar *from_codeset,
 185            gsize       *bytes_read,
 186            gsize       *bytes_written,
 187            GError     **error)
 188 {
 189   gchar *dest;
 190   gchar *outp;
 191   const gchar *p;
 192   gsize inbytes_remaining;
 193   gsize outbytes_remaining;
 194   gsize err;
 195   GIConv cd;
 196   gsize outbuf_size;
 197   gboolean have_error = FALSE;
 198
 199   g_return_val_if_fail (str != NULL, NULL);
 200   g_return_val_if_fail (to_codeset != NULL, NULL);
 201   g_return_val_if_fail (from_codeset != NULL, NULL);
 202
 203   cd = open_converter (to_codeset, from_codeset, error);
 204
 205   if (cd == (GIConv) -1)
 206     {
 207       if (bytes_read)
 208         *bytes_read = 0;
 209
 210       if (bytes_written)
 211         *bytes_written = 0;
 212
 213       return NULL;
 214     }
 215
 216   if (len < 0)
 217     len = strlen (str);
 218
 219   p = str;
 220   inbytes_remaining = len;
 221   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 222
 223   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 224   outp = dest = g_malloc (outbuf_size);
 225
 226  again:
 227
 228   err = g_iconv (cd, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 229
 230   if (err == (size_t) -1)
 231     {
 232       switch (errno)
 233         {
 234         case EINVAL:
 235           /* Incomplete text, do not report an error */
 236           break;
 237         case E2BIG:
 238           {
 239             size_t used = outp - dest;
 240
 241             outbuf_size *= 2;
 242             dest = g_realloc (dest, outbuf_size);
 243
 244             outp = dest + used;
 245             outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 246
 247             goto again;
 248           }
 249         case EILSEQ:
 250           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 251                        _("Invalid byte sequence in conversion input"));
 252           have_error = TRUE;
 253           break;
 254         default:
 255           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 256                        _("Error during conversion: %s"),
 257                        strerror (errno));
 258           have_error = TRUE;
 259           break;
 260         }
 261     }
 262
 263   *outp = '\0';
 264
 265   g_iconv_close (cd);
 266
 267   if (bytes_read)
 268     *bytes_read = p - str;
 269   else
 270     {
 271       if ((p - str) != len)
 272         {
 273           if (!have_error)
 274             {
 275               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 276                            _("Partial character sequence at end of input"));
 277               have_error = TRUE;
 278             }
 279         }
 280     }
 281
 282   if (bytes_written)
 283     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 284
 285   if (have_error)
 286     {
 287       g_free (dest);
 288       return NULL;
 289     }
 290   else
 291     return dest;
 292 }
 293
 294 /**
 295  * g_convert_with_fallback:
 296  * @str:          the string to convert
 297  * @len:          the length of the string
 298  * @to_codeset:   name of character set into which to convert @str
 299  * @from_codeset: character set of @str.
 300  * @fallback:     UTF-8 string to use in place of character not
 301  *                present in the target encoding. (This must be
 302  *                in the target encoding), if %NULL, characters
 303  *                not in the target encoding will be represented
 304  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 305  * @bytes_read:   location to store the number of bytes in the
 306  *                input string that were successfully converted, or %NULL.
 307  *                Even if the conversion was succesful, this may be
 308  *                less than len if there were partial characters
 309  *                at the end of the input.
 310  * @bytes_written: the stored in the output buffer (not including the
 311  *                 terminating nul.
 312  * @error:        location to store the error occuring, or %NULL to ignore
 313  *                errors. Any of the errors in #GConvertError may occur.
 314  *
 315  * Convert a string from one character set to another, possibly
 316  * including fallback sequences for characters not representable
 317  * in the output. Note that it is not guaranteed that the specification
 318  * for the fallback sequences in @fallback will be honored. Some
 319  * systems may do a approximate conversion from @from_codeset
 320  * to @to_codeset in their iconv() functions, in which case GLib
 321  * will simply return that approximate conversion.
 322  *
 323  * Return value: If the conversion was successful, a newly allocated
 324  *               NUL-terminated string, which must be freed with
 325  *               g_free. Otherwise %NULL and @error will be set.
 326  **/
 327 gchar*
 328 g_convert_with_fallback (const gchar *str,
 329                          gssize       len,
 330                          const gchar *to_codeset,
 331                          const gchar *from_codeset,
 332                          gchar       *fallback,
 333                          gsize       *bytes_read,
 334                          gsize       *bytes_written,
 335                          GError     **error)
 336 {
 337   gchar *utf8;
 338   gchar *dest;
 339   gchar *outp;
 340   const gchar *insert_str = NULL;
 341   const gchar *p;
 342   gsize inbytes_remaining;
 343   const gchar *save_p = NULL;
 344   gsize save_inbytes = 0;
 345   gsize outbytes_remaining;
 346   gsize err;
 347   GIConv cd;
 348   gsize outbuf_size;
 349   gboolean have_error = FALSE;
 350   gboolean done = FALSE;
 351
 352   GError *local_error = NULL;
 353
 354   g_return_val_if_fail (str != NULL, NULL);
 355   g_return_val_if_fail (to_codeset != NULL, NULL);
 356   g_return_val_if_fail (from_codeset != NULL, NULL);
 357
 358   if (len < 0)
 359     len = strlen (str);
 360
 361   /* Try an exact conversion; we only proceed if this fails
 362    * due to an illegal sequence in the input string.
 363    */
 364   dest = g_convert (str, len, to_codeset, from_codeset,
 365                     bytes_read, bytes_written, &local_error);
 366   if (!local_error)
 367     return dest;
 368
 369   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 370     {
 371       g_propagate_error (error, local_error);
 372       return NULL;
 373     }
 374   else
 375     g_error_free (local_error);
 376
 377   local_error = NULL;
 378
 379   /* No go; to proceed, we need a converter from "UTF-8" to
 380    * to_codeset, and the string as UTF-8.
 381    */
 382   cd = open_converter (to_codeset, "UTF-8", error);
 383   if (cd == (GIConv) -1)
 384     {
 385       if (bytes_read)
 386         *bytes_read = 0;
 387
 388       if (bytes_written)
 389         *bytes_written = 0;
 390
 391       return NULL;
 392     }
 393
 394   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 395                     bytes_read, &inbytes_remaining, error);
 396   if (!utf8)
 397     return NULL;
 398
 399   /* Now the heart of the code. We loop through the UTF-8 string, and
 400    * whenever we hit an offending character, we form fallback, convert
 401    * the fallback to the target codeset, and then go back to
 402    * converting the original string after finishing with the fallback.
 403    *
 404    * The variables save_p and save_inbytes store the input state
 405    * for the original string while we are converting the fallback
 406    */
 407   p = utf8;
 408
 409   outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
 410   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 411   outp = dest = g_malloc (outbuf_size);
 412
 413   while (!done && !have_error)
 414     {
 415       size_t inbytes_tmp = inbytes_remaining;
 416       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 417       inbytes_remaining = inbytes_tmp;
 418
 419       if (err == (size_t) -1)
 420         {
 421           switch (errno)
 422             {
 423             case EINVAL:
 424               g_assert_not_reached();
 425               break;
 426             case E2BIG:
 427               {
 428                 size_t used = outp - dest;
 429
 430                 outbuf_size *= 2;
 431                 dest = g_realloc (dest, outbuf_size);
 432
 433                 outp = dest + used;
 434                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 435
 436                 break;
 437               }
 438             case EILSEQ:
 439               if (save_p)
 440                 {
 441                   /* Error converting fallback string - fatal
 442                    */
 443                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 444                                _("Cannot convert fallback '%s' to codeset '%s'"),
 445                                insert_str, to_codeset);
 446                   have_error = TRUE;
 447                   break;
 448                 }
 449               else
 450                 {
 451                   if (!fallback)
 452                     {
 453                       gunichar ch = g_utf8_get_char (p);
 454                       insert_str = g_strdup_printf ("\\x{%0*X}",
 455                                                     (ch < 0x10000) ? 4 : 6,
 456                                                     ch);
 457                     }
 458                   else
 459                     insert_str = fallback;
 460
 461                   save_p = g_utf8_next_char (p);
 462                   save_inbytes = inbytes_remaining - (save_p - p);
 463                   p = insert_str;
 464                   inbytes_remaining = strlen (p);
 465                 }
 466               break;
 467             default:
 468               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 469                            _("Error during conversion: %s"),
 470                            strerror (errno));
 471               have_error = TRUE;
 472               break;
 473             }
 474         }
 475       else
 476         {
 477           if (save_p)
 478             {
 479               if (!fallback)
 480                 g_free ((gchar *)insert_str);
 481               p = save_p;
 482               inbytes_remaining = save_inbytes;
 483               save_p = NULL;
 484             }
 485           else
 486             done = TRUE;
 487         }
 488     }
 489
 490   /* Cleanup
 491    */
 492   *outp = '\0';
 493
 494   g_iconv_close (cd);
 495
 496   if (bytes_written)
 497     *bytes_written = outp - str;        /* Doesn't include '\0' */
 498
 499   g_free (utf8);
 500
 501   if (have_error)
 502     {
 503       if (save_p && !fallback)
 504         g_free ((gchar *)insert_str);
 505       g_free (dest);
 506       return NULL;
 507     }
 508   else
 509     return dest;
 510 }
 511
 512 /*
 513  * g_locale_to_utf8
 514  *
 515  *
 516  */
 517
 518 static gchar *
 519 strdup_len (const gchar *string,
 520             gssize       len,
 521             gsize       *bytes_written,
 522             gsize       *bytes_read)
 523
 524 {
 525   gsize real_len;
 526
 527   if (len < 0)
 528     real_len = strlen (string);
 529   else
 530     {
 531       real_len = 0;
 532
 533       while (real_len < len && string[real_len])
 534         real_len++;
 535     }
 536
 537   if (bytes_read)
 538     *bytes_read = real_len;
 539   if (bytes_written)
 540     *bytes_written = real_len;
 541
 542   return g_strndup (string, real_len);
 543 }
 544
 545 /**
 546  * g_locale_to_utf8:
 547  * @opsysstring:   a string in the encoding of the current locale
 548  * @len:           the length of the string, or -1 if the string is
 549  *                 NULL-terminated.
 550  * @bytes_read:    location to store the number of bytes in the
 551  *                 input string that were successfully converted, or %NULL.
 552  *                 Even if the conversion was succesful, this may be
 553  *                 less than len if there were partial characters
 554  *                 at the end of the input. If the error
 555  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 556  *                 stored will the byte fofset after the last valid
 557  *                 input sequence.
 558  * @bytes_written: the stored in the output buffer (not including the
 559  *                 terminating nul.
 560  * @error: location to store the error occuring, or %NULL to ignore
 561  *                 errors. Any of the errors in #GConvertError may occur.
 562  *
 563  * Converts a string which is in the encoding used for strings by
 564  * the C runtime (usually the same as that used by the operating
 565  * system) in the current locale into a UTF-8 string.
 566  *
 567  * Return value: The converted string, or %NULL on an error.
 568  **/
 569 gchar *
 570 g_locale_to_utf8 (const gchar  *opsysstring,
 571                   gssize        len,
 572                   gsize        *bytes_read,
 573                   gsize        *bytes_written,
 574                   GError      **error)
 575 {
 576 #ifdef G_PLATFORM_WIN32
 577
 578   gint i, clen, total_len, wclen, first;
 579   wchar_t *wcs, wc;
 580   gchar *result, *bp;
 581   const wchar_t *wcp;
 582
 583   if (len == -1)
 584     len = strlen (opsysstring);
 585
 586   wcs = g_new (wchar_t, len);
 587   wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
 588
 589   wcp = wcs;
 590   total_len = 0;
 591   for (i = 0; i < wclen; i++)
 592     {
 593       wc = *wcp++;
 594
 595       if (wc < 0x80)
 596         total_len += 1;
 597       else if (wc < 0x800)
 598         total_len += 2;
 599       else if (wc < 0x10000)
 600         total_len += 3;
 601       else if (wc < 0x200000)
 602         total_len += 4;
 603       else if (wc < 0x4000000)
 604         total_len += 5;
 605       else
 606         total_len += 6;
 607     }
 608
 609   result = g_malloc (total_len + 1);
 610
 611   wcp = wcs;
 612   bp = result;
 613   for (i = 0; i < wclen; i++)
 614     {
 615       wc = *wcp++;
 616
 617       if (wc < 0x80)
 618         {
 619           first = 0;
 620           clen = 1;
 621         }
 622       else if (wc < 0x800)
 623         {
 624           first = 0xc0;
 625           clen = 2;
 626         }
 627       else if (wc < 0x10000)
 628         {
 629           first = 0xe0;
 630           clen = 3;
 631         }
 632       else if (wc < 0x200000)
 633         {
 634           first = 0xf0;
 635           clen = 4;
 636         }
 637       else if (wc < 0x4000000)
 638         {
 639           first = 0xf8;
 640           clen = 5;
 641         }
 642       else
 643         {
 644           first = 0xfc;
 645           clen = 6;
 646         }
 647
 648       /* Woo-hoo! */
 649       switch (clen)
 650         {
 651         case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 652         case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 653         case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 654         case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 655         case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 656         case 1: bp[0] = wc | first;
 657         }
 658
 659       bp += clen;
 660     }
 661   *bp = 0;
 662
 663   g_free (wcs);
 664
 665   if (bytes_read)
 666     *bytes_read = len;
 667   if (bytes_written)
 668     *bytes_written = total_len;
 669
 670   return result;
 671
 672 #else  /* !G_PLATFORM_WIN32 */
 673
 674   const char *charset;
 675
 676   if (g_get_charset (&charset))
 677     return strdup_len (opsysstring, len, bytes_read, bytes_written);
 678   else
 679     return g_convert (opsysstring, len,
 680                       "UTF-8", charset, bytes_read, bytes_written, error);
 681
 682 #endif /* !G_PLATFORM_WIN32 */
 683 }
 684
 685 /**
 686  * g_locale_from_utf8:
 687  * @utf8string:    a UTF-8 encoded string
 688  * @len:           the length of the string, or -1 if the string is
 689  *                 NULL-terminated.
 690  * @bytes_read:    location to store the number of bytes in the
 691  *                 input string that were successfully converted, or %NULL.
 692  *                 Even if the conversion was succesful, this may be
 693  *                 less than len if there were partial characters
 694  *                 at the end of the input. If the error
 695  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 696  *                 stored will the byte fofset after the last valid
 697  *                 input sequence.
 698  * @bytes_written: the stored in the output buffer (not including the
 699  *                 terminating nul.
 700  * @error: location to store the error occuring, or %NULL to ignore
 701  *                 errors. Any of the errors in #GConvertError may occur.
 702  *
 703  * Converts a string from UTF-8 to the encoding used for strings by
 704  * the C runtime (usually the same as that used by the operating
 705  * system) in the current locale.
 706  *
 707  * Return value: The converted string, or %NULL on an error.
 708  **/
 709 gchar *
 710 g_locale_from_utf8 (const gchar *utf8string,
 711                     gssize       len,
 712                     gsize       *bytes_read,
 713                     gsize       *bytes_written,
 714                     GError     **error)
 715 {
 716 #ifdef G_PLATFORM_WIN32
 717
 718   gint i, mask, clen, mblen;
 719   wchar_t *wcs, *wcp;
 720   gchar *result;
 721   guchar *cp, *end, c;
 722   gint n;
 723
 724   if (len == -1)
 725     len = strlen (utf8string);
 726
 727   /* First convert to wide chars */
 728   cp = (guchar *) utf8string;
 729   end = cp + len;
 730   n = 0;
 731   wcs = g_new (wchar_t, len + 1);
 732   wcp = wcs;
 733   while (cp != end)
 734     {
 735       mask = 0;
 736       c = *cp;
 737
 738       if (c < 0x80)
 739         {
 740           clen = 1;
 741           mask = 0x7f;
 742         }
 743       else if ((c & 0xe0) == 0xc0)
 744         {
 745           clen = 2;
 746           mask = 0x1f;
 747         }
 748       else if ((c & 0xf0) == 0xe0)
 749         {
 750           clen = 3;
 751           mask = 0x0f;
 752         }
 753       else if ((c & 0xf8) == 0xf0)
 754         {
 755           clen = 4;
 756           mask = 0x07;
 757         }
 758       else if ((c & 0xfc) == 0xf8)
 759         {
 760           clen = 5;
 761           mask = 0x03;
 762         }
 763       else if ((c & 0xfc) == 0xfc)
 764         {
 765           clen = 6;
 766           mask = 0x01;
 767         }
 768       else
 769         {
 770           g_free (wcs);
 771           return NULL;
 772         }
 773
 774       if (cp + clen > end)
 775         {
 776           g_free (wcs);
 777           return NULL;
 778         }
 779
 780       *wcp = (cp[0] & mask);
 781       for (i = 1; i < clen; i++)
 782         {
 783           if ((cp[i] & 0xc0) != 0x80)
 784             {
 785               g_free (wcs);
 786               return NULL;
 787             }
 788           *wcp <<= 6;
 789           *wcp |= (cp[i] & 0x3f);
 790         }
 791
 792       cp += clen;
 793       wcp++;
 794       n++;
 795     }
 796   if (cp != end)
 797     {
 798       g_free (wcs);
 799       return NULL;
 800     }
 801
 802   /* n is the number of wide chars constructed */
 803
 804   /* Convert to a string in the current ANSI codepage */
 805
 806   result = g_new (gchar, 3 * n + 1);
 807   mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
 808   result[mblen] = 0;
 809   g_free (wcs);
 810
 811   if (bytes_read)
 812     *bytes_read = len;
 813   if (bytes_written)
 814     *bytes_written = mblen;
 815
 816   return result;
 817
 818 #else  /* !G_PLATFORM_WIN32 */
 819
 820   const gchar *charset;
 821
 822   if (g_get_charset (&charset))
 823     return strdup_len (utf8string, len, bytes_read, bytes_written);
 824   else
 825     return g_convert (utf8string, len,
 826                       charset, "UTF-8", bytes_read, bytes_written, error);
 827
 828 #endif /* !G_PLATFORM_WIN32 */
 829 }
 830
 831 /**
 832  * g_filename_to_utf8:
 833  * @opsysstring:   a string in the encoding for filenames
 834  * @len:           the length of the string, or -1 if the string is
 835  *                 NULL-terminated.
 836  * @bytes_read:    location to store the number of bytes in the
 837  *                 input string that were successfully converted, or %NULL.
 838  *                 Even if the conversion was succesful, this may be
 839  *                 less than len if there were partial characters
 840  *                 at the end of the input. If the error
 841  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 842  *                 stored will the byte fofset after the last valid
 843  *                 input sequence.
 844  * @bytes_written: the stored in the output buffer (not including the
 845  *                 terminating nul.
 846  * @error: location to store the error occuring, or %NULL to ignore
 847  *                 errors. Any of the errors in #GConvertError may occur.
 848  *
 849  * Converts a string which is in the encoding used for filenames
 850  * into a UTF-8 string.
 851  *
 852  * Return value: The converted string, or %NULL on an error.
 853  **/
 854 gchar*
 855 g_filename_to_utf8 (const gchar *opsysstring,
 856                     gssize       len,
 857                     gsize       *bytes_read,
 858                     gsize       *bytes_written,
 859                     GError     **error)
 860 {
 861 #ifdef G_PLATFORM_WIN32
 862   return g_locale_to_utf8 (opsysstring, len,
 863                            bytes_read, bytes_written,
 864                            error);
 865 #else  /* !G_PLATFORM_WIN32 */
 866   if (getenv ("G_BROKEN_FILENAMES"))
 867     return g_locale_to_utf8 (opsysstring, len,
 868                              bytes_read, bytes_written,
 869                              error);
 870   else
 871     return strdup_len (opsysstring, len, bytes_read, bytes_written);
 872 #endif /* !G_PLATFORM_WIN32 */
 873 }
 874
 875 /**
 876  * g_filename_from_utf8:
 877  * @utf8string:    a UTF-8 encoded string
 878  * @len:           the length of the string, or -1 if the string is
 879  *                 NULL-terminated.
 880  * @bytes_read:    location to store the number of bytes in the
 881  *                 input string that were successfully converted, or %NULL.
 882  *                 Even if the conversion was succesful, this may be
 883  *                 less than len if there were partial characters
 884  *                 at the end of the input. If the error
 885  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 886  *                 stored will the byte fofset after the last valid
 887  *                 input sequence.
 888  * @bytes_written: the stored in the output buffer (not including the
 889  *                 terminating nul.
 890  * @error: location to store the error occuring, or %NULL to ignore
 891  *                 errors. Any of the errors in #GConvertError may occur.
 892  *
 893  * Converts a string from UTF-8 to the encoding used for filenames.
 894  *
 895  * Return value: The converted string, or %NULL on an error.
 896  **/
 897 gchar*
 898 g_filename_from_utf8 (const gchar *utf8string,
 899                       gssize       len,
 900                       gsize       *bytes_read,
 901                       gsize       *bytes_written,
 902                       GError     **error)
 903 {
 904 #ifdef G_PLATFORM_WIN32
 905   return g_locale_from_utf8 (utf8string, len,
 906                              bytes_read, bytes_written,
 907                              error);
 908 #else  /* !G_PLATFORM_WIN32 */
 909   if (getenv ("G_BROKEN_FILENAMES"))
 910     return g_locale_from_utf8 (utf8string, len,
 911                                bytes_read, bytes_written,
 912                                error);
 913   else
 914     return strdup_len (utf8string, len, bytes_read, bytes_written);
 915 #endif /* !G_PLATFORM_WIN32 */
 916 }