glib/gconvert.c

   1 /* GLIB - Library of useful routines for C programming
   2  *
   3  * gconvert.c: Convert between character sets using iconv
   4  * Copyright Red Hat Inc., 2000
   5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com
   6  *
   7  * This library is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2 of the License, or (at your option) any later version.
  11  *
  12  * This library is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with this library; if not, write to the
  19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include <iconv.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <stdlib.h>
  27
  28 #include "glib.h"
  29 #include "config.h"
  30
  31 #ifdef G_PLATFORM_WIN32
  32 #define STRICT
  33 #include <windows.h>
  34 #undef STRICT
  35 #endif
  36
  37 #include "glibintl.h"
  38
  39 GQuark
  40 g_convert_error_quark()
  41 {
  42   static GQuark quark;
  43   if (!quark)
  44     quark = g_quark_from_static_string ("g_convert_error");
  45
  46   return quark;
  47 }
  48
  49 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
  50 #error libiconv in use but included iconv.h not from libiconv
  51 #endif
  52 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
  53 #error libiconv not in use but included iconv.h is from libiconv
  54 #endif
  55
  56 /**
  57  * g_iconv_open:
  58  * @to_codeset: destination codeset
  59  * @from_codeset: source codeset
  60  *
  61  * Same as the standard UNIX routine iconv_open(), but
  62  * may be implemented via libiconv on UNIX flavors that lack
  63  * a native implementation.
  64  *
  65  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  66  * more convenient than the raw iconv wrappers.
  67  *
  68  * Return value: a "conversion descriptor"
  69  **/
  70 GIConv
  71 g_iconv_open (const gchar  *to_codeset,
  72               const gchar  *from_codeset)
  73 {
  74   iconv_t cd = iconv_open (to_codeset, from_codeset);
  75
  76   return (GIConv)cd;
  77 }
  78
  79 /**
  80  * g_iconv:
  81  * @converter: conversion descriptor from g_iconv_open()
  82  * @inbuf: bytes to convert
  83  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
  84  * @outbuf: converted output bytes
  85  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
  86  *
  87  * Same as the standard UNIX routine iconv(), but
  88  * may be implemented via libiconv on UNIX flavors that lack
  89  * a native implementation.
  90  *
  91  * GLib provides g_convert() and g_locale_to_utf8() which are likely
  92  * more convenient than the raw iconv wrappers.
  93  *
  94  * Return value: count of non-reversible conversions, or -1 on error
  95  **/
  96 size_t
  97 g_iconv (GIConv   converter,
  98          gchar  **inbuf,
  99          size_t  *inbytes_left,
 100          gchar  **outbuf,
 101          size_t  *outbytes_left)
 102 {
 103   iconv_t cd = (iconv_t)converter;
 104
 105   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
 106 }
 107
 108 /**
 109  * g_iconv_close:
 110  * @converter: a conversion descriptor from g_iconv_open()
 111  *
 112  * Same as the standard UNIX routine iconv_close(), but
 113  * may be implemented via libiconv on UNIX flavors that lack
 114  * a native implementation. Should be called to clean up
 115  * the conversion descriptor from iconv_open() when
 116  * you are done converting things.
 117  *
 118  * GLib provides g_convert() and g_locale_to_utf8() which are likely
 119  * more convenient than the raw iconv wrappers.
 120  *
 121  * Return value: -1 on error, 0 on success
 122  **/
 123 gint
 124 g_iconv_close (GIConv converter)
 125 {
 126   iconv_t cd = (iconv_t)converter;
 127
 128   return iconv_close (cd);
 129 }
 130
 131 static GIConv
 132 open_converter (const gchar *to_codeset,
 133                 const gchar *from_codeset,
 134                 GError     **error)
 135 {
 136   GIConv cd = g_iconv_open (to_codeset, from_codeset);
 137
 138   if (cd == (iconv_t) -1)
 139     {
 140       /* Something went wrong.  */
 141       if (errno == EINVAL)
 142         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
 143                      _("Conversion from character set `%s' to `%s' is not supported"),
 144                      from_codeset, to_codeset);
 145       else
 146         g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 147                      _("Could not open converter from `%s' to `%s': %s"),
 148                      from_codeset, to_codeset, strerror (errno));
 149     }
 150
 151   return cd;
 152
 153 }
 154
 155 /**
 156  * g_convert:
 157  * @str:           the string to convert
 158  * @len:           the length of the string
 159  * @to_codeset:    name of character set into which to convert @str
 160  * @from_codeset:  character set of @str.
 161  * @bytes_read:    location to store the number of bytes in the
 162  *                 input string that were successfully converted, or %NULL.
 163  *                 Even if the conversion was succesful, this may be
 164  *                 less than len if there were partial characters
 165  *                 at the end of the input. If the error
 166  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 167  *                 stored will the byte fofset after the last valid
 168  *                 input sequence.
 169  * @bytes_written: the stored in the output buffer (not including the
 170  *                 terminating nul.
 171  * @error:         location to store the error occuring, or %NULL to ignore
 172  *                 errors. Any of the errors in #GConvertError may occur.
 173  *
 174  * Convert a string from one character set to another.
 175  *
 176  * Return value: If the conversion was successful, a newly allocated
 177  *               NUL-terminated string, which must be freed with
 178  *               g_free. Otherwise %NULL and @error will be set.
 179  **/
 180 gchar*
 181 g_convert (const gchar *str,
 182            gint         len,
 183            const gchar *to_codeset,
 184            const gchar *from_codeset,
 185            gint        *bytes_read,
 186            gint        *bytes_written,
 187            GError     **error)
 188 {
 189   gchar *dest;
 190   gchar *outp;
 191   const gchar *p;
 192   size_t inbytes_remaining;
 193   size_t outbytes_remaining;
 194   size_t err;
 195   GIConv cd;
 196   size_t outbuf_size;
 197   gboolean have_error = FALSE;
 198
 199   g_return_val_if_fail (str != NULL, NULL);
 200   g_return_val_if_fail (to_codeset != NULL, NULL);
 201   g_return_val_if_fail (from_codeset != NULL, NULL);
 202
 203   cd = open_converter (to_codeset, from_codeset, error);
 204
 205   if (cd == (GIConv) -1)
 206     {
 207       if (bytes_read)
 208         *bytes_read = 0;
 209
 210       if (bytes_written)
 211         *bytes_written = 0;
 212
 213       return NULL;
 214     }
 215
 216   if (len < 0)
 217     len = strlen (str);
 218
 219   p = str;
 220   inbytes_remaining = len;
 221
 222   /* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
 223   /* + 1 for nul in case len == 1 */
 224   outbuf_size = ((len + 3) & ~3) + 1;
 225
 226   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 227   outp = dest = g_malloc (outbuf_size);
 228
 229  again:
 230
 231   err = g_iconv (cd, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
 232
 233   if (err == (size_t) -1)
 234     {
 235       switch (errno)
 236         {
 237         case EINVAL:
 238           /* Incomplete text, do not report an error */
 239           break;
 240         case E2BIG:
 241           {
 242             size_t used = outp - dest;
 243
 244             /* glibc's iconv can return E2BIG even if there is space
 245              * remaining if an internal buffer is exhausted. The
 246              * folllowing is a heuristic to catch this. The 16 is
 247              * pretty arbitrary.
 248              */
 249             if (used + 16 > outbuf_size)
 250               {
 251                 outbuf_size = (outbuf_size - 1) * 2 + 1;
 252                 dest = g_realloc (dest, outbuf_size);
 253
 254                 outp = dest + used;
 255                 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 256               }
 257
 258             goto again;
 259           }
 260         case EILSEQ:
 261           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 262                        _("Invalid byte sequence in conversion input"));
 263           have_error = TRUE;
 264           break;
 265         default:
 266           g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 267                        _("Error during conversion: %s"),
 268                        strerror (errno));
 269           have_error = TRUE;
 270           break;
 271         }
 272     }
 273
 274   *outp = '\0';
 275
 276   g_iconv_close (cd);
 277
 278   if (bytes_read)
 279     *bytes_read = p - str;
 280   else
 281     {
 282       if ((p - str) != len)
 283         {
 284           if (!have_error)
 285             {
 286               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
 287                            _("Partial character sequence at end of input"));
 288               have_error = TRUE;
 289             }
 290         }
 291     }
 292
 293   if (bytes_written)
 294     *bytes_written = outp - dest;       /* Doesn't include '\0' */
 295
 296   if (have_error)
 297     {
 298       g_free (dest);
 299       return NULL;
 300     }
 301   else
 302     return dest;
 303 }
 304
 305 /**
 306  * g_convert_with_fallback:
 307  * @str:          the string to convert
 308  * @len:          the length of the string
 309  * @to_codeset:   name of character set into which to convert @str
 310  * @from_codeset: character set of @str.
 311  * @fallback:     UTF-8 string to use in place of character not
 312  *                present in the target encoding. (This must be
 313  *                in the target encoding), if %NULL, characters
 314  *                not in the target encoding will be represented
 315  *                as Unicode escapes \x{XXXX} or \x{XXXXXX}.
 316  * @bytes_read:   location to store the number of bytes in the
 317  *                input string that were successfully converted, or %NULL.
 318  *                Even if the conversion was succesful, this may be
 319  *                less than len if there were partial characters
 320  *                at the end of the input. If the error
 321  *                G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 322  *                stored will the byte fofset after the last valid
 323  *                input sequence.
 324  * @bytes_written: the stored in the output buffer (not including the
 325  *                 terminating nul.
 326  * @error:        location to store the error occuring, or %NULL to ignore
 327  *                errors. Any of the errors in #GConvertError may occur.
 328  *
 329  * Convert a string from one character set to another, possibly
 330  * including fallback sequences for characters not representable
 331  * in the output. Note that it is not guaranteed that the specification
 332  * for the fallback sequences in @fallback will be honored. Some
 333  * systems may do a approximate conversion from @from_codeset
 334  * to @to_codeset in their iconv() functions, in which case GLib
 335  * will simply return that approximate conversion.
 336  *
 337  * Return value: If the conversion was successful, a newly allocated
 338  *               NUL-terminated string, which must be freed with
 339  *               g_free. Otherwise %NULL and @error will be set.
 340  **/
 341 gchar*
 342 g_convert_with_fallback (const gchar *str,
 343                          gint         len,
 344                          const gchar *to_codeset,
 345                          const gchar *from_codeset,
 346                          gchar       *fallback,
 347                          gint        *bytes_read,
 348                          gint        *bytes_written,
 349                          GError     **error)
 350 {
 351   gchar *utf8;
 352   gchar *dest;
 353   gchar *outp;
 354   const gchar *insert_str = NULL;
 355   const gchar *p;
 356   int inbytes_remaining;
 357   const gchar *save_p = NULL;
 358   size_t save_inbytes = 0;
 359   size_t outbytes_remaining;
 360   size_t err;
 361   GIConv cd;
 362   size_t outbuf_size;
 363   gboolean have_error = FALSE;
 364   gboolean done = FALSE;
 365
 366   GError *local_error = NULL;
 367
 368   g_return_val_if_fail (str != NULL, NULL);
 369   g_return_val_if_fail (to_codeset != NULL, NULL);
 370   g_return_val_if_fail (from_codeset != NULL, NULL);
 371
 372   if (len < 0)
 373     len = strlen (str);
 374
 375   /* Try an exact conversion; we only proceed if this fails
 376    * due to an illegal sequence in the input string.
 377    */
 378   dest = g_convert (str, len, to_codeset, from_codeset,
 379                     bytes_read, bytes_written, &local_error);
 380   if (!local_error)
 381     return dest;
 382
 383   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
 384     {
 385       g_propagate_error (error, local_error);
 386       return NULL;
 387     }
 388   else
 389     g_error_free (local_error);
 390
 391   local_error = NULL;
 392
 393   /* No go; to proceed, we need a converter from "UTF-8" to
 394    * to_codeset, and the string as UTF-8.
 395    */
 396   cd = open_converter (to_codeset, "UTF-8", error);
 397   if (cd == (GIConv) -1)
 398     {
 399       if (bytes_read)
 400         *bytes_read = 0;
 401
 402       if (bytes_written)
 403         *bytes_written = 0;
 404
 405       return NULL;
 406     }
 407
 408   utf8 = g_convert (str, len, "UTF-8", from_codeset,
 409                     bytes_read, &inbytes_remaining, error);
 410   if (!utf8)
 411     return NULL;
 412
 413   /* Now the heart of the code. We loop through the UTF-8 string, and
 414    * whenever we hit an offending character, we form fallback, convert
 415    * the fallback to the target codeset, and then go back to
 416    * converting the original string after finishing with the fallback.
 417    *
 418    * The variables save_p and save_inbytes store the input state
 419    * for the original string while we are converting the fallback
 420    */
 421   p = utf8;
 422   /* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
 423   /* + 1 for nul in case len == 1 */
 424   outbuf_size = ((len + 3) & ~3) + 1;
 425   outbytes_remaining = outbuf_size - 1; /* -1 for nul */
 426   outp = dest = g_malloc (outbuf_size);
 427
 428   while (!done && !have_error)
 429     {
 430       size_t inbytes_tmp = inbytes_remaining;
 431       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
 432       inbytes_remaining = inbytes_tmp;
 433
 434       if (err == (size_t) -1)
 435         {
 436           switch (errno)
 437             {
 438             case EINVAL:
 439               g_assert_not_reached();
 440               break;
 441             case E2BIG:
 442               {
 443                 size_t used = outp - dest;
 444
 445                 /* glibc's iconv can return E2BIG even if there is space
 446                  * remaining if an internal buffer is exhausted. The
 447                  * folllowing is a heuristic to catch this. The 16 is
 448                  * pretty arbitrary.
 449                  */
 450                 if (used + 16 > outbuf_size)
 451                   {
 452                     outbuf_size = (outbuf_size - 1) * 2 + 1;
 453                     dest = g_realloc (dest, outbuf_size);
 454
 455                     outp = dest + used;
 456                     outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
 457                   }
 458
 459                 break;
 460               }
 461             case EILSEQ:
 462               if (save_p)
 463                 {
 464                   /* Error converting fallback string - fatal
 465                    */
 466                   g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 467                                _("Cannot convert fallback '%s' to codeset '%s'"),
 468                                insert_str, to_codeset);
 469                   have_error = TRUE;
 470                   break;
 471                 }
 472               else
 473                 {
 474                   if (!fallback)
 475                     {
 476                       gunichar ch = g_utf8_get_char (p);
 477                       insert_str = g_strdup_printf ("\\x{%0*X}",
 478                                                     (ch < 0x10000) ? 4 : 6,
 479                                                     ch);
 480                     }
 481                   else
 482                     insert_str = fallback;
 483
 484                   save_p = g_utf8_next_char (p);
 485                   save_inbytes = inbytes_remaining - (save_p - p);
 486                   p = insert_str;
 487                   inbytes_remaining = strlen (p);
 488                 }
 489               break;
 490             default:
 491               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
 492                            _("Error during conversion: %s"),
 493                            strerror (errno));
 494               have_error = TRUE;
 495               break;
 496             }
 497         }
 498       else
 499         {
 500           if (save_p)
 501             {
 502               if (!fallback)
 503                 g_free ((gchar *)insert_str);
 504               p = save_p;
 505               inbytes_remaining = save_inbytes;
 506               save_p = NULL;
 507             }
 508           else
 509             done = TRUE;
 510         }
 511     }
 512
 513   /* Cleanup
 514    */
 515   *outp = '\0';
 516
 517   g_iconv_close (cd);
 518
 519   if (bytes_written)
 520     *bytes_written = outp - str;        /* Doesn't include '\0' */
 521
 522   g_free (utf8);
 523
 524   if (have_error)
 525     {
 526       if (save_p && !fallback)
 527         g_free ((gchar *)insert_str);
 528       g_free (dest);
 529       return NULL;
 530     }
 531   else
 532     return dest;
 533 }
 534
 535 /*
 536  * g_locale_to_utf8
 537  *
 538  *
 539  */
 540
 541 /**
 542  * g_locale_to_utf8:
 543  * @opsysstring:   a string in the encoding of the current locale
 544  * @len:           the length of the string, or -1 if the string is
 545  *                 NULL-terminated.
 546  * @bytes_read:    location to store the number of bytes in the
 547  *                 input string that were successfully converted, or %NULL.
 548  *                 Even if the conversion was succesful, this may be
 549  *                 less than len if there were partial characters
 550  *                 at the end of the input. If the error
 551  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 552  *                 stored will the byte fofset after the last valid
 553  *                 input sequence.
 554  * @bytes_written: the stored in the output buffer (not including the
 555  *                 terminating nul.
 556  * @error: location to store the error occuring, or %NULL to ignore
 557  *                 errors. Any of the errors in #GConvertError may occur.
 558  *
 559  * Converts a string which is in the encoding used for strings by
 560  * the C runtime (usually the same as that used by the operating
 561  * system) in the current locale into a UTF-8 string.
 562  *
 563  * Return value: The converted string, or %NULL on an error.
 564  **/
 565 gchar *
 566 g_locale_to_utf8 (const gchar  *opsysstring,
 567                   gint          len,
 568                   gint         *bytes_read,
 569                   gint         *bytes_written,
 570                   GError      **error)
 571 {
 572 #ifdef G_PLATFORM_WIN32
 573
 574   gint i, clen, total_len, wclen, first;
 575   wchar_t *wcs, wc;
 576   gchar *result, *bp;
 577   const wchar_t *wcp;
 578
 579   if (len == -1)
 580     len = strlen (opsysstring);
 581
 582   wcs = g_new (wchar_t, len);
 583   wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
 584
 585   wcp = wcs;
 586   total_len = 0;
 587   for (i = 0; i < wclen; i++)
 588     {
 589       wc = *wcp++;
 590
 591       if (wc < 0x80)
 592         total_len += 1;
 593       else if (wc < 0x800)
 594         total_len += 2;
 595       else if (wc < 0x10000)
 596         total_len += 3;
 597       else if (wc < 0x200000)
 598         total_len += 4;
 599       else if (wc < 0x4000000)
 600         total_len += 5;
 601       else
 602         total_len += 6;
 603     }
 604
 605   result = g_malloc (total_len + 1);
 606
 607   wcp = wcs;
 608   bp = result;
 609   for (i = 0; i < wclen; i++)
 610     {
 611       wc = *wcp++;
 612
 613       if (wc < 0x80)
 614         {
 615           first = 0;
 616           clen = 1;
 617         }
 618       else if (wc < 0x800)
 619         {
 620           first = 0xc0;
 621           clen = 2;
 622         }
 623       else if (wc < 0x10000)
 624         {
 625           first = 0xe0;
 626           clen = 3;
 627         }
 628       else if (wc < 0x200000)
 629         {
 630           first = 0xf0;
 631           clen = 4;
 632         }
 633       else if (wc < 0x4000000)
 634         {
 635           first = 0xf8;
 636           clen = 5;
 637         }
 638       else
 639         {
 640           first = 0xfc;
 641           clen = 6;
 642         }
 643
 644       /* Woo-hoo! */
 645       switch (clen)
 646         {
 647         case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 648         case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 649         case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 650         case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 651         case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
 652         case 1: bp[0] = wc | first;
 653         }
 654
 655       bp += clen;
 656     }
 657   *bp = 0;
 658
 659   g_free (wcs);
 660
 661   if (bytes_read)
 662     *bytes_read = len;
 663   if (bytes_written)
 664     *bytes_written = total_len;
 665
 666   return result;
 667
 668 #else  /* !G_PLATFORM_WIN32 */
 669
 670   char *charset, *str;
 671
 672   if (g_get_charset (&charset))
 673     return g_strdup (opsysstring);
 674
 675   str = g_convert (opsysstring, len,
 676                    "UTF-8", charset, bytes_read, bytes_written, error);
 677
 678   return str;
 679 #endif /* !G_PLATFORM_WIN32 */
 680 }
 681
 682 /**
 683  * g_locale_from_utf8:
 684  * @utf8string:    a UTF-8 encoded string
 685  * @len:           the length of the string, or -1 if the string is
 686  *                 NULL-terminated.
 687  * @bytes_read:    location to store the number of bytes in the
 688  *                 input string that were successfully converted, or %NULL.
 689  *                 Even if the conversion was succesful, this may be
 690  *                 less than len if there were partial characters
 691  *                 at the end of the input. If the error
 692  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 693  *                 stored will the byte fofset after the last valid
 694  *                 input sequence.
 695  * @bytes_written: the stored in the output buffer (not including the
 696  *                 terminating nul.
 697  * @error: location to store the error occuring, or %NULL to ignore
 698  *                 errors. Any of the errors in #GConvertError may occur.
 699  *
 700  * Converts a string from UTF-8 to the encoding used for strings by
 701  * the C runtime (usually the same as that used by the operating
 702  * system) in the current locale.
 703  *
 704  * Return value: The converted string, or %NULL on an error.
 705  **/
 706 gchar *
 707 g_locale_from_utf8 (const gchar *utf8string,
 708                     gint         len,
 709                     gint        *bytes_read,
 710                     gint        *bytes_written,
 711                     GError     **error)
 712 {
 713 #ifdef G_PLATFORM_WIN32
 714
 715   gint i, mask, clen, mblen;
 716   wchar_t *wcs, *wcp;
 717   gchar *result;
 718   guchar *cp, *end, c;
 719   gint n;
 720
 721   if (len == -1)
 722     len = strlen (utf8string);
 723
 724   /* First convert to wide chars */
 725   cp = (guchar *) utf8string;
 726   end = cp + len;
 727   n = 0;
 728   wcs = g_new (wchar_t, len + 1);
 729   wcp = wcs;
 730   while (cp != end)
 731     {
 732       mask = 0;
 733       c = *cp;
 734
 735       if (c < 0x80)
 736         {
 737           clen = 1;
 738           mask = 0x7f;
 739         }
 740       else if ((c & 0xe0) == 0xc0)
 741         {
 742           clen = 2;
 743           mask = 0x1f;
 744         }
 745       else if ((c & 0xf0) == 0xe0)
 746         {
 747           clen = 3;
 748           mask = 0x0f;
 749         }
 750       else if ((c & 0xf8) == 0xf0)
 751         {
 752           clen = 4;
 753           mask = 0x07;
 754         }
 755       else if ((c & 0xfc) == 0xf8)
 756         {
 757           clen = 5;
 758           mask = 0x03;
 759         }
 760       else if ((c & 0xfc) == 0xfc)
 761         {
 762           clen = 6;
 763           mask = 0x01;
 764         }
 765       else
 766         {
 767           g_free (wcs);
 768           return NULL;
 769         }
 770
 771       if (cp + clen > end)
 772         {
 773           g_free (wcs);
 774           return NULL;
 775         }
 776
 777       *wcp = (cp[0] & mask);
 778       for (i = 1; i < clen; i++)
 779         {
 780           if ((cp[i] & 0xc0) != 0x80)
 781             {
 782               g_free (wcs);
 783               return NULL;
 784             }
 785           *wcp <<= 6;
 786           *wcp |= (cp[i] & 0x3f);
 787         }
 788
 789       cp += clen;
 790       wcp++;
 791       n++;
 792     }
 793   if (cp != end)
 794     {
 795       g_free (wcs);
 796       return NULL;
 797     }
 798
 799   /* n is the number of wide chars constructed */
 800
 801   /* Convert to a string in the current ANSI codepage */
 802
 803   result = g_new (gchar, 3 * n + 1);
 804   mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
 805   result[mblen] = 0;
 806   g_free (wcs);
 807
 808   if (bytes_read)
 809     *bytes_read = len;
 810   if (bytes_written)
 811     *bytes_written = mblen;
 812
 813   return result;
 814
 815 #else  /* !G_PLATFORM_WIN32 */
 816
 817   gchar *charset, *str;
 818
 819   if (g_get_charset (&charset))
 820     return g_strdup (utf8string);
 821
 822   str = g_convert (utf8string, strlen (utf8string),
 823                    charset, "UTF-8", bytes_read, bytes_written, error);
 824
 825   return str;
 826
 827 #endif /* !G_PLATFORM_WIN32 */
 828 }
 829
 830 /**
 831  * g_filename_to_utf8:
 832  * @opsysstring:   a string in the encoding for filenames
 833  * @len:           the length of the string, or -1 if the string is
 834  *                 NULL-terminated.
 835  * @bytes_read:    location to store the number of bytes in the
 836  *                 input string that were successfully converted, or %NULL.
 837  *                 Even if the conversion was succesful, this may be
 838  *                 less than len if there were partial characters
 839  *                 at the end of the input. If the error
 840  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 841  *                 stored will the byte fofset after the last valid
 842  *                 input sequence.
 843  * @bytes_written: the stored in the output buffer (not including the
 844  *                 terminating nul.
 845  * @error: location to store the error occuring, or %NULL to ignore
 846  *                 errors. Any of the errors in #GConvertError may occur.
 847  *
 848  * Converts a string which is in the encoding used for filenames
 849  * into a UTF-8 string.
 850  *
 851  * Return value: The converted string, or %NULL on an error.
 852  **/
 853 gchar*
 854 g_filename_to_utf8 (const gchar *opsysstring,
 855                     gint         len,
 856                     gint        *bytes_read,
 857                     gint        *bytes_written,
 858                     GError     **error)
 859 {
 860 #ifdef G_PLATFORM_WIN32
 861   return g_locale_to_utf8 (opsysstring, len,
 862                            bytes_read, bytes_written,
 863                            error);
 864 #else  /* !G_PLATFORM_WIN32 */
 865   if (getenv ("G_BROKEN_FILENAMES"))
 866     return g_locale_to_utf8 (opsysstring, len,
 867                              bytes_read, bytes_written,
 868                              error);
 869
 870   if (bytes_read || bytes_written)
 871     {
 872       gint len = strlen (opsysstring);
 873
 874       if (bytes_read)
 875         *bytes_read = len;
 876       if (bytes_written)
 877         *bytes_written = len;
 878     }
 879
 880   if (len < 0)
 881     return g_strdup (opsysstring);
 882   else
 883     return g_strndup (opsysstring, len);
 884 #endif /* !G_PLATFORM_WIN32 */
 885 }
 886
 887 /**
 888  * g_filename_from_utf8:
 889  * @utf8string:    a UTF-8 encoded string
 890  * @len:           the length of the string, or -1 if the string is
 891  *                 NULL-terminated.
 892  * @bytes_read:    location to store the number of bytes in the
 893  *                 input string that were successfully converted, or %NULL.
 894  *                 Even if the conversion was succesful, this may be
 895  *                 less than len if there were partial characters
 896  *                 at the end of the input. If the error
 897  *                 G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 898  *                 stored will the byte fofset after the last valid
 899  *                 input sequence.
 900  * @bytes_written: the stored in the output buffer (not including the
 901  *                 terminating nul.
 902  * @error: location to store the error occuring, or %NULL to ignore
 903  *                 errors. Any of the errors in #GConvertError may occur.
 904  *
 905  * Converts a string from UTF-8 to the encoding used for filenames.
 906  *
 907  * Return value: The converted string, or %NULL on an error.
 908  **/
 909 gchar*
 910 g_filename_from_utf8 (const gchar *utf8string,
 911                       gint         len,
 912                       gint        *bytes_read,
 913                       gint        *bytes_written,
 914                       GError     **error)
 915 {
 916 #ifdef G_PLATFORM_WIN32
 917   return g_locale_from_utf8 (utf8string, len,
 918                              bytes_read, bytes_written,
 919                              error);
 920 #else  /* !G_PLATFORM_WIN32 */
 921   if (getenv ("G_BROKEN_FILENAMES"))
 922     return g_locale_from_utf8 (utf8string, len,
 923                                bytes_read, bytes_written,
 924                                error);
 925
 926   if (bytes_read || bytes_written)
 927     {
 928       gint len = strlen (utf8string);
 929
 930       if (bytes_read)
 931         *bytes_read = len;
 932       if (bytes_written)
 933         *bytes_written = len;
 934     }
 935
 936   if (len < 0)
 937     return g_strdup (utf8string);
 938   else
 939     return g_strndup (utf8string, len);
 940 #endif /* !G_PLATFORM_WIN32 */
 941 }